So we continue our journey to understand the nature of buffer cache and forcing DBWR process to write our changes to disk. If you want to follow this thread – you should get familiar with at least those blog posts:
Until now we have established, that DBWR will write to disk a database block changed in memory by us if another block from the same table is affected by a running transaction… but not just any block. In this article I will try to explain what is actually going on under the hoods by using bpftrace, GDB and my own tools.
Thanks to a great book by Jonathan Lewis "Oracle Core: Essential Internals for DBAs and Developers" I was able to mark some fields of X$BH to a Rust structure.
To check the addresses of buffer headers we can use oradebug:
SQL> oradebug setmypid Statement processed. SQL> oradebug dump buffers 1 Statement processed. SQL> oradebug tracefile_name /opt/oracle/diag/rdbms/orclcdb/ORCLCDB/trace/ORCLCDB_ora_141852.trc
Table HR.EMPLOYEES has 2 blocks with data in the buffer cache:
BH (0x81faa8e0) file#: 16 rdba: 0x03000093 (12/147) class: 1 ba: 0x8183c000 set: 10 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0 dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f hash: [0x9ff92f48,0x762a27b8] lru: [0x80fe5d98,0x81faea78] ckptq: [NULL] fileq: [NULL] objq: [0x63125eb0,0x63125eb0] objaq: [0x63125ea0,0x63125ea0] st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1 flags: only_sequential_access LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
BH (0xaaf6e868) file#: 16 rdba: 0x03000094 (12/148) class: 1 ba: 0xaa2e6000 set: 11 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0 dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f hash: [0x9df58550,0x77f05250] lru: [0xacfcd810,0x90fbf878] ckptq: [NULL] fileq: [NULL] objq: [0x6cc05e20,0x6cc05e20] objaq: [0x6cc05e10,0x6cc05e10] st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1 flags: only_sequential_access LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
Block 147 (rdba: 0x03000093) is located at address 0x81faa8e0 and block 148 (rdba: 0x03000094) is located at address 0xaaf6e868.
Soon you will find a new tool for Buffer Cache visualization on my GitHub, for now the code looks like this:
use std::fs::File; use std::io::Seek; use std::io::SeekFrom; use binread::BinRead; use binread::BinReaderExt; use clap::Parser; use std::{thread, time}; #[derive(BinRead)] #[derive(Debug)] #[allow(dead_code)] struct Xbh { ckpt1: u64, //@0 ckpt2: u64, //@8 #[br(count = 160)] data1: Vec<u8>, //unknown data @16 nxt_hash: u64, //@176 prv_hash: u64, //@184 #[br(count = 8)] data2: Vec<u8>, //unknown data @192 rdba: u32, //@200 flag1: u32, //@204 objd: u32, //@208 flag2: u32, //@212 dirty_flag: u32, //@216 #[br(count = 12)] data3: Vec<u8>, //unknown data @220 nxt_repl: u64, //@232 prv_repl: u64, //@240 #[br(count = 3)] data4: Vec<u8>, //unknown data @248 tch: u8, //@251 #[br(count = 4)] data5: Vec<u8>, //unknown data @252 ba: u64, //@256 } #[derive(BinRead)] #[derive(Debug)] #[allow(dead_code)] struct Kcbh { type_kcbh: u8, frmt_kcbh: u8, spare1_kcbh: u8, spare2_kcbh: u8, rdba_kcbh: u32, bas_kcbh: u32, wrp_kcbh: u16, seq_kcbh: u8, flg_kcbh: u8, chkval_kcbh: u16, spare3_kcbh: u16 } #[derive(BinRead)] #[derive(Debug)] #[allow(dead_code)] struct Ktbbhitl { kxidusn: u16, kxidslt: u16, kxidsqn: u32, kubadba: u32, kubaseq: u16, kubarec: u8, _spare1: u8, ktbitflg: u16, ktbitun: u16, ktbitbas: u32, } #[derive(BinRead)] #[derive(Debug)] #[allow(dead_code)] struct Ktbbhcsc { kscnbas: u32, kscnwrp: u32, } #[derive(BinRead)] #[derive(Debug)] #[allow(dead_code)] struct Ktbbh { ktbbhtyp: u32, ktbbhsid: u32, ktbbhcsc: Ktbbhcsc, ktbbhict: u16, ktbbhflg: u8, ktbbhfsl: u8, ktbbhfnx: u32, #[br(count = ktbbhict)] ktbbhitl: Vec<Ktbbhitl>, } #[derive(Parser, Debug)] #[clap(author, version, about, long_about = None)] struct Args { ///PID of the process to scan #[clap(short, long)] pid: u64, ///From which BH address to start? #[clap(short, long)] start_bh: u64, ///Options for a program [chains: show other buffers in the same hash chain] #[clap(default_value="none", short, long)] option: String, } fn main() { let args = Args::parse(); let sleep_interval = time::Duration::from_millis(100); let fname = format!("/proc/{}/mem", args.pid); let mut f = File::open(fname).unwrap(); loop { f.seek(SeekFrom::Start(args.start_bh)).unwrap(); let xbh: Xbh = f.read_ne().unwrap(); std::process::Command::new("clear").status().unwrap(); println!("BH ADDR: 0x{:x}", args.start_bh); println!("DATA OBJECT ID: {} \t BLOCK: {} 0x{:x}\n", xbh.objd, xbh.rdba % 4194304, xbh.rdba); println!("CKPT[0x{:x},0x{:x}] \t HASH[0x{:x},0x{:x}] \t REPL[0x{:x},0x{:x}]", xbh.ckpt1, xbh.ckpt2, xbh.nxt_hash, xbh.prv_hash, xbh.nxt_repl, xbh.prv_repl); println!("\nDIRTY FLAG: {} 0x{:x} \t BA: {} 0x{:x} \t TCH: {}", xbh.dirty_flag, xbh.dirty_flag, xbh.ba, xbh.ba, xbh.tch); if args.option == "none" || args.option == "chains" { f.seek(SeekFrom::Start(xbh.ba)).unwrap(); let kcbh: Kcbh = f.read_ne().unwrap(); let ktbbh: Ktbbh = f.read_ne().unwrap(); println!("\nBLOCK SCN: {} BLOCK CHECKSUM: {}", kcbh.bas_kcbh, kcbh.chkval_kcbh); println!("Number of ITLs: {} (kscnbas = {})", ktbbh.ktbbhict, ktbbh.ktbbhcsc.kscnbas); for itl in ktbbh.ktbbhitl { println!("\tXID {:04x}.{:04x}.{:08x}\tSCN: {:12} \t\tFLAG: {:04x}", itl.kxidusn, itl.kxidslt, itl.kxidsqn, itl.ktbitbas, itl.ktbitflg); } } if args.option == "chains" { println!("\nBlocks in the same hash chain\n"); let start_hash = args.start_bh + 176; let mut next_hash = xbh.nxt_hash; while start_hash != next_hash { let addr = next_hash - 176; f.seek(SeekFrom::Start(addr)).unwrap(); let xbh2: Xbh = f.read_ne().unwrap(); if xbh2.dirty_flag < 1000000000 { println!("\t=> ADDR: 0x{:x} \t RBA: 0x{:08x} \t OBJD: {:8} \t HASH[0x{:x},0x{:x}]", addr, xbh2.rdba, xbh2.objd, xbh2.nxt_hash, xbh2.prv_hash); } else { println!("\t=> ADDR: 0x{:x} \t HASH CHAIN HANDLE (?) \t\t\t\t HASH[0x{:x},0x{:x}]", addr, xbh2.nxt_hash, xbh2.prv_hash); } next_hash = xbh2.nxt_hash; } } thread::sleep(sleep_interval); } }
Once I learned the buffer header addresses, I can use my tool to show the most important information:
BH ADDR: 0x81faa8e0 DATA OBJECT ID: 73298 BLOCK: 147 0x3000093 CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78] DIRTY FLAG: 524288 0x80000 BA: 2172895232 0x8183c000 TCH: 1 BLOCK SCN: 6429846 BLOCK CHECKSUM: 36605 Number of ITLs: 3 (kscnbas = 6429845) XID 0003.0013.000004ea SCN: 6429846 FLAG: 2001 XID 0002.0007.00000491 SCN: 6429615 FLAG: 8000 XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000 Blocks in the same hash chain => ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990] => ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
BH ADDR: 0xaaf6e868 DATA OBJECT ID: 73298 BLOCK: 148 0x3000094 CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878] DIRTY FLAG: 524288 0x80000 BA: 2855165952 0xaa2e6000 TCH: 1 BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404 Number of ITLs: 3 (kscnbas = 6429781) XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000 XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000 XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001 Blocks in the same hash chain => ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918] => ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
As you can see, both blocks are in a different hash chains, but as I showed in this article setting dirty flag on one of those blocks will cause DBWR to write a block to disk if the other block is affected by a transaction.
So when I start a legal transaction on a block, I’ll see buffer header and block ITL summary looking like this:
BH ADDR: 0x81faa8e0 DATA OBJECT ID: 73298 BLOCK: 147 0x3000093 CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78] DIRTY FLAG: 8 0x8 BA: 2172895232 0x8183c000 TCH: 2 BLOCK SCN: 6429846 BLOCK CHECKSUM: 0 Number of ITLs: 3 (kscnbas = 7612911) XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000 XID 000a.0017.00000549 SCN: 0 FLAG: 0001 XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000 Blocks in the same hash chain => ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990] => ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
Please notice, that the block checksum was set to 0 and we have an ITL slot allocated. After the commit, the buffer header looks like this:
BH ADDR: 0x81faa8e0 DATA OBJECT ID: 73298 BLOCK: 147 0x3000093 CKPT[0x76855fe8,0xc0f870c0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78] DIRTY FLAG: 1 0x1 BA: 2172895232 0x8183c000 TCH: 2 BLOCK SCN: 7612940 BLOCK CHECKSUM: 0 Number of ITLs: 3 (kscnbas = 7612911) XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000 XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001 XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000 Blocks in the same hash chain => ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990] => ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
The dirty flag was set to 1, the block checksum is still 0 and SCN of affecting transaction was set, together with a transaction flag.
I can set a dirty flag manually on the second block with rico2
[root@dbVictim ~]# python2 Python 2.7.18 (default, Apr 5 2022, 05:50:43) [GCC 8.5.0 20210514 (Red Hat 8.5.0-10.0.1)] on linux2 Type "help", "copyright", "credits" or "license" for more information. >>> import rico2 >>> r = rico2.Rico(4952) >>> r.set_dirty_flag_bh(int('0xaaf6e868',16)+216)
Now my second block has also a dirty flag, tho nothing else has changed:
BH ADDR: 0xaaf6e868 DATA OBJECT ID: 73298 BLOCK: 148 0x3000094 CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878] DIRTY FLAG: 1 0x1 BA: 2855165952 0xaa2e6000 TCH: 1 BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404 Number of ITLs: 3 (kscnbas = 6429781) XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000 XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000 XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001 Blocks in the same hash chain => ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918] => ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
But after the checkpoint situation looks like this:
BH ADDR: 0x81faa8e0 DATA OBJECT ID: 73298 BLOCK: 147 0x3000093 CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78] DIRTY FLAG: 2097152 0x200000 BA: 2172895232 0x8183c000 TCH: 2 BLOCK SCN: 7612940 BLOCK CHECKSUM: 33663 Number of ITLs: 3 (kscnbas = 7612911) XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000 XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001 XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000 Blocks in the same hash chain => ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990] => ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
BH ADDR: 0xaaf6e868 DATA OBJECT ID: 73298 BLOCK: 148 0x3000094 CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878] DIRTY FLAG: 2097152 0x200000 BA: 2855165952 0xaa2e6000 TCH: 1 BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404 Number of ITLs: 3 (kscnbas = 6429781) XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000 XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000 XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001 Blocks in the same hash chain => ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918] => ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
Dirty flags on both blocks has changed and both blocks were written to disk! To understand what is going on, I had to dig a bit in DBWR functions and I used the following BPFTRACE script to show you the details:
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbb_coalesce_int+2 /pid == $1/ { printf("[kcbb_coalesce_int] Check the block at local address 0x%x\n", arg2); } uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+2 /pid == $1/ { if (arg2 > 0) { printf("\t[kcbbxsv] Searching for a block at local address: 0x%x\n", arg2); } else { printf("\t[kcbbxsv] Checking neighbours: 0x%x\n", arg5); } } uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+607 /pid == $1/ { printf("\t\t\t[kcbbxsv] found RBA: 0x%x\n", reg("r10")); } uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+1325 /pid == $1/ { printf("\t\t\t\t[kcbbxsv] HASH: 0x%x, BLOCK ADDR: 0x%x, HANDLE ADDR: 0x%x\n", reg("dx"), reg("dx")-176, reg("bx")-176); }
Fragment of the output from that bpftrace on DBWR process looks like this:
[root@dbVictim ~]# bpftrace dbwr.bpf 4952 Attaching 4 probes... [kcbb_coalesce_int] Check the block at local address 0xdedaf000 [kcbbxsv] Searching for a block at local address: 0xdedaf000 [kcbbxsv] found RBA: 0x3000093 [kcbbxsv] HASH: 0x83f96bc0, BLOCK ADDR: 0x83f96b10, HANDLE ADDR: 0x762a2708 [kcbbxsv] HASH: 0x81faa990, BLOCK ADDR: 0x81faa8e0, HANDLE ADDR: 0x762a2708 [kcbbxsv] Checking neighbours: 0x3000094 [kcbbxsv] HASH: 0xa3fc4430, BLOCK ADDR: 0xa3fc4380, HANDLE ADDR: 0x77f051a0 [kcbbxsv] HASH: 0xaaf6e918, BLOCK ADDR: 0xaaf6e868, HANDLE ADDR: 0x77f051a0 [kcbbxsv] Checking neighbours: 0x3000092 [kcbbxsv] HASH: 0x81f88688, BLOCK ADDR: 0x81f885d8, HANDLE ADDR: 0x765fc7c8 [kcbb_coalesce_int] Check the block at local address 0xdedaf020 [kcbbxsv] Searching for a block at local address: 0xdedaf020 [kcbbxsv] found RBA: 0x24000d0 [kcbbxsv] HASH: 0xb7f9c728, BLOCK ADDR: 0xb7f9c678, HANDLE ADDR: 0x761e3098 [kcbbxsv] Checking neighbours: 0x24000d1 [kcbbxsv] HASH: 0xa3fc8eb8, BLOCK ADDR: 0xa3fc8e08, HANDLE ADDR: 0x76750358 [kcbbxsv] Checking neighbours: 0x24000cf [kcbbxsv] HASH: 0xb2f86900, BLOCK ADDR: 0xb2f86850, HANDLE ADDR: 0x7653ddd8 [kcbb_coalesce_int] Check the block at local address 0xdedaf040 [kcbbxsv] Searching for a block at local address: 0xdedaf040 [kcbbxsv] found RBA: 0x2400282 [kcbbxsv] HASH: 0x99fa6d18, BLOCK ADDR: 0x99fa6c68, HANDLE ADDR: 0x763e0bc8 [kcbbxsv] Checking neighbours: 0x2400283 [kcbbxsv] HASH: 0x98f6d6d0, BLOCK ADDR: 0x98f6d620, HANDLE ADDR: 0x76085e78 [kcbbxsv] Checking neighbours: 0x2400281 [kcbbxsv] HASH: 0x98f9bd50, BLOCK ADDR: 0x98f9bca0, HANDLE ADDR: 0x7673b908
We can see three interesting things going on in here:
- When DBWR goes through a block list, it checks some local structures that are 20 bytes long – you can see that by the sequence of addresses: 0xdedaf000 => 0xdedaf020 => 0xdedaf040
- DBWR is traversing the hash chain, while looking for a block that it wants to checkpoint
- DBWR is always checking if a neighbor of a given block needs to be checkpointed as well – it checks one block before and one after!
So if any block in the same hash chain or any neighbor was modified – DBWR will write it to disk!
The big question is: can fool DBWR to force it to write a database block that I want, instead the one that was meant to be written to disk?
Let’s use GDB and find out 🙂
I will set a breakpoint on kcbb_coalesce_int to analyze what is going on inside my DBWR:
(gdb) b kcbb_coalesce_int Breakpoint 1 at 0x122e8cb0 (gdb) c Continuing. Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int () (gdb) info reg rax 0x7f63df402e18 140066924015128 rbx 0x0 0 rcx 0x7ffcdfa45ed4 140724060577492 rdx 0x7f63dedaf020 140066917380128 rsi 0x3 3 rdi 0x7f63e4f1b440 140067019535424 rbp 0x7ffcdfa45980 0x7ffcdfa45980 rsp 0x7ffcdfa454b8 0x7ffcdfa454b8 r8 0x0 0 r9 0xffffffffffffffff -1 r10 0x0 0 r11 0x3c7fc071 1015005297 r12 0x7f63e4fca9a0 140067020253600 r13 0x7f63e501b3c0 140067020583872 r14 0x7f63e501b3c0 140067020583872 r15 0x76f77cc8 1995930824 rip 0x122e8cb0 0x122e8cb0 <kcbb_coalesce_int> eflags 0x246 [ PF ZF IF ] cs 0x33 51 ss 0x2b 43 ds 0x0 0 es 0x0 0 fs 0x0 0 gs 0x0 0
Checkout the value of RDX registry: 0x7f63dedaf000 – does it look familiar? And if I print only the lower bits?
(gdb) p/x $edx $1 = 0xdedaf020
Yup, that’s my address alright! Let’s see what it holds:
(gdb) x/5x $rdx 0x7f63dedaf020: 0x81faa8e0 0x00000000 0x00000005 0x00000004 0x7f63dedaf030: 0x03000093
That looks promising! So I have a 20B structure as I suspected where the first 4B is a block header address and the last 4B is RBA! The other bytes, probably represent data necessary to calculate hash and locate a hash chain.
So the byte string, that represents our block is: "e0a8fa8100000000050000000400000093000003″
If I want a database writer to be forced to write this piece of data to disk (for example after modifying the unencrypted copy of the block in memory) I could set a breakpoint on kcbb_coalesce_int and wait a bit:
(gdb) b kcbb_coalesce_int Breakpoint 1 at 0x122e8cb0 (gdb) c Continuing. Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int () (gdb) p/x $rdx $1 = 0x7f63dedaf000
Now I just have inject my byte string at the address pointed by $RDX registry:
>>> import binascii >>> f = open("/proc/4952/mem", "rb+") >>> f.seek(int('0x7f63dedaf000',16)) >>> x = binascii.unhexlify("e0a8fa8100000000050000000400000093000003") >>> f.write(x) >>> f.close()
And we have it! After continuing DBWR will write the block, that we want to be written!
Cool isn’t it? I hope to write here more about the internal buffer cache structures, DBWR and SMON soon, since I have some more discoveries to share 🙂
P.S.
Many thanks to Radek Kut for his help with spotting patterns 🙂