So we continue our journey to understand the nature of buffer cache and forcing DBWR process to write our changes to disk. If you want to follow this thread – you should get familiar with at least those blog posts:
Until now we have established, that DBWR will write to disk a database block changed in memory by us if another block from the same table is affected by a running transaction… but not just any block. In this article I will try to explain what is actually going on under the hoods by using bpftrace, GDB and my own tools.
Thanks to a great book by Jonathan Lewis "Oracle Core: Essential Internals for DBAs and Developers" I was able to mark some fields of X$BH to a Rust structure.
To check the addresses of buffer headers we can use oradebug:
SQL> oradebug setmypid
Statement processed.
SQL> oradebug dump buffers 1
Statement processed.
SQL> oradebug tracefile_name
/opt/oracle/diag/rdbms/orclcdb/ORCLCDB/trace/ORCLCDB_ora_141852.trc
Table HR.EMPLOYEES has 2 blocks with data in the buffer cache:
BH (0x81faa8e0) file#: 16 rdba: 0x03000093 (12/147) class: 1 ba: 0x8183c000
set: 10 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0
dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f
hash: [0x9ff92f48,0x762a27b8] lru: [0x80fe5d98,0x81faea78]
ckptq: [NULL] fileq: [NULL]
objq: [0x63125eb0,0x63125eb0] objaq: [0x63125ea0,0x63125ea0]
st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1
flags: only_sequential_access
LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
BH (0xaaf6e868) file#: 16 rdba: 0x03000094 (12/148) class: 1 ba: 0xaa2e6000
set: 11 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0
dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f
hash: [0x9df58550,0x77f05250] lru: [0xacfcd810,0x90fbf878]
ckptq: [NULL] fileq: [NULL]
objq: [0x6cc05e20,0x6cc05e20] objaq: [0x6cc05e10,0x6cc05e10]
st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1
flags: only_sequential_access
LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
Block 147 (rdba: 0x03000093) is located at address 0x81faa8e0 and block 148 (rdba: 0x03000094) is located at address 0xaaf6e868.
Soon you will find a new tool for Buffer Cache visualization on my GitHub, for now the code looks like this:
use std::fs::File;
use std::io::Seek;
use std::io::SeekFrom;
use binread::BinRead;
use binread::BinReaderExt;
use clap::Parser;
use std::{thread, time};
#[derive(BinRead)]
#[derive(Debug)]
#[allow(dead_code)]
struct Xbh {
ckpt1: u64, //@0
ckpt2: u64, //@8
#[br(count = 160)]
data1: Vec<u8>, //unknown data @16
nxt_hash: u64, //@176
prv_hash: u64, //@184
#[br(count = 8)]
data2: Vec<u8>, //unknown data @192
rdba: u32, //@200
flag1: u32, //@204
objd: u32, //@208
flag2: u32, //@212
dirty_flag: u32, //@216
#[br(count = 12)]
data3: Vec<u8>, //unknown data @220
nxt_repl: u64, //@232
prv_repl: u64, //@240
#[br(count = 3)]
data4: Vec<u8>, //unknown data @248
tch: u8, //@251
#[br(count = 4)]
data5: Vec<u8>, //unknown data @252
ba: u64, //@256
}
#[derive(BinRead)]
#[derive(Debug)]
#[allow(dead_code)]
struct Kcbh {
type_kcbh: u8,
frmt_kcbh: u8,
spare1_kcbh: u8,
spare2_kcbh: u8,
rdba_kcbh: u32,
bas_kcbh: u32,
wrp_kcbh: u16,
seq_kcbh: u8,
flg_kcbh: u8,
chkval_kcbh: u16,
spare3_kcbh: u16
}
#[derive(BinRead)]
#[derive(Debug)]
#[allow(dead_code)]
struct Ktbbhitl {
kxidusn: u16,
kxidslt: u16,
kxidsqn: u32,
kubadba: u32,
kubaseq: u16,
kubarec: u8,
_spare1: u8,
ktbitflg: u16,
ktbitun: u16,
ktbitbas: u32,
}
#[derive(BinRead)]
#[derive(Debug)]
#[allow(dead_code)]
struct Ktbbhcsc
{
kscnbas: u32,
kscnwrp: u32,
}
#[derive(BinRead)]
#[derive(Debug)]
#[allow(dead_code)]
struct Ktbbh {
ktbbhtyp: u32,
ktbbhsid: u32,
ktbbhcsc: Ktbbhcsc,
ktbbhict: u16,
ktbbhflg: u8,
ktbbhfsl: u8,
ktbbhfnx: u32,
#[br(count = ktbbhict)]
ktbbhitl: Vec<Ktbbhitl>,
}
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
///PID of the process to scan
#[clap(short, long)]
pid: u64,
///From which BH address to start?
#[clap(short, long)]
start_bh: u64,
///Options for a program [chains: show other buffers in the same hash chain]
#[clap(default_value="none", short, long)]
option: String,
}
fn main() {
let args = Args::parse();
let sleep_interval = time::Duration::from_millis(100);
let fname = format!("/proc/{}/mem", args.pid);
let mut f = File::open(fname).unwrap();
loop {
f.seek(SeekFrom::Start(args.start_bh)).unwrap();
let xbh: Xbh = f.read_ne().unwrap();
std::process::Command::new("clear").status().unwrap();
println!("BH ADDR: 0x{:x}", args.start_bh);
println!("DATA OBJECT ID: {} \t BLOCK: {} 0x{:x}\n", xbh.objd, xbh.rdba % 4194304, xbh.rdba);
println!("CKPT[0x{:x},0x{:x}] \t HASH[0x{:x},0x{:x}] \t REPL[0x{:x},0x{:x}]", xbh.ckpt1, xbh.ckpt2, xbh.nxt_hash, xbh.prv_hash, xbh.nxt_repl, xbh.prv_repl);
println!("\nDIRTY FLAG: {} 0x{:x} \t BA: {} 0x{:x} \t TCH: {}", xbh.dirty_flag, xbh.dirty_flag, xbh.ba, xbh.ba, xbh.tch);
if args.option == "none" || args.option == "chains" {
f.seek(SeekFrom::Start(xbh.ba)).unwrap();
let kcbh: Kcbh = f.read_ne().unwrap();
let ktbbh: Ktbbh = f.read_ne().unwrap();
println!("\nBLOCK SCN: {} BLOCK CHECKSUM: {}", kcbh.bas_kcbh, kcbh.chkval_kcbh);
println!("Number of ITLs: {} (kscnbas = {})", ktbbh.ktbbhict, ktbbh.ktbbhcsc.kscnbas);
for itl in ktbbh.ktbbhitl {
println!("\tXID {:04x}.{:04x}.{:08x}\tSCN: {:12} \t\tFLAG: {:04x}", itl.kxidusn, itl.kxidslt, itl.kxidsqn, itl.ktbitbas, itl.ktbitflg);
}
}
if args.option == "chains" {
println!("\nBlocks in the same hash chain\n");
let start_hash = args.start_bh + 176;
let mut next_hash = xbh.nxt_hash;
while start_hash != next_hash {
let addr = next_hash - 176;
f.seek(SeekFrom::Start(addr)).unwrap();
let xbh2: Xbh = f.read_ne().unwrap();
if xbh2.dirty_flag < 1000000000 {
println!("\t=> ADDR: 0x{:x} \t RBA: 0x{:08x} \t OBJD: {:8} \t HASH[0x{:x},0x{:x}]", addr, xbh2.rdba, xbh2.objd, xbh2.nxt_hash, xbh2.prv_hash);
} else {
println!("\t=> ADDR: 0x{:x} \t HASH CHAIN HANDLE (?) \t\t\t\t HASH[0x{:x},0x{:x}]", addr, xbh2.nxt_hash, xbh2.prv_hash);
}
next_hash = xbh2.nxt_hash;
}
}
thread::sleep(sleep_interval);
}
}
Once I learned the buffer header addresses, I can use my tool to show the most important information:
BH ADDR: 0x81faa8e0
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
DIRTY FLAG: 524288 0x80000 BA: 2172895232 0x8183c000 TCH: 1
BLOCK SCN: 6429846 BLOCK CHECKSUM: 36605
Number of ITLs: 3 (kscnbas = 6429845)
XID 0003.0013.000004ea SCN: 6429846 FLAG: 2001
XID 0002.0007.00000491 SCN: 6429615 FLAG: 8000
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
Blocks in the same hash chain
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
BH ADDR: 0xaaf6e868
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
DIRTY FLAG: 524288 0x80000 BA: 2855165952 0xaa2e6000 TCH: 1
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
Number of ITLs: 3 (kscnbas = 6429781)
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
Blocks in the same hash chain
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
As you can see, both blocks are in a different hash chains, but as I showed in this article setting dirty flag on one of those blocks will cause DBWR to write a block to disk if the other block is affected by a transaction.
So when I start a legal transaction on a block, I’ll see buffer header and block ITL summary looking like this:
BH ADDR: 0x81faa8e0
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
DIRTY FLAG: 8 0x8 BA: 2172895232 0x8183c000 TCH: 2
BLOCK SCN: 6429846 BLOCK CHECKSUM: 0
Number of ITLs: 3 (kscnbas = 7612911)
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
XID 000a.0017.00000549 SCN: 0 FLAG: 0001
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
Blocks in the same hash chain
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
Please notice, that the block checksum was set to 0 and we have an ITL slot allocated. After the commit, the buffer header looks like this:
BH ADDR: 0x81faa8e0
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
CKPT[0x76855fe8,0xc0f870c0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
DIRTY FLAG: 1 0x1 BA: 2172895232 0x8183c000 TCH: 2
BLOCK SCN: 7612940 BLOCK CHECKSUM: 0
Number of ITLs: 3 (kscnbas = 7612911)
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
Blocks in the same hash chain
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
The dirty flag was set to 1, the block checksum is still 0 and SCN of affecting transaction was set, together with a transaction flag.
I can set a dirty flag manually on the second block with rico2
[root@dbVictim ~]# python2
Python 2.7.18 (default, Apr 5 2022, 05:50:43)
[GCC 8.5.0 20210514 (Red Hat 8.5.0-10.0.1)] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import rico2
>>> r = rico2.Rico(4952)
>>> r.set_dirty_flag_bh(int('0xaaf6e868',16)+216)
Now my second block has also a dirty flag, tho nothing else has changed:
BH ADDR: 0xaaf6e868
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
DIRTY FLAG: 1 0x1 BA: 2855165952 0xaa2e6000 TCH: 1
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
Number of ITLs: 3 (kscnbas = 6429781)
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
Blocks in the same hash chain
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
But after the checkpoint situation looks like this:
BH ADDR: 0x81faa8e0
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
DIRTY FLAG: 2097152 0x200000 BA: 2172895232 0x8183c000 TCH: 2
BLOCK SCN: 7612940 BLOCK CHECKSUM: 33663
Number of ITLs: 3 (kscnbas = 7612911)
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
Blocks in the same hash chain
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
BH ADDR: 0xaaf6e868
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
DIRTY FLAG: 2097152 0x200000 BA: 2855165952 0xaa2e6000 TCH: 1
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
Number of ITLs: 3 (kscnbas = 6429781)
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
Blocks in the same hash chain
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
Dirty flags on both blocks has changed and both blocks were written to disk! To understand what is going on, I had to dig a bit in DBWR functions and I used the following BPFTRACE script to show you the details:
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbb_coalesce_int+2 /pid == $1/ {
printf("[kcbb_coalesce_int] Check the block at local address 0x%x\n", arg2);
}
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+2 /pid == $1/ {
if (arg2 > 0) {
printf("\t[kcbbxsv] Searching for a block at local address: 0x%x\n", arg2);
}
else {
printf("\t[kcbbxsv] Checking neighbours: 0x%x\n", arg5);
}
}
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+607 /pid == $1/ {
printf("\t\t\t[kcbbxsv] found RBA: 0x%x\n", reg("r10"));
}
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+1325 /pid == $1/ {
printf("\t\t\t\t[kcbbxsv] HASH: 0x%x, BLOCK ADDR: 0x%x, HANDLE ADDR: 0x%x\n", reg("dx"), reg("dx")-176, reg("bx")-176);
}
Fragment of the output from that bpftrace on DBWR process looks like this:
[root@dbVictim ~]# bpftrace dbwr.bpf 4952
Attaching 4 probes...
[kcbb_coalesce_int] Check the block at local address 0xdedaf000
[kcbbxsv] Searching for a block at local address: 0xdedaf000
[kcbbxsv] found RBA: 0x3000093
[kcbbxsv] HASH: 0x83f96bc0, BLOCK ADDR: 0x83f96b10, HANDLE ADDR: 0x762a2708
[kcbbxsv] HASH: 0x81faa990, BLOCK ADDR: 0x81faa8e0, HANDLE ADDR: 0x762a2708
[kcbbxsv] Checking neighbours: 0x3000094
[kcbbxsv] HASH: 0xa3fc4430, BLOCK ADDR: 0xa3fc4380, HANDLE ADDR: 0x77f051a0
[kcbbxsv] HASH: 0xaaf6e918, BLOCK ADDR: 0xaaf6e868, HANDLE ADDR: 0x77f051a0
[kcbbxsv] Checking neighbours: 0x3000092
[kcbbxsv] HASH: 0x81f88688, BLOCK ADDR: 0x81f885d8, HANDLE ADDR: 0x765fc7c8
[kcbb_coalesce_int] Check the block at local address 0xdedaf020
[kcbbxsv] Searching for a block at local address: 0xdedaf020
[kcbbxsv] found RBA: 0x24000d0
[kcbbxsv] HASH: 0xb7f9c728, BLOCK ADDR: 0xb7f9c678, HANDLE ADDR: 0x761e3098
[kcbbxsv] Checking neighbours: 0x24000d1
[kcbbxsv] HASH: 0xa3fc8eb8, BLOCK ADDR: 0xa3fc8e08, HANDLE ADDR: 0x76750358
[kcbbxsv] Checking neighbours: 0x24000cf
[kcbbxsv] HASH: 0xb2f86900, BLOCK ADDR: 0xb2f86850, HANDLE ADDR: 0x7653ddd8
[kcbb_coalesce_int] Check the block at local address 0xdedaf040
[kcbbxsv] Searching for a block at local address: 0xdedaf040
[kcbbxsv] found RBA: 0x2400282
[kcbbxsv] HASH: 0x99fa6d18, BLOCK ADDR: 0x99fa6c68, HANDLE ADDR: 0x763e0bc8
[kcbbxsv] Checking neighbours: 0x2400283
[kcbbxsv] HASH: 0x98f6d6d0, BLOCK ADDR: 0x98f6d620, HANDLE ADDR: 0x76085e78
[kcbbxsv] Checking neighbours: 0x2400281
[kcbbxsv] HASH: 0x98f9bd50, BLOCK ADDR: 0x98f9bca0, HANDLE ADDR: 0x7673b908
We can see three interesting things going on in here:
- When DBWR goes through a block list, it checks some local structures that are 20 bytes long – you can see that by the sequence of addresses: 0xdedaf000 => 0xdedaf020 => 0xdedaf040
- DBWR is traversing the hash chain, while looking for a block that it wants to checkpoint
- DBWR is always checking if a neighbor of a given block needs to be checkpointed as well – it checks one block before and one after!
So if any block in the same hash chain or any neighbor was modified – DBWR will write it to disk!
The big question is: can fool DBWR to force it to write a database block that I want, instead the one that was meant to be written to disk?
Let’s use GDB and find out 🙂
I will set a breakpoint on kcbb_coalesce_int to analyze what is going on inside my DBWR:
(gdb) b kcbb_coalesce_int
Breakpoint 1 at 0x122e8cb0
(gdb) c
Continuing.
Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int ()
(gdb) info reg
rax 0x7f63df402e18 140066924015128
rbx 0x0 0
rcx 0x7ffcdfa45ed4 140724060577492
rdx 0x7f63dedaf020 140066917380128
rsi 0x3 3
rdi 0x7f63e4f1b440 140067019535424
rbp 0x7ffcdfa45980 0x7ffcdfa45980
rsp 0x7ffcdfa454b8 0x7ffcdfa454b8
r8 0x0 0
r9 0xffffffffffffffff -1
r10 0x0 0
r11 0x3c7fc071 1015005297
r12 0x7f63e4fca9a0 140067020253600
r13 0x7f63e501b3c0 140067020583872
r14 0x7f63e501b3c0 140067020583872
r15 0x76f77cc8 1995930824
rip 0x122e8cb0 0x122e8cb0 <kcbb_coalesce_int>
eflags 0x246 [ PF ZF IF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
Checkout the value of RDX registry: 0x7f63dedaf000 – does it look familiar? And if I print only the lower bits?
(gdb) p/x $edx
$1 = 0xdedaf020
Yup, that’s my address alright! Let’s see what it holds:
(gdb) x/5x $rdx
0x7f63dedaf020: 0x81faa8e0 0x00000000 0x00000005 0x00000004
0x7f63dedaf030: 0x03000093
That looks promising! So I have a 20B structure as I suspected where the first 4B is a block header address and the last 4B is RBA! The other bytes, probably represent data necessary to calculate hash and locate a hash chain.
So the byte string, that represents our block is: "e0a8fa8100000000050000000400000093000003″
If I want a database writer to be forced to write this piece of data to disk (for example after modifying the unencrypted copy of the block in memory) I could set a breakpoint on kcbb_coalesce_int and wait a bit:
(gdb) b kcbb_coalesce_int
Breakpoint 1 at 0x122e8cb0
(gdb) c
Continuing.
Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int ()
(gdb) p/x $rdx
$1 = 0x7f63dedaf000
Now I just have inject my byte string at the address pointed by $RDX registry:
>>> import binascii
>>> f = open("/proc/4952/mem", "rb+")
>>> f.seek(int('0x7f63dedaf000',16))
>>> x = binascii.unhexlify("e0a8fa8100000000050000000400000093000003")
>>> f.write(x)
>>> f.close()
And we have it! After continuing DBWR will write the block, that we want to be written!
Cool isn’t it? I hope to write here more about the internal buffer cache structures, DBWR and SMON soon, since I have some more discoveries to share 🙂
P.S.
Many thanks to Radek Kut for his help with spotting patterns 🙂