So we continue our journey to understand the nature of buffer cache and forcing DBWR process to write our changes to disk. If you want to follow this thread – you should get familiar with at least those blog posts:
Until now we have established, that DBWR will write to disk a database block changed in memory by us if another block from the same table is affected by a running transaction… but not just any block. In this article I will try to explain what is actually going on under the hoods by using bpftrace, GDB and my own tools.
Thanks to a great book by Jonathan Lewis "Oracle Core: Essential Internals for DBAs and Developers" I was able to mark some fields of X$BH to a Rust structure.
To check the addresses of buffer headers we can use oradebug:
3
SQL> oradebug dump buffers 1
5
SQL> oradebug tracefile_name
6
/opt/oracle/diag/rdbms/orclcdb/ORCLCDB/trace/ORCLCDB_ora_141852.trc
Table HR.EMPLOYEES has 2 blocks with data in the buffer cache:
1
BH (0x81faa8e0) file#: 16 rdba: 0x03000093 (12/147) class: 1 ba: 0x8183c000
2
set: 10 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0
3
dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f
4
hash: [0x9ff92f48,0x762a27b8] lru: [0x80fe5d98,0x81faea78]
5
ckptq: [NULL] fileq: [NULL]
6
objq: [0x63125eb0,0x63125eb0] objaq: [0x63125ea0,0x63125ea0]
7
st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1
8
flags: only_sequential_access
9
LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
1
BH (0xaaf6e868) file#: 16 rdba: 0x03000094 (12/148) class: 1 ba: 0xaa2e6000
2
set: 11 pool: 3 bsz: 8192 bsi: 0 sflg: 0 pwc: 0,0
3
dbwrid: 0 obj: 73298 objn: 73298 tsn: [4/5] afn: 16 hint: f
4
hash: [0x9df58550,0x77f05250] lru: [0xacfcd810,0x90fbf878]
5
ckptq: [NULL] fileq: [NULL]
6
objq: [0x6cc05e20,0x6cc05e20] objaq: [0x6cc05e10,0x6cc05e10]
7
st: XCURRENT md: NULL fpin: 'kdswh01: kdstgr' fscn: 0x718d4c tch: 1
8
flags: only_sequential_access
9
LRBA: [0x0.0.0] LSCN: [0x0] HSCN: [0x0] HSUB: [65535]
Block 147 (rdba: 0x03000093) is located at address 0x81faa8e0 and block 148 (rdba: 0x03000094) is located at address 0xaaf6e868.
Soon you will find a new tool for Buffer Cache visualization on my GitHub, for now the code looks like this:
5
use binread::BinReaderExt;
7
use std::{
thread
,
time
};
92
#[br(count = ktbbhict)]
93
ktbbhitl: Vec<Ktbbhitl>,
98
#[derive(Parser, Debug)]
99
#[clap(author, version, about, long_about = None)]
110
#[clap(default_value="none", short, long)]
116
let args = Args::parse();
117
let sleep_interval =
time
::Duration::from_millis(100);
120
let fname = format!(
"/proc/{}/mem"
, args.pid);
121
let mut f = File::open(fname).unwrap();
124
f.seek(SeekFrom::Start(args.start_bh)).unwrap();
125
let xbh: Xbh = f.read_ne().unwrap();
127
std::process::Command::
new
(
"clear"
).status().unwrap();
128
println!(
"BH ADDR: 0x{:x}"
, args.start_bh);
129
println!(
"DATA OBJECT ID: {} \t BLOCK: {} 0x{:x}\n"
, xbh.objd, xbh.rdba % 4194304, xbh.rdba);
130
println!(
"CKPT[0x{:x},0x{:x}] \t HASH[0x{:x},0x{:x}] \t REPL[0x{:x},0x{:x}]"
, xbh.ckpt1, xbh.ckpt2, xbh.nxt_hash, xbh.prv_hash, xbh.nxt_repl, xbh.prv_repl);
131
println!(
"\nDIRTY FLAG: {} 0x{:x} \t BA: {} 0x{:x} \t TCH: {}"
, xbh.dirty_flag, xbh.dirty_flag, xbh.ba, xbh.ba, xbh.tch);
133
if
args.option ==
"none"
|| args.option ==
"chains"
{
134
f.seek(SeekFrom::Start(xbh.ba)).unwrap();
135
let kcbh: Kcbh = f.read_ne().unwrap();
136
let ktbbh: Ktbbh = f.read_ne().unwrap();
137
println!(
"\nBLOCK SCN: {} BLOCK CHECKSUM: {}"
, kcbh.bas_kcbh, kcbh.chkval_kcbh);
138
println!(
"Number of ITLs: {} (kscnbas = {})"
, ktbbh.ktbbhict, ktbbh.ktbbhcsc.kscnbas);
139
for
itl in ktbbh.ktbbhitl {
140
println!(
"\tXID {:04x}.{:04x}.{:08x}\tSCN: {:12} \t\tFLAG: {:04x}"
, itl.kxidusn, itl.kxidslt, itl.kxidsqn, itl.ktbitbas, itl.ktbitflg);
144
if
args.option ==
"chains"
{
145
println!(
"\nBlocks in the same hash chain\n"
);
146
let start_hash = args.start_bh + 176;
147
let mut next_hash = xbh.nxt_hash;
148
while
start_hash != next_hash {
149
let addr = next_hash - 176;
150
f.seek(SeekFrom::Start(addr)).unwrap();
151
let xbh2: Xbh = f.read_ne().unwrap();
152
if
xbh2.dirty_flag < 1000000000 {
153
println!(
"\t=> ADDR: 0x{:x} \t RBA: 0x{:08x} \t OBJD: {:8} \t HASH[0x{:x},0x{:x}]"
, addr, xbh2.rdba, xbh2.objd, xbh2.nxt_hash, xbh2.prv_hash);
155
println!(
"\t=> ADDR: 0x{:x} \t HASH CHAIN HANDLE (?) \t\t\t\t HASH[0x{:x},0x{:x}]"
, addr, xbh2.nxt_hash, xbh2.prv_hash);
157
next_hash = xbh2.nxt_hash;
161
thread
::sleep(sleep_interval);
Once I learned the buffer header addresses, I can use my tool to show the most important information:
2
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
4
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
6
DIRTY FLAG: 524288 0x80000 BA: 2172895232 0x8183c000 TCH: 1
8
BLOCK SCN: 6429846 BLOCK CHECKSUM: 36605
9
Number of ITLs: 3 (kscnbas = 6429845)
10
XID 0003.0013.000004ea SCN: 6429846 FLAG: 2001
11
XID 0002.0007.00000491 SCN: 6429615 FLAG: 8000
12
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
14
Blocks in the same hash chain
16
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
17
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
2
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
4
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
6
DIRTY FLAG: 524288 0x80000 BA: 2855165952 0xaa2e6000 TCH: 1
8
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
9
Number of ITLs: 3 (kscnbas = 6429781)
10
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
11
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
12
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
14
Blocks in the same hash chain
16
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
17
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
As you can see, both blocks are in a different hash chains, but as I showed in this article setting dirty flag on one of those blocks will cause DBWR to write a block to disk if the other block is affected by a transaction.
So when I start a legal transaction on a block, I’ll see buffer header and block ITL summary looking like this:
2
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
4
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
6
DIRTY FLAG: 8 0x8 BA: 2172895232 0x8183c000 TCH: 2
8
BLOCK SCN: 6429846 BLOCK CHECKSUM: 0
9
Number of ITLs: 3 (kscnbas = 7612911)
10
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
11
XID 000a.0017.00000549 SCN: 0 FLAG: 0001
12
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
14
Blocks in the same hash chain
16
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
17
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
Please notice, that the block checksum was set to 0 and we have an ITL slot allocated. After the commit, the buffer header looks like this:
2
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
4
CKPT[0x76855fe8,0xc0f870c0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
6
DIRTY FLAG: 1 0x1 BA: 2172895232 0x8183c000 TCH: 2
8
BLOCK SCN: 7612940 BLOCK CHECKSUM: 0
9
Number of ITLs: 3 (kscnbas = 7612911)
10
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
11
XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001
12
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
14
Blocks in the same hash chain
16
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
17
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
The dirty flag was set to 1, the block checksum is still 0 and SCN of affecting transaction was set, together with a transaction flag.
I can set a dirty flag manually on the second block with rico2
2
Python
2.7
.
18
(default, Apr
5
2022
,
05
:
50
:
43
)
3
[GCC
8.5
.
0
20210514
(Red Hat
8.5
.
0
-
10.0
.
1
)] on linux2
4
Type
"help"
,
"copyright"
,
"credits"
or
"license"
for
more information.
6
>>> r
=
rico2.Rico(
4952
)
7
>>> r.set_dirty_flag_bh(
int
(
'0xaaf6e868'
,
16
)
+
216
)
Now my second block has also a dirty flag, tho nothing else has changed:
2
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
4
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
6
DIRTY FLAG: 1 0x1 BA: 2855165952 0xaa2e6000 TCH: 1
8
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
9
Number of ITLs: 3 (kscnbas = 6429781)
10
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
11
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
12
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
14
Blocks in the same hash chain
16
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
17
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
But after the checkpoint situation looks like this:
2
DATA OBJECT ID: 73298 BLOCK: 147 0x3000093
4
CKPT[0x81faa8e0,0x81faa8e0] HASH[0x762a27b8,0x83f96bc0] REPL[0x80fe5d98,0x81faea78]
6
DIRTY FLAG: 2097152 0x200000 BA: 2172895232 0x8183c000 TCH: 2
8
BLOCK SCN: 7612940 BLOCK CHECKSUM: 33663
9
Number of ITLs: 3 (kscnbas = 7612911)
10
XID 0003.0013.000004ea SCN: 6429846 FLAG: 8000
11
XID 000a.0017.00000549 SCN: 7612940 FLAG: 2001
12
XID 0001.0008.000004b0 SCN: 6429822 FLAG: 8000
14
Blocks in the same hash chain
16
=> ADDR: 0x762a2708 HASH CHAIN HANDLE (?) HASH[0x83f96bc0,0x81faa990]
17
=> ADDR: 0x83f96b10 RBA: 0x00c1506d OBJD: 81112 HASH[0x81faa990,0x762a27b8]
2
DATA OBJECT ID: 73298 BLOCK: 148 0x3000094
4
CKPT[0xaaf6e868,0xaaf6e868] HASH[0x77f05250,0xa3fc4430] REPL[0xacfcd810,0x90fbf878]
6
DIRTY FLAG: 2097152 0x200000 BA: 2855165952 0xaa2e6000 TCH: 1
8
BLOCK SCN: 6429782 BLOCK CHECKSUM: 1404
9
Number of ITLs: 3 (kscnbas = 6429781)
10
XID 000a.0017.0000042c SCN: 4969276 FLAG: 8000
11
XID 0007.0019.00000436 SCN: 4974019 FLAG: 8000
12
XID 000a.0009.0000048e SCN: 6429782 FLAG: 2001
14
Blocks in the same hash chain
16
=> ADDR: 0x77f051a0 HASH CHAIN HANDLE (?) HASH[0xa3fc4430,0xaaf6e918]
17
=> ADDR: 0xa3fc4380 RBA: 0x00c1506e OBJD: 81112 HASH[0xaaf6e918,0x77f05250]
Dirty flags on both blocks has changed and both blocks were written to disk! To understand what is going on, I had to dig a bit in DBWR functions and I used the following BPFTRACE script to show you the details:
1
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbb_coalesce_int+2 /pid == $1/ {
2
printf
(
"[kcbb_coalesce_int] Check the block at local address 0x%x\n"
, arg2);
4
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+2 /pid == $1/ {
6
printf
(
"\t[kcbbxsv] Searching for a block at local address: 0x%x\n"
, arg2);
9
printf
(
"\t[kcbbxsv] Checking neighbours: 0x%x\n"
, arg5);
12
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+607 /pid == $1/ {
13
printf
(
"\t\t\t[kcbbxsv] found RBA: 0x%x\n"
, reg(
"r10"
));
15
uprobe:/opt/oracle/product/19c/dbhome_1/bin/oracle:kcbbxsv+1325 /pid == $1/ {
16
printf
(
"\t\t\t\t[kcbbxsv] HASH: 0x%x, BLOCK ADDR: 0x%x, HANDLE ADDR: 0x%x\n"
, reg(
"dx"
), reg(
"dx"
)-176, reg(
"bx"
)-176);
Fragment of the output from that bpftrace on DBWR process looks like this:
1
[root@dbVictim ~]# bpftrace dbwr.bpf 4952
3
[kcbb_coalesce_int] Check the block at local address 0xdedaf000
4
[kcbbxsv] Searching for a block at local address: 0xdedaf000
5
[kcbbxsv] found RBA: 0x3000093
6
[kcbbxsv] HASH: 0x83f96bc0, BLOCK ADDR: 0x83f96b10, HANDLE ADDR: 0x762a2708
7
[kcbbxsv] HASH: 0x81faa990, BLOCK ADDR: 0x81faa8e0, HANDLE ADDR: 0x762a2708
8
[kcbbxsv] Checking neighbours: 0x3000094
9
[kcbbxsv] HASH: 0xa3fc4430, BLOCK ADDR: 0xa3fc4380, HANDLE ADDR: 0x77f051a0
10
[kcbbxsv] HASH: 0xaaf6e918, BLOCK ADDR: 0xaaf6e868, HANDLE ADDR: 0x77f051a0
11
[kcbbxsv] Checking neighbours: 0x3000092
12
[kcbbxsv] HASH: 0x81f88688, BLOCK ADDR: 0x81f885d8, HANDLE ADDR: 0x765fc7c8
13
[kcbb_coalesce_int] Check the block at local address 0xdedaf020
14
[kcbbxsv] Searching for a block at local address: 0xdedaf020
15
[kcbbxsv] found RBA: 0x24000d0
16
[kcbbxsv] HASH: 0xb7f9c728, BLOCK ADDR: 0xb7f9c678, HANDLE ADDR: 0x761e3098
17
[kcbbxsv] Checking neighbours: 0x24000d1
18
[kcbbxsv] HASH: 0xa3fc8eb8, BLOCK ADDR: 0xa3fc8e08, HANDLE ADDR: 0x76750358
19
[kcbbxsv] Checking neighbours: 0x24000cf
20
[kcbbxsv] HASH: 0xb2f86900, BLOCK ADDR: 0xb2f86850, HANDLE ADDR: 0x7653ddd8
21
[kcbb_coalesce_int] Check the block at local address 0xdedaf040
22
[kcbbxsv] Searching for a block at local address: 0xdedaf040
23
[kcbbxsv] found RBA: 0x2400282
24
[kcbbxsv] HASH: 0x99fa6d18, BLOCK ADDR: 0x99fa6c68, HANDLE ADDR: 0x763e0bc8
25
[kcbbxsv] Checking neighbours: 0x2400283
26
[kcbbxsv] HASH: 0x98f6d6d0, BLOCK ADDR: 0x98f6d620, HANDLE ADDR: 0x76085e78
27
[kcbbxsv] Checking neighbours: 0x2400281
28
[kcbbxsv] HASH: 0x98f9bd50, BLOCK ADDR: 0x98f9bca0, HANDLE ADDR: 0x7673b908
We can see three interesting things going on in here:
When DBWR goes through a block list, it checks some local structures that are 20 bytes long – you can see that by the sequence of addresses: 0xdedaf000 => 0xdedaf020 => 0xdedaf040 DBWR is traversing the hash chain, while looking for a block that it wants to checkpoint DBWR is always checking if a neighbor of a given block needs to be checkpointed as well – it checks one block before and one after! So if any block in the same hash chain or any neighbor was modified – DBWR will write it to disk!
The big question is: can fool DBWR to force it to write a database block that I want, instead the one that was meant to be written to disk?
Let’s use GDB and find out 🙂
I will set a breakpoint on kcbb_coalesce_int to analyze what is going on inside my DBWR:
1
(gdb) b kcbb_coalesce_int
2
Breakpoint 1 at 0x122e8cb0
6
Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int ()
8
rax 0x7f63df402e18 140066924015128
10
rcx 0x7ffcdfa45ed4 140724060577492
11
rdx 0x7f63dedaf020 140066917380128
13
rdi 0x7f63e4f1b440 140067019535424
14
rbp 0x7ffcdfa45980 0x7ffcdfa45980
15
rsp 0x7ffcdfa454b8 0x7ffcdfa454b8
17
r9 0xffffffffffffffff -1
19
r11 0x3c7fc071 1015005297
20
r12 0x7f63e4fca9a0 140067020253600
21
r13 0x7f63e501b3c0 140067020583872
22
r14 0x7f63e501b3c0 140067020583872
23
r15 0x76f77cc8 1995930824
24
rip 0x122e8cb0 0x122e8cb0 <kcbb_coalesce_int>
25
eflags 0x246 [ PF ZF IF ]
Checkout the value of RDX registry: 0x7f63dedaf000 – does it look familiar? And if I print only the lower bits?
Yup, that’s my address alright! Let’s see what it holds:
2
0x7f63dedaf020: 0x81faa8e0 0x00000000 0x00000005 0x00000004
3
0x7f63dedaf030: 0x03000093
That looks promising! So I have a 20B structure as I suspected where the first 4B is a block header address and the last 4B is RBA! The other bytes, probably represent data necessary to calculate hash and locate a hash chain.
So the byte string, that represents our block is: "e0a8fa8100000000050000000400000093000003″
If I want a database writer to be forced to write this piece of data to disk (for example after modifying the unencrypted copy of the block in memory) I could set a breakpoint on kcbb_coalesce_int and wait a bit:
1
(gdb) b kcbb_coalesce_int
2
Breakpoint 1 at 0x122e8cb0
6
Breakpoint 1, 0x00000000122e8cb0 in kcbb_coalesce_int ()
Now I just have inject my byte string at the address pointed by $RDX registry:
2
>>> f = open("/proc/4952/mem", "rb+")
3
>>> f.seek(int('0x7f63dedaf000',16))
4
>>> x = binascii.unhexlify("e0a8fa8100000000050000000400000093000003")
And we have it! After continuing DBWR will write the block, that we want to be written!
Cool isn’t it? I hope to write here more about the internal buffer cache structures, DBWR and SMON soon, since I have some more discoveries to share 🙂
P.S.
Many thanks to Radek Kut for his help with spotting patterns 🙂