Tags: bruteforce ghidra gdb 

Rating: 4.5

The challenge gives an executable `vm` and a `bin` file that is a program that run in that vm. The usage is `./vm bin`. To understand the binary I began by analyzing the vm file in ghidra. The function to start the VM is as follows:

```C
machine * vm_create(void *bin,void *bin_size)

{
machine *m;
void *mem_ptr;

m = (machine *)malloc(168);
m->pc_offset = 0;
m->should_halt = false;
m->set_to_zero = 0;
memset(m->initialized,0,128);
mem_ptr = calloc(0x10000,1);
m->instr_mem_0x10000 = mem_ptr;
/* First 3 bytes do not go to mem, it`s a header "UwU"*/
memcpy(m->instr_mem_0x10000,(void *)((long)bin + 3),(long)bin_size - 3);
mem_ptr = calloc(0x200,4);
m->0x200_mem = mem_ptr;
return m;
}
```

Every single operation in this function was an offset from the same variable, so I created an `machine` struct to help to keep track of these ofsets as shown in the code above. Over the full course of the challenge, this is what I think the offsets mean:

|offset |type | name|
:---: | :---: | :---: |
|0|int4|program_counter|
|1|bool|should_halt|
8|byte[0x80]|ram_mem
144|byte[0x10000]|instr_mem
152|byte[0x200]|stack_mem
160|int4|stack_pointer

Each vm operation is executed through this function:
```C
void vm_step(machine *m)

{
byte opcode;

opcode = *(byte *)((ulong)(uint)m->pc_offset + (long)m->instr_mem_0x10000);
if (25 < cur_instr_addr) {
puts("dead");
exit(0);
}
(*(code *)original_ops[(int)(uint)opcode])(m);
return;
}
```
This piece of code shows that `original_ops` is a function pointer array that describes all of the vm's instructions. These adresses still had their symbols, so the instruction names were: "vm_add", "vm_addi", "vm_sub", "vm_subi", "vm_mul", "vm_muli", "vm_div", "vm_cmp", "vm_jmp", "vm_inv", "vm_push", "vm_pop", "vm_mov", "vm_nop", "vm_exit", "vm_print", "vm_putc", "vm_je", "vm_jne", "vm_jle", "vm_jge", "vm_xor", "vm_store", "vm_load" and "vm_input".

These functions read like assembly, so it's easy to get an idea of what they do. To understand how they work however, it is necessary to do more reverse engineering. I did a lazy reverse engineering by only analysing the functions as they appear in the `bin` file. This python script creates a very rough disassembly:

```python
from sys import argv

inst_list = ["vm_add", "vm_addi", "vm_sub", "vm_subi", "vm_mul", "vm_muli", "vm_div", "vm_cmp", "vm_jmp", "vm_inv", "vm_push", "vm_pop", "vm_mov", "vm_nop", "vm_exit", "vm_print", "vm_putc", "vm_je", "vm_jne", "vm_jle", "vm_jge", "vm_xor", "vm_store", "vm_load", "vm_input"]

if len(argv) < 2:
print("missing filename")
exit

with open(argv[1], "rb") as fp:
file = fp.read()
instr = file[3:]

pc = 0
while pc < len(instr):
if instr[pc] == 0: # add
dest = hex(instr[pc+1])
arg1 = hex(instr[pc+2])
arg2 = hex(instr[pc+3])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{dest}] <= [{arg1}]+[{arg2}]")
pc += 6
elif instr[pc] == 1: # addi
dest = hex(instr[pc+1])
orig = hex(instr[pc+2])
imm = instr[pc+3]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{dest}] <= [{orig}]+{imm}")
pc += 6
elif instr[pc] == 5: # muli
dest = hex(instr[pc+1])
orig = hex(instr[pc+2])
imm = instr[pc+3]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{dest}] <= [{orig}]*{imm}")
pc += 6
elif instr[pc] == 9: # syscall
syscall = hex(instr[pc+1])
num_param = instr[pc+2]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} syscall {syscall} with {num_param} parameters")
pc += 6
elif instr[pc] == 10: # push
val = hex(instr[pc+1])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} {val}")
pc += 6
elif instr[pc] == 11: # pop
val = hex(instr[pc+1])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{val}]")
pc += 6
elif instr[pc] == 12: # mov
pos = hex(instr[pc+1])
val = instr[pc+2]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{pos}] <= {val}")
pc += 6
elif instr[pc] == 14: # exit
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} (halt_flag=true)")
pc += 6
elif instr[pc] == 16: # putc
c = chr(instr[pc+1])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} {c}")
pc += 6
elif instr[pc] == 17: # je
arg1 = hex(instr[pc+1])
arg2 = hex(instr[pc+2])
arg3 = instr[pc+3]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} ([{arg1}] == [{arg2}])? goto {hex(6*arg3)} : goto {hex(pc+6)}")
pc += 6
elif instr[pc] == 19: # jle
arg1 = hex(instr[pc+1])
arg2 = hex(instr[pc+2])
arg3 = instr[pc+3]
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} ([{arg1}] <= [{arg2}])? goto {hex(6*arg3)} : goto {hex(pc+6)}")
pc += 6
elif instr[pc] == 21: # xor
dest = hex(instr[pc+1])
arg1 = hex(instr[pc+2])
arg2 = hex(instr[pc+3])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{dest}] <= [{arg1}]^[{arg2}]")
pc += 6
elif instr[pc] == 22: # store
pos = hex(instr[pc+1])
val = hex(instr[pc+2])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} code[{pos}] <= [{val}]")
pc += 6
elif instr[pc] == 23: # load
arg1 = hex(instr[pc+1])
arg2 = hex(instr[pc+2])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{arg1}] <= code[{arg2}]")
pc += 6
elif instr[pc] == 24: # input
pos = hex(instr[pc+1])
print(f"{hex(pc)}:\t{inst_list[instr[pc]]} [{pos}]")
pc += 6

elif instr[pc] < 25:
print(f"unknown instruction at {hex(pc)}: {instr[pc]}!")
exit()
else:
print(f"{hex(pc)}: data - {chr(instr[pc])}")
pc += 1
```

This is the disassembly of the `bin` file:

```
0x0: vm_putc [
0x6: vm_putc M
0xc: vm_putc a
0x12: vm_putc i
0x18: vm_putc n
0x1e: vm_putc
0x24: vm_putc V
0x2a: vm_putc e
0x30: vm_putc s
0x36: vm_putc s
0x3c: vm_putc e
0x42: vm_putc l
0x48: vm_putc
0x4e: vm_putc T
0x54: vm_putc e
0x5a: vm_putc r
0x60: vm_putc m
0x66: vm_putc i
0x6c: vm_putc n
0x72: vm_putc a
0x78: vm_putc l
0x7e: vm_putc ]
0x84: vm_putc '\n'
0x8a: vm_putc <
0x90: vm_putc
0x96: vm_putc E
0x9c: vm_putc n
0xa2: vm_putc t
0xa8: vm_putc e
0xae: vm_putc r
0xb4: vm_putc
0xba: vm_putc k
0xc0: vm_putc e
0xc6: vm_putc y
0xcc: vm_putc c
0xd2: vm_putc o
0xd8: vm_putc d
0xde: vm_putc e
0xe4: vm_putc
0xea: vm_putc '\n'
0xf0: vm_putc >
0xf6: vm_putc
0xfc: vm_mov [0x1e] <= 160
0x102: vm_mov [0x1c] <= 0
0x108: vm_mov [0x1d] <= 17
0x10e: vm_input [0x19]
0x114: vm_store code[0x1e] <= [0x19]
0x11a: vm_addi [0x1e] <= [0x1e]+1
0x120: vm_addi [0x1c] <= [0x1c]+1
0x126: vm_jle ([0x1c] <= [0x1d])? goto 0x10e : goto 0x12c
0x12c: vm_mov [0x1e] <= 4
0x132: vm_mov [0x1f] <= 160
0x138: vm_mov [0x1c] <= 0
0x13e: vm_mov [0x1d] <= 10
0x144: vm_mov [0x1b] <= 169
0x14a: vm_mov [0x17] <= 0
0x150: vm_load [0x19] <= code[0x1e]
0x156: vm_load [0x18] <= code[0x1f]
0x15c: vm_xor [0x19] <= [0x19]^[0x1b]
0x162: vm_je ([0x19] == [0x18])? goto 0x1d4 : goto 0x168
0x168: vm_putc U
0x16e: vm_putc n
0x174: vm_putc k
0x17a: vm_putc n
0x180: vm_putc o
0x186: vm_putc w
0x18c: vm_putc n
0x192: vm_putc
0x198: vm_putc k
0x19e: vm_putc e
0x1a4: vm_putc y
0x1aa: vm_putc c
0x1b0: vm_putc o
0x1b6: vm_putc d
0x1bc: vm_putc e
0x1c2: vm_putc !
0x1c8: vm_putc '\n'
0x1ce: vm_exit (halt_flag=true)
0x1d4: vm_addi [0x1e] <= [0x1e]+1
0x1da: vm_addi [0x1f] <= [0x1f]+1
0x1e0: vm_addi [0x1c] <= [0x1c]+1
0x1e6: vm_jle ([0x1c] <= [0x1d])? goto 0x150 : goto 0x1ec
0x1ec: vm_mov [0xf] <= 0
0x1f2: vm_push 0xf
0x1f8: vm_push 0xf
0x1fe: vm_push 0xf
0x204: vm_inv syscall 0x65 with 3 parameters
0x20a: vm_mov [0x10] <= 0
0x210: vm_je ([0x1f] == [0x10])? goto 0x288 : goto 0x216
0x216: vm_putc T
0x21c: vm_putc e
0x222: vm_putc r
0x228: vm_putc m
0x22e: vm_putc i
0x234: vm_putc n
0x23a: vm_putc a
0x240: vm_putc l
0x246: vm_putc
0x24c: vm_putc b
0x252: vm_putc l
0x258: vm_putc o
0x25e: vm_putc c
0x264: vm_putc k
0x26a: vm_putc e
0x270: vm_putc d
0x276: vm_putc !
0x27c: vm_putc '\n'
0x282: vm_exit (halt_flag=true)
0x288: vm_mov [0x1e] <= 119
0x28e: vm_muli [0x1e] <= [0x1e]*6
0x294: vm_mov [0x1c] <= 0
0x29a: vm_mov [0x1d] <= 220
0x2a0: vm_mov [0x1b] <= 69
0x2a6: vm_load [0x19] <= code[0x1e]
0x2ac: vm_xor [0x19] <= [0x19]^[0x1b]
0x2b2: vm_store code[0x1e] <= [0x19]
0x2b8: vm_addi [0x1e] <= [0x1e]+1
0x2be: vm_addi [0x1c] <= [0x1c]+1
0x2c4: vm_jle ([0x1c] <= [0x1d])? goto 0x2a6 : goto 0x2ca
```

This code starts by getting the user pointer and then comparing to a string decoded with a xor key, being the comparison instruction the vm_je at 0x1e6. The easiest way I found to get the key was to put a breakpoint at vm_je with `bp vm_je` and defining a hook to change what I wrote with the right char and to print it on the screen.

```
define hook-stop
set *(int*)($rdi+0x68)=*(int*)($rdi+0x6c)
p/c *(char*)($rdi+0x6c)
end
```

Then, running the vm with any input just running continue in gdb would give the key: `c0d3_r3d_5h`!

After that, the program checked if it was in a terminal with a syscall and exited if it was not. That is a problem for running gdb, but can easily be bypassed by changing the vm_je parameters with an hex editor, since giving the same adress twice guarantees equality.

The worst part is what comes next :(

The program decodes with a xor key some section of data and jumps to it. This means the binary has self modifying code and unpacks the part of itself which actualy tests for the flag.

To analyse this part of code I used a breakpoint to stop the vm in a struction after the unpacking was done and dumped the whole memory with `dump binary memory unpacked_bin *(char *)($rdi+144) *(char *)($rdi+144)+0x10000`. (To do this it necessary to do the gdb bypass described above).

Then I added the "UwU" header in the file and ran it through the disassembler again. The new piece code is as follows (this one needed more deep analysis so I had to do some formatting and commenting):

```
; print enter secret phrase

; 0x2ca offset

0x90: vm_mov [0x1e] <= 48 ; a
0x96: vm_mov [0x1c] <= 0 ; i
0x9c: vm_mov [0x1d] <= 36 ; max_i
read_loop:
0xa2: vm_input [0x19]
0xa8: vm_store code[0x1e] <= [0x19]
0xae: vm_addi [0x1e] <= [0x1e]+1
0xb4: vm_addi [0x1c] <= [0x1c]+1 ; a++, i++
0xba: vm_jle ([0x1c] <= [0x1d])? goto read_loop : goto 0xc0

; for i in range(36):
; read(0x19+i)

0xc0: vm_mov [0x1c] <= 0 ; j
0xc6: vm_mov [0x1d] <= 35 ; max_j
outer_loop:

0xcc: vm_mov [0x1e] <= 48 ; a
0xd2: vm_mov [0x1f] <= 148 ; b
0xd8: vm_mov [0x1a] <= 0 ; i
0xde: vm_mov [0x1b] <= 35 ; max_i
inner_loop:
0xe4: vm_load [0x14] <= code[0x1e]
0xea: vm_load [0x15] <= code[0x1f]
0xf0: vm_push 0x14
0xf6: vm_pop [0x13]
0xfc: vm_mov [0x12] <= 48
0x102: vm_add [0x12] <= [0x12]+[0x15]
0x108: vm_load [0x11] <= code[0x12]
; adds 48 to b and reads memory
0x10e: vm_store code[0x1e] <= [0x11] ; secret stored in b+48
0x114: vm_store code[0x12] <= [0x13] ; user input
; stores result on user input and stores user_input backup at b+48
0x11a: vm_addi [0x1a] <= [0x1a]+1
0x120: vm_addi [0x1e] <= [0x1e]+1
0x126: vm_addi [0x1f] <= [0x1f]+1 ; i++ , a++, b++
0x12c: vm_jle ([0x1a] <= [0x1b])? goto inner_loop : goto 0x132

0x132: vm_mov [0x1e] <= 48 ; a
0x138: vm_mov [0x1f] <= 248 ; b_base
0x13e: vm_mov [0x1a] <= 0 ; i
0x144: vm_mov [0x1b] <= 35 ; max_i
second_loop:
0x14a: vm_load [0x14] <= code[0x1e]
0x150: vm_push 0x1f
0x156: vm_pop [0xf]
0x15c: vm_add [0xf] <= [0xf]+[0x1c]
0x162: vm_load [0x10] <= code[0xf] ; read b_base+j
0x168: vm_xor [0x14] <= [0x14]^[0x10]
0x16e: vm_store code[0x1e] <= [0x14]
0x174: vm_addi [0x1a] <= [0x1a]+1
0x17a: vm_addi [0x1e] <= [0x1e]+1 ; i++, a++
0x180: vm_jle ([0x1a] <= [0x1b])? goto second_loop : goto 0x186

0x186: vm_addi [0x1c] <= [0x1c]+1 ; j++

0x18c: vm_jle ([0x1c] <= [0x1d])? goto outer_loop : goto 0x192

0x192: vm_mov [0x1e] <= 48 ; a
0x198: vm_mov [0x1f] <= 92 ; b
0x19e: vm_mov [0x1a] <= 0 ; i
0x1a4: vm_mov [0x1b] <= 35 ; max_i
access_loop:
0x1aa: vm_load [0xf] <= code[0x1e]
0x1b0: vm_load [0x10] <= code[0x1f]
0x1b6: vm_je ([0xf] == [0x10])? goto next : goto 0x1bc

; print Wrong
0x1e6: vm_exit (halt_flag=true)

next:
0x1ec: vm_addi [0x1a] <= [0x1a]+1
0x1f2: vm_addi [0x1e] <= [0x1e]+1
0x1f8: vm_addi [0x1f] <= [0x1f]+1 ; i++, a++, b++
0x1fe: vm_jle ([0x1a] <= [0x1b])? goto access_loop : goto 0x204

; print acess granted
0x2be: vm_exit (halt_flag=true)
```

This time there's a more complex verification that does a lot of xoring shuffling in the user input before the actual comparison, so just breaking at the comparison instruction and printing it's value does not work. I could try to understand what this code does and to do it in reverse, but I'm too lazy to do that and used a bruteforce aproach.

Before the bruteforce I did some changes to the `unpacked_bin` file. First I put a jump in the beginning to go right to the second user input read loop, and put a `vm_putc @` after the comparison at 0x1b6 to be able to track if I got a more partially correct input. This modified `unpacked_bin` is available here with after a gzip and base64 encoding:

```
H4sICM9EGGQAA3NlY19yZWFkX21vZADt1MtLVGEYx/EzeSlHc86Z+80zY0aLKNAGpxYRthiCoFZp
i5rEZiYtp9EsGYQg6B8Ioo1SFLRqE9EfELSKCndFCy+LNi6irQRtOu97yHnyDJEpbfp+Fj+P73nP
e310uD5s+Xz3DMM8YzjMMZ1XddZ0ZnWO6KzovCnSbamKnudE+4zO654xx8RXRZ1+ncfFOAXR/5Zn
TLfPpGif01nSOaWzLN5mxSwnGi1d9pOAk2n93GM5GU+o56itfvhs2+dkOu1kKN1zWPVvNZ3MyK/U
oF2pZypjqiWWsFXGM05GEomUYViJ+FlnxmGxo0nx7K627jnzP99db2N33Sp+rjyTkes/5qwwoN76
PdlRaVHrN9WzZaqb2fI9umu+LFqmxJrlXsrN19xlqyNos+32xtkut6mzLWycqnueTW/nxl/Uj6zh
knhbEV+5Pad1Tog+Y54RmlRXv9XYy37jt9V1v9Gzr/Ft5oHKpG5PqfZYWFdXRFWXP6zaO0P6bVD1
N4LBiPPWCupZVEM0qF77kknfr1URSqYeNWb51nQWf0aPryvECASc1cVM9UskHDbV+OFNIztjPhc7
mm+MfzG4afyA3oWpJrAC5lvnxM6Ls50SdzfevFq8OzKHVLuzhlcbt1kStyP/a2XFyPI2vdXi1uoh
T824lTAr+rsp/zrGxVdlsS/5l+7ZVwGAlwEAwP+k1F/Ojc7kyqODEyc1/87P8W5+cWF9aWFx/eHr
5ZXF+dU3Oz8FtiCUCPh3Gz3dUbPL15mJh/d0pJO9rfuCbSkr274rEttr97Vsd47oi1Mvfd9Xv9wt
Xiiat2u9o2uFg/m+D58OfHxqfV279H7p8crn7c5RKdZP54ZGqrNHi4O12pV8NT9dqV8bmCsOTN+p
1nJH8vmB4nbnAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAPDv+H8AqZHq5AQAAQA=
```

The bruteforce script is as follows:

```python
import subprocess
import string

# dictionary of already discovered chars
known_chars = {}
# number of @ printed (equals to chars that are right)
num_ats = 0
# number of chars in flag
N_CHARS = 36

i = 0
while num_ats < N_CHARS: # stops if all chars are correct

# no reason to keep trying a known char
if i in known_chars.keys():
i = (i+1)%N_CHARS
continue

# creates a new guess with known_chars and a random one
for c in string.printable:
cur_guess = []
for j in range(N_CHARS):
if j in known_chars.keys():
cur_guess.append(known_chars[j])
else:
cur_guess.append("A")
cur_guess[i] = c
guess = "".join(cur_guess)

# calls function with current guess and counts '@'s
p = subprocess.run(["./vm", "sec_read_mod"], input=f"{guess}".encode(),capture_output=True)
if(p.stdout.count(b"@") > num_ats):
print(guess)
known_chars[i] = c
num_ats += 1
break

# the tries may loop around the string because order is shuffled
i = (i+1)%N_CHARS
```

After around a minute the script yields the flag: `HTB{5w1rl_4r0und_7h3_4l13n_l4ngu4g3}`