Tags: parallel reversing meetinthemiddle 

Rating:

[Link to original writeup](https://wrecktheline.com/writeups/m0lecon-2021/#parallel_the_m0le)

# parallel-the-m0le (11 solves, 301 points)
by JaGoTu

```
I was playing with some parallel programming but I'm so bad with it that I lost the flag, can you help me recover it?

Output: 2aad2e5a49fb2d9adb908dd00eb48c8a6607ab619f75b0272f3c1eb33fe9edaf

Author: spicyNduja
```

We get a `chall` file:

```
$ file chall
chall: ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, BuildID[sha1]=09a5be5a3d5bf899a01af46d34e656d751051e98, for GNU/Linux 3.2.0, not stripped
```

Let's start reversing with `main()`:

```C
int __cdecl main(int argc, const char **argv, const char **envp)
{
unsigned int v3; // eax
int i; // [rsp+14h] [rbp-1Ch]
int j; // [rsp+14h] [rbp-1Ch]
int k; // [rsp+14h] [rbp-1Ch]
int l; // [rsp+14h] [rbp-1Ch]
int m; // [rsp+14h] [rbp-1Ch]
pthread_t *pthreads; // [rsp+18h] [rbp-18h]
threadstruct *threads; // [rsp+20h] [rbp-10h]

if ( argc != 2 )
{
puts("Plz give me something :(");
exit(1);
}
if ( strlen(argv[1]) != 16 )
{
puts("I don't like the size of this input :(");
exit(2);
}
strcpy(flagbuff, argv[1]);
mutex = (pthread_mutex_t *)malloc(0x28uLL);
cv = (pthread_cond_t *)malloc(0x30uLL);
if ( pthread_mutex_init(mutex, 0LL) || pthread_cond_init(cv, 0LL) )
{
puts("something wrong happened");
exit(3);
}
pthreads = (pthread_t *)malloc(0x70uLL);
threads = (threadstruct *)malloc(0xE0uLL);
printf(
"%s\n\n",
"\n"
"\n"
"\n"
" _____ _ _ _ _______ _ __ __ ___ _ \n"
" | __ \\ | | | | |__ __| | | \\/ |/ _ \\| | \n"
" | |__) |_ _ _ __ __ _| | | ___| | | | | |__ ___| \\ / | | | | | ___ \n"
" | ___/ _` | '__/ _` | | |/ _ \\ | | | | '_ \\ / _ \\ |\\/| | | | | |/ _ \\ \n"
" | | | (_| | | | (_| | | | __/ | | | | | | | __/ | | | |_| | | __/ \n"
" |_| \\__,_|_| \\__,_|_|_|\\___|_| |_| |_| |_|\\___|_| |_|\\___/|_|\\___| \n"
" \n"
" \n");
printf("doing some quantistic machine learning crypto");
for ( i = 0; i <= 4; ++i )
{
sleep(1u);
fflush(stdout);
putchar(46);
}
puts("\n\n");
v3 = time(0LL);
srand(v3);
for ( j = 0; j <= 13; ++j )
{
threads[j].threadindex = j;
threads[j].pthread = pthreads[j];
pthread_create(&pthreads[j], 0LL, (void *(*)(void *))wrapper, &threads[j]);
}
for ( k = 0; k <= 13; ++k )
pthread_join(pthreads[k], 0LL);
print_str((__int64)flagbuff, 16LL, 0);
ord_ind = 0;
strcpy(flagbuff, argv[1]);
for ( l = 0; l <= 13; ++l )
{
threads[l].threadindex = l;
threads[l].pthread = pthreads[l];
pthread_create(&pthreads[l], 0LL, (void *(*)(void *))wrapper2, &threads[l]);
}
for ( m = 0; m <= 13; ++m )
pthread_join(pthreads[m], 0LL);
pthread_mutex_destroy(mutex);
pthread_cond_destroy(cv);
print_str((__int64)flagbuff, 16LL, 1);
putchar(10);
return 0;
}
```

So, first the flag is checked for being exactly 16 bytes long. After rand is seeded, 14 threads are created running `wrapper`, creating the first 16 bytes of output. Afther that, the flagbuff is restored to the input, and another 14 threads are created running `wrapper2`, creating the last 16 bytes of output.

Let's look at those wrapper functions:

```C
void __fastcall __noreturn wrapper(threadstruct *a1)
{
int v1; // eax
int v2; // eax

v1 = rand();
usleep(v1 % 1000);
pthread_mutex_lock(mutex);
v2 = ord_ind;
if ( ord_ind > 6 )
{
++ord_ind;
order[20 - v2] = a1->threadindex;
}
else
{
++ord_ind;
order[v2] = a1->threadindex;
}
((void (__fastcall *)(char *, unsigned __int64))funcs[a1->threadindex])(flagbuff, 16uLL);
pthread_mutex_unlock(mutex);
pthread_exit(0LL);
}

void __fastcall __noreturn wrapper2(threadstruct *a1)
{
pthread_mutex_lock(mutex);
while ( order[ord_ind] != a1->threadindex )
pthread_cond_wait(cv, mutex);
((void (__fastcall *)(char *, unsigned __int64))funcs[a1->threadindex])(flagbuff, 0x10uLL);
++ord_ind;
pthread_cond_broadcast(cv);
pthread_mutex_unlock(mutex);
pthread_exit(0LL);
}
```

In the first run, the threads will wake up in a random order (thanks to `usleep(v1 % 1000)`), write themselves into the `order` array and call their respective function on the flagbuff. In the second run, the threads are run in the order of the `order` array, also calling their function on flagbuff.

What this causes is that we get two outputs generated by two different function chains:

```
wrapper: A->B->C->D->E->F->G -> H->I->J->K->L->M->N
wrapper2: A->B->C->D->E->F->G -> N->M->L->K->J->I->H
```

To be able to get the plaintext from the provided output, we need to reverse engineer and invert all the funcs. As they are mostly small loops, I won't describe the process here, you can check the inverses in the resulting python script.

Now, trying all 14! = 87178291200 possible chains on the output we got doesn't sound too practical. However, we can notice that the first half of the chain is identical. That means that using a chain of `N'->M'->L'->K'->J'->I'->H'` on the first output and `H'->I'->J'->K'->L'->M'->N'` on the second output (the tick denoting transformation inverse), we should get the same result - a kind of a meet-in-the-middle attack. Bruteforcing all permutations of 7 out of 14 = 17297280 different options sounds way more doable. To make the bruteforce parallel, I took the first element of the chain as an argument, effectively spreading the workload to 14 cores.

```python
revs = [rev_1, rev_2, rev_3, rev_4, rev_5, rev_6, rev_7, rev_8, rev_9, rev_10, rev_11, rev_12, rev_13, rev_14]

target = binascii.unhexlify("2aad2e5a49fb2d9adb908dd00eb48c8a6607ab619f75b0272f3c1eb33fe9edaf")
target1 = target[:16]
target2 = target[16:]

def run_on(text, order):
buf = list(text)
for step in order:
revs[step](buf)
#print(hexdump(bytes(buf)))
return bytes(buf)

iter=0
possibles = list(range(14))
start = int(sys.argv[1])
possibles.remove(start)

for i in itertools.permutations(possibles, 6):
perm = (start, ) + i
iter +=1
if(iter % 100000 == 0):
print(iter)

middle1 = run_on(target1, perm)
middle2 = run_on(target2, perm[::-1])
if(middle1 == middle2):
print(middle1)
print(i)
```

```
$ for i in {0..13}; do python3 paramoly.py $i | tee -a out_$i & done
[...]
$ cat out* | grep '^(' -B 1
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(8, 13, 3, 9, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(8, 13, 9, 3, 11, 6)
--
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(7, 13, 3, 9, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(7, 13, 9, 3, 11, 6)
--
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 1, 7, 9, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 1, 9, 7, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 3, 1, 9, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 3, 9, 1, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 7, 3, 9, 11, 6)
--
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 7, 9, 3, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 9, 1, 7, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 9, 3, 1, 11, 6)
b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'
(13, 9, 7, 3, 11, 6)
```

Unfortunately the second half of the chain is not definite, having multiple possibilities, but the value of the flag in the middle of processing has only one possible value. We could run the same bruteforce for the first half of the chain, but I made it a bit lighter by observing that the functions 6, 9, 11 and 13 were definitely used in the second half of the chain and can't be used in the first, resulting in just 604800 options, which can easily run single threaded. Here is the full final script:

```python
import binascii
import itertools
import sys

def rev_13(buff):
for i in range(16):
buff[i] = (buff[i]-i) & 0xFF

def rev_14(buff):
for i in range(16):
if buff[i] > 0x40 and buff[i] <= 0x5a:
buff[i] += 0x20
elif buff[i] > 0x60 and buff[i] <= 0x7a:
buff[i] -= 0x20

def rev_12(buff):
for i in range(16):
buff[i] = (i ^ (~buff[i])) & 0xFF

def rev_11(buff):
for i in range(16):
bits = "{0:08b}".format(buff[i])
orig = bits[0]+bits[4]+bits[1]+bits[5]+bits[2]+bits[6]+bits[3]+bits[7]
buff[i] = int(orig, 2)

def rev_10(buff):
for i in range(16):
buff[i] = ((16*buff[i]) | (buff[i]>>4)) & 0xFF

def rev_9(buff):
for i in range(16):
buff[i] = (buff[i]-42) & 0xFF

def rev_8(buff):
for i in range(8):
(buff[i], buff[15-i]) = (buff[15-i], buff[i])

def rev_7(buff):
for i in range(16):
bits = "{0:08b}".format(buff[i])
orig = bits[::-1]
buff[i] = int(orig, 2)

def rev_6(buff):
for i in range(16):
buff[i] = (~buff[i]) & 0xFF

xorkey = "{reverse_fake_flag}ptm".encode()
def rev_5(buff):
for i in range(16):
buff[i] = (buff[i] ^ xorkey[i]) & 0xFF

def rev_4(buff):
for i in range(8):
buff[15-i] = (buff[i] ^ buff[15-i]) & 0xFF

rol = lambda val, r_bits, max_bits: \
(val << r_bits%max_bits) & (2**max_bits-1) | \
((val & (2**max_bits-1)) >> (max_bits-(r_bits%max_bits)))

ror = lambda val, r_bits, max_bits: \
((val & (2**max_bits-1)) >> r_bits%max_bits) | \
(val << (max_bits-(r_bits%max_bits)) & (2**max_bits-1))

def rev_3(buff):
for i in range(16):
buff[i] = (ror(buff[i], i%8, 8))

def rev_2(buff):
for i in range(8):
buff[i] = (buff[i] ^ buff[15-i]) & 0xFF

def rev_1(buff):
for i in range(0, 16, 4):
tmp = buff[i:i+4]
buff[i] = tmp[3]
buff[i+1] = tmp[0]
buff[i+2] = tmp[2]
buff[i+3] = tmp[1]

revs = [rev_1, rev_2, rev_3, rev_4, rev_5, rev_6, rev_7, rev_8, rev_9, rev_10, rev_11, rev_12, rev_13, rev_14]

target = binascii.unhexlify("2aad2e5a49fb2d9adb908dd00eb48c8a6607ab619f75b0272f3c1eb33fe9edaf")
target1 = target[:16]
target2 = target[16:]

def run_on(text, order):
buf = list(text)
for step in order:
revs[step](buf)
#print(hexdump(bytes(buf)))
return bytes(buf)

iter=0
possibles = list(range(14))
possibles.remove(11)
possibles.remove(6)
possibles.remove(9)
possibles.remove(13)

middle = b'\x9f[\xaaM\x89s\xb9\xc7\x97E;\xf6}X\xb7o'

for i in itertools.permutations(possibles, 7):
perm = i
iter +=1
if(iter % 10000 == 0):
print(iter)

result = run_on(middle, perm)
if b'ptm{' in result:
print(result)
```

```
$ python3 paramoly.py
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut3_f0rc3}'
b'ptm{brut3_f0rc3}'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut3_f0rc3}'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut3_f0rc3}'
b'ptm{brut3_f0rc3}'
b'ptm{brut3_f0rc3}'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut3_f0rc3}'
b'ptm{brut\xb8\xd5\xeb\xad\xf6\xf1\xb8\xf2'
b'ptm{brut3_f0rc3}'
150000
160000
170000
180000
190000
[...]
```

`ptm{brut3_f0rc3}` is the flag!

Original writeup (https://wrecktheline.com/writeups/m0lecon-2021/#parallel_the_m0le).