Tags: crc128 crypto crc 

Rating:

Author's solution:

```C++
// Solver for CRC Recursive Challenge from Dragon CTF 2021
// Author: Michał "Redford" Kowalczyk
//
// Could probably be much simpler, but this code was evolving for too long
// without a rewrite ¯\_(ツ)_/¯

#include <iostream>
#include <cstdio>
#include <cstddef>
#include <cstring>
#include <cassert>
#include <algorithm>
#include <thread>

using std::swap;
using std::pair;
using std::make_pair;

// using representation without reversing bits: LSB is on the right (x & 1 == LSB)
using poly_t = unsigned __int128;
//using poly_t = uint64_t;

//constexpr poly_t MOD = 0xad93d23594c935a9; // <- picked for 64-bit chall
//constexpr poly_t BITS = 64;
//constexpr poly_t MOD = 0x4caa61b0d7fe5fa54189d46709eaba2d; // <- picked for 128-bit chall, C++ sucks
constexpr poly_t MOD = ((poly_t)0x4caa61b0d7fe5fa5 << 64) + 0x4189d46709eaba2d;
constexpr poly_t BITS = 128;
//const char prefix[] = "My crc64 is 0x";
const char prefix[] = "My crc128 is 0x";
const char suffix[] = "! Cool, isn\'t it?";
constexpr size_t prefix_len = sizeof(prefix) - 1;
constexpr size_t suffix_len = sizeof(suffix) - 1;

constexpr poly_t MASK = ((poly_t)1 << (BITS - 1) << 1) - 1; // can't do '(1 << BITS) - 1' because of the bitshift UB
constexpr size_t SETS_CNT = 2;
poly_t set_fixup_left[SETS_CNT] = {
0x30, // 0 -> [0, 0, 1, 1, 0, 0, 0, 0]
0x60, // 1 -> [0, 1, 1, 0, 0, 0, 0, 0]
};
poly_t set_fixup_right[SETS_CNT] = {
0x0, // 0 -> [0, 0, 0, 0]
0x9, // 1 -> [1, 0, 0, 1]
};
poly_t set_var_coeff_left[SETS_CNT][4] = { // [set][var%4] -> bits if 1
{0x08, // [0, 0, 0, 0, 1, 0, 0, 0]
0x04, // [0, 0, 0, 0, 0, 1, 0, 0]
0x02, // [0, 0, 0, 0, 0, 0, 1, 0]
0x01}, // [0, 0, 0, 0, 0, 0, 0, 1]
{0x00, // [0, 0, 0, 0, 0, 0, 0, 0]
0x04, // [0, 0, 0, 0, 0, 1, 0, 0]
0x02, // [0, 0, 0, 0, 0, 0, 1, 0]
0x01} // [0, 0, 0, 0, 0, 0, 0, 1]
};

poly_t set_var_coeff_right[SETS_CNT][4] = { // [set][var%4] -> bits if 1
{0x08, // [1, 0, 0, 0]
0x04, // [0, 1, 0, 0]
0x02, // [0, 0, 1, 0]
0x01}, // [0, 0, 0, 1]
{0x04, // [0, 1, 0, 0]
0x00, // [0, 0, 0, 0]
0x02, // [0, 0, 1, 0]
0x03} // [0, 0, 1, 1]
};

poly_t crc(const uint8_t* buf, size_t buf_size, poly_t crc = MASK) {
for (size_t i = 0; i < buf_size; i++) {
crc ^= (poly_t)buf[i] << (BITS - 8);
for (int j = 0; j < 8; j++) {
if ((crc & ((poly_t)1 << (BITS - 1))) == 0)
crc = crc << 1;
else
crc = (crc << 1) ^ MOD;
}
}
return crc & MASK;
}

bool verify(poly_t checksum) {
// slow and ugly, but this was used only for testing and debugging
char input[128] = { 0 };
strcat(input, prefix);
char fmt[16];
sprintf(fmt, "%%0%ullx", (int)BITS / 4);
sprintf(input + prefix_len, fmt, checksum);
strcpy(input + prefix_len + BITS / 4, suffix);
return crc((const uint8_t*)input, strlen(input)) == checksum;
}

poly_t poly_bytes_mod(uint8_t* buf, size_t buf_size, poly_t mod) {
poly_t crc = 0;
assert(buf_size > BITS / 8);
size_t i;
for (i = 0; i < buf_size - BITS / 8; i++) {
crc ^= (poly_t)buf[i] << (BITS - 8);
for (int j = 0; j < 8; j++) {
if ((crc & ((poly_t)1 << (BITS - 1))) == 0)
crc = crc << 1;
else
crc = (crc << 1) ^ MOD;
}
}
for (size_t j = 0; j < BITS / 8; j++) {
crc ^= (poly_t)buf[i + j] << (BITS - 8 * (j + 1));
}
return crc & MASK;
}

void poly_bytes_add_bit(uint8_t* buf, size_t buf_size, size_t pos) {
buf[buf_size - 1 - pos / 8] ^= (1 << (pos % 8));
}

// Be careful: modifies `vars`!
//__declspec(noinline)
// TODO: hardcode vars count? - lol, makes the code slower...
void poly_gauss(pair<poly_t, poly_t>* vars /*<weight, var_bit>*/,
size_t vars_cnt, poly_t expected_sum, size_t letters) {
size_t fixed = 0;
for (size_t col = 0; col < BITS; col++) {
bool found = false;
for (size_t row = fixed; row < vars_cnt; row++) {
if (vars[row].first & ((poly_t)1 << col)) {
swap(vars[fixed], vars[row]);
found = true;
break;
}
}
if (!found)
continue; // can't fully control this bit
pair<poly_t, poly_t> cur = vars[fixed];
for (size_t row = fixed + 1; row < vars_cnt; row++) {
if (vars[row].first & ((poly_t)1 << col)) {
vars[row].first ^= cur.first;
vars[row].second ^= cur.second;
}
}
fixed++;
}

poly_t res_sum = 0;
poly_t res_vars = 0;
size_t row = 0;
for (size_t col = 0; col < BITS; col++) {
if ((expected_sum ^ res_sum) & ((poly_t)1 << col)) {
if (vars[row].first & ((poly_t)1 << col)) {
res_sum ^= vars[row].first;
res_vars ^= vars[row].second;
} else {
return;
}
}
if (vars[row].first & ((poly_t)1 << col)) {
row += 1;
if (row >= vars_cnt)
break;
}
}
assert(res_sum == expected_sum); // otherwise we'd have returned in the loop above

assert(fixed > 0);
poly_t limit;
if (vars_cnt - fixed == 0)
limit = 1;
else
limit = ((poly_t)1 << (vars_cnt - fixed - 1)) << 1; // may overflow to 0, but without UB
for (poly_t i = 0;;) {
poly_t solution = res_vars;
for (poly_t bit = 0; bit < vars_cnt - fixed; bit++)
if (i & ((poly_t)1 << bit))
solution ^= vars[vars_cnt - 1 - bit].second;
// verify poly solution (it may not match the bruteforced ranges
bool ok = true;
for (poly_t nibble_pos = 0; nibble_pos < BITS / 4; nibble_pos++) {
int sol_vars = (solution >> (BITS - 4 - 4 * nibble_pos)) & 0xF;
if (letters & ((size_t)1 << nibble_pos)) {
// a-f
// the other bit endianness
// if (sol_vars != 0b1000 &&
// sol_vars != 0b0100 &&
// sol_vars != 0b1101 &&
// sol_vars != 0b0011 &&
// sol_vars != 0b1011 &&
// sol_vars != 0b0111) {
// ok = false;
// break;
// }
if (sol_vars != 0b0001 &&
sol_vars != 0b0010 &&
sol_vars != 0b1011 &&
sol_vars != 0b1100 &&
sol_vars != 0b1101 &&
sol_vars != 0b1110) {
ok = false;
break;
}

} else {
// 0-9
if (sol_vars >= 10) {
ok = false;
break;
}
// the other bit endianness
// if (sol_vars != 0b0000 &&
// sol_vars != 0b1000 &&
// sol_vars != 0b0100 &&
// sol_vars != 0b1100 &&
// sol_vars != 0b0010 &&
// sol_vars != 0b1010 &&
// sol_vars != 0b0110 &&
// sol_vars != 0b1110 &&
// sol_vars != 0b0001 &&
// sol_vars != 0b1001) {
// ok = false;
// break;
// }
}
}
if (ok) {
poly_t crc = 0;
for (poly_t nibble_pos = 0; nibble_pos < BITS / 4; nibble_pos++) {
size_t set_ind = !!(letters & ((size_t)1 << nibble_pos));
poly_t nibble = set_fixup_right[set_ind];
for (int j = 0; j < 4; j++)
if (solution & ((poly_t)1 << (BITS - 1 - nibble_pos * 4 - j)))
nibble ^= set_var_coeff_right[set_ind][j];
crc ^= nibble << (BITS - 4 * (nibble_pos + 1));
}
if (BITS <= 64) // otherwise ull is too small
assert(verify(crc));
if (BITS <= 64) {
char fmt[32];
sprintf(fmt, "Found: 0x%%0%ullx\n", (int)BITS / 4);
printf(fmt, crc);
} else {
char fmt[32];
sprintf(fmt, "Found: 0x%%0%ullx%%0%ullx\n", (int)BITS / 4 / 2, (int)BITS / 4 / 2);
printf(fmt, (uint64_t)(crc >> 64), uint64_t(crc));
}
}

if (i == limit) // doing it this way to prevent UB on 1 << 64
break;
i++;
}
}

poly_t set_weight[BITS / 4][SETS_CNT] = { 0 }; // [nibble_pos][let] -> coeff for C
poly_t var_weight[BITS][SETS_CNT] = { 0 }; // [var][set] -> variable weight
poly_t C_base;

void solver(size_t start, size_t end) { // exclusive
for (size_t letters = start; letters < end; letters++) { // this bitshift won't overflow for the bitsizes we use
if (((letters - start) & 0x1FFFFF) == 0)
//printf("%zu/%zu...\n", letters, end - 1);
printf("%d%%...\n", int((letters - start) / (double)(end-start-1) * 100));

poly_t C = C_base;
for (int i = 0; i < BITS / 4; i++)
C ^= set_weight[i][!!(letters & ((size_t)1 << i))];

size_t vars_cnt = 0;
pair<poly_t, poly_t> vars[BITS]; // <weight, var_bit>
for (int var = 0; var < BITS; var++) {
int set = !!(letters & ((size_t)1 << var / 4));
vars[vars_cnt] = make_pair(var_weight[var][set], (poly_t)1 << (BITS - 1 - var));
vars_cnt++;
}

poly_gauss(vars, vars_cnt, C, letters);
}
}

int main(int argc, char* argv[]) {
// preprocess base C value
uint8_t tmp[prefix_len + BITS / 4 + suffix_len + BITS / 8] = { 0 };
memcpy(tmp, prefix, prefix_len);
memcpy(tmp + prefix_len + BITS / 4, suffix, suffix_len);
for (size_t i = 0; i < BITS / 8; i++)
tmp[i] ^= 0xFF;
C_base = poly_bytes_mod(tmp, sizeof(tmp), MOD);

// preprocess set_weight
for (size_t nibble_pos = 0; nibble_pos < BITS / 4; nibble_pos++)
for (int set = 0; set < SETS_CNT; set++) {
uint8_t tmp[BITS / 4 + suffix_len + BITS / 8] = { 0 };
tmp[nibble_pos] ^= set_fixup_left[set];
if (nibble_pos % 2 == 0)
tmp[BITS / 4 + suffix_len + nibble_pos / 2] ^= (set_fixup_right[set] << 4);
else
tmp[BITS / 4 + suffix_len + nibble_pos / 2] ^= set_fixup_right[set];
set_weight[nibble_pos][set] = poly_bytes_mod(tmp, sizeof(tmp), MOD);
}

// preprocess var_weight
for (size_t var = 0; var < BITS; var++)
for (int set = 0; set < SETS_CNT; set++) {
uint8_t tmp[BITS / 4 + suffix_len + BITS / 8] = { 0 };
tmp[var / 4] ^= set_var_coeff_left[set][var % 4];
if ((var / 4) % 2 == 0)
tmp[BITS / 4 + suffix_len + var / 4 / 2] ^= (set_var_coeff_right[set][var % 4] << 4);
else
tmp[BITS / 4 + suffix_len + var / 4 / 2] ^= set_var_coeff_right[set][var % 4];
var_weight[var][set] = poly_bytes_mod(tmp, sizeof(tmp), MOD);
}

size_t threads_cnt;
if (argc <= 1) {
threads_cnt = 1;
} else {
threads_cnt = atoi(argv[1]);
}
printf("Running main loop with %zu threads...\n", threads_cnt);
std::vector<std::thread> threads;
size_t end = (size_t)1 << BITS / 4;
size_t step = (end + threads_cnt - 1) / threads_cnt;
for (size_t i = 0; i < threads_cnt; i++)
threads.emplace_back(solver, step * i, std::min(step * (i + 1), end));
for (auto& thread : threads)
thread.join();

puts("Done!");
}
```

Original writeup (https://gist.github.com/mkow/b43883ef4c3e25d6001e8c5f4f63430d).