u.twoha.cc/ctf/csaw/rev_archeology.md

321 lines
11 KiB
Markdown
Raw Normal View History

2024-09-13 03:24:53 -04:00
---
title: 'CSAW CTF 2024 Quals: Reversing - Archeology'
date: 2024/09/12
tags: ['ctf', 'rev']
---
## Task
> While researching Egyptian artifacts, an Archaeology student discovers a series of incomprehensible hieroglyphs. Suspecting a custom cipher, she uncovers an ancient cipher machine and a matching hieroglyph dictionary. Can you help her decipher a small part of the secret message?
>
> [`chal`](https://ctf.csaw.io/files/024ec940cc04eee0a93d2b8af8e5271b/chal?token=eyJ1c2VyX2lkIjoyNjc1LCJ0ZWFtX2lkIjo5MDUsImZpbGVfaWQiOjI5fQ.Zt_SNg.1oxzWU8rIIRxpXdKwybpSukzv7E) [`hieroglyphs.txt`](https://ctf.csaw.io/files/12d35b03b6f134b15a58aae0c49a2c20/hieroglyphs.txt?token=eyJ1c2VyX2lkIjoyNjc1LCJ0ZWFtX2lkIjo5MDUsImZpbGVfaWQiOjMxfQ.Zt_SNg.0Owx0lX6ed13BojCxTwYvewPjws) [`message.txt`](https://ctf.csaw.io/files/2efde5dd515025b55561749742cc9a30/message.txt?token=eyJ1c2VyX2lkIjoyNjc1LCJ0ZWFtX2lkIjo5MDUsImZpbGVfaWQiOjMyfQ.Zt_SNg.1ZzbRrdYp8zAgAp9jBH3PIHaGDI)
- `Author: keeboi`
- `Points: 426`
- `Solves: 110 / 1181 (9.314%)`
## Writeup
We are provided with three files: a binary that encrypts our input into some hieroglyphs (`chal`), a text file used by the binary (`hieroglyphs.txt`), and an encrypted message (`message.txt`).
After running a decompiler on `chal` (`angr` is used here), we can see that `main` can be divided into a few sections.
First, a function called `washing_machine` is run on the first argument:
```c
int main(unsigned long a0, struct_1 *a1)
{
/* ... variable declarations ... */
v17 = &v15;
alloca(0x12000);
if ((unsigned int)a0 != 2)
{
printf("Usage: %s <data-to-encrypt>\n", (unsigned int)a1->field_0);
return 1;
}
v12 = 3721182122;
v13 = 238;
v6 = 5;
v9 = &a1->field_8->field_0;
v7 = strlen(v9);
v1 = 0;
printf("Encrypted data: ");
washing_machine(v9, v7);
```
Then, the program iterates over our input to write something to `v14` and calls `runnnn` and `washing_machine`:
```c
for (v2 = 0; v2 < v7; v2 += 1)
{
v18 = v1;
v1 = (unsigned int)v18 + 1;
*((char *)(&v14 + v18)) = 0;
v19 = v1;
v1 = (unsigned int)v19 + 1;
(&v14)[v19] = 1;
v20 = v1;
v1 = (unsigned int)v20 + 1;
(&v14)[v20] = v9[v2];
for (v3 = 0; v3 <= 9; v3 += 1)
{
/* more of the same stuff, omitted for brevity */
}
v35 = v1;
v1 = (unsigned int)v35 + 1;
(&v14)[v35] = 4;
v36 = v1;
v1 = (unsigned int)v36 + 1;
(&v14)[v36] = 1;
v37 = v1;
v1 = (unsigned int)v37 + 1;
(&v14)[v37] = v2;
}
v38 = v1;
v1 = (unsigned int)v38 + 1;
(&v14)[v38] = 7;
runnnn(&v14);
washing_machine(&memory, v7);
```
Finally, the program references `hieroglyphs.txt` and `memory` to print out the encrypted input to the screen:
```c
v10 = &fopen("hieroglyphs.txt", "r")->_flags;
if (!v10)
{
perror("Failed to open hieroglyphs.txt");
return 1;
}
for (v4 = 0; fgets(&(&v11)[0x100 * v4], 0x100, v10) && v4 <= 255; v4 += 1)
{
(&v11)[0x100 * v4 + strcspn(&(&v11)[0x100 * v4], "\n")] = 0;
}
fclose(v10);
for (v5 = 0; v5 < v7; v5 += 1)
{
v0 = *(&(&memory)[v5]);
printf("%s", (unsigned int)&(&v11)[0x100 * v0]);
}
putchar(10);
exit(0); /* do not return */
}
```
The decompiler output for `washing_machine` is fairly straightforward:
```c
void washing_machine(char *a0, unsigned long a1)
{
char v0; // [bp-0x1b]
char v1; // [bp-0x1a]
char v2; // [bp-0x19]
unsigned long v3; // [bp-0x18]
unsigned long v4; // [bp-0x10]
v0 = *(a0);
for (v3 = 1; v3 < a1; v3 += 1)
{
v2 = a0[v3] ^ v0;
a0[v3] = v2;
v0 = v2;
}
for (v4 = 0; v4 < a1 >> 1; v4 += 1)
{
v1 = a0[v4];
/* index seems to be wrong here, should be (-1 + a1 - v4) */
a0[v4] = a0[1 + a1 + -1 * v4];
a0[1 + a1 + -1 * v4] = v1;
}
return;
}
```
Translating this to Python, we get:
```py
def washing_machine(buf):
n = len(buf)
# xor bytes with the previous byte
for i in range(1, n):
buf[i] ^= buf[i - 1]
# reverse buf
for i in range(n // 2):
j = n - i - 1
buf[i], buf[j] = buf[j], buf[i]
def unwash(buf):
n = len(buf)
for i in range(n // 2):
j = n - i - 1
buf[i], buf[j] = buf[j], buf[i]
# repeating the xor undoes it
for i in range(n - 1, 0, -1):
buf[i] ^= buf[i - 1]
```
Now lets look at `runnnn`:
```c
void runnnn(unsigned long a0)
{
/* variable decs */
v5 = 0;
v6 = 1;
while (v6)
{
v8 = v5;
v5 = (unsigned int)v8 + 1;
v0 = v8[a0];
v9 = v0;
switch ((unsigned int)v9)
{
case 0:
v11 = v5;
v5 = (unsigned int)v11 + 1;
v1 = *((char *)(a0 + v11));
v12 = v5;
v5 = (unsigned int)v12 + 1;
*(&(&regs)[v1]) = *((char *)(v12 + a0));
break;
case 1:
v13 = v5;
v5 = (unsigned int)v13 + 1;
v1 = *((char *)(a0 + v13));
v14 = v5;
v5 = (unsigned int)v14 + 1;
v4 = *((char *)(a0 + v14));
*(&(&regs)[v1]) = *(&(&regs)[v1]) ^ *(&(&regs)[v4]);
break;
case 2:
v15 = v5;
v5 = (unsigned int)v15 + 1;
v1 = *((char *)(a0 + v15));
v16 = v5;
v5 = (unsigned int)v16 + 1;
v2 = *((char *)(a0 + v16));
*(&(&regs)[v1]) = *(&(&regs)[v1]) << (v2 & 31) | (unsigned int)(*(&(&regs)[v1]) >> ((char)(8 - v2) & 31));
break;
case 3:
v17 = v5;
v5 = (unsigned int)v17 + 1;
v1 = *((char *)(a0 + v17));
*(&(&regs)[v1]) = *(&(&sbox)[*(&(&regs)[v1])]);
break;
case 4:
v18 = v5;
v5 = (unsigned int)v18 + 1;
v1 = *((char *)(a0 + v18));
v19 = v5;
v5 = (unsigned int)v19 + 1;
v3 = *((char *)(a0 + v19));
*(&(&memory)[v3]) = *(&(&regs)[v1]);
break;
case 5:
v20 = v5;
v5 = (unsigned int)v20 + 1;
v1 = *((char *)(a0 + v20));
v21 = v5;
v5 = (unsigned int)v21 + 1;
v3 = *((char *)(a0 + v21));
*(&(&regs)[v1]) = *(&(&memory)[v3]);
break;
case 6:
v22 = v5;
v5 = (unsigned int)v22 + 1;
v1 = *((char *)(a0 + v22));
putchar(*(&(&regs)[v1]));
break;
case 7:
v6 = 0;
break;
case 8:
v23 = v5;
v5 = (unsigned int)v23 + 1;
v1 = *((char *)(a0 + v23));
v24 = v5;
v5 = (unsigned int)v24 + 1;
v2 = *((char *)(a0 + v24));
*(&(&regs)[v1]) = (unsigned int)(*(&(&regs)[v1]) >> (v2 & 31)) | *(&(&regs)[v1]) << ((char)(8 - v2) & 31);
break;
default:
puts("Invalid instruction");
v6 = 0;
break;
}
}
return;
}
```
It seems that this function implements some kind of virtual machine (we can see `regs`, `memory`, and `"Invalid instruction"`), with the argument being a buffer containing the bytecode. Thus, we can assume that the loop over our input in `main` generates the instructions to be executed.
While we could go through the tedious process of determining the effect of each instruction generated and writing a function to undo these operations, it's possible that the entire process can be reduced to a mapping from input bytes to output bytes. To check this, we can write the following script:
```py
from pwn import gdb, context
# suppress pwntools output
context.log_level = 'CRITICAL'
prog = './chal'
# modified to return a new array instead of modifying in-place
def unwash(buf):
n = len(buf)
b = buf[::-1]
for i in range(n - 1, 0, -1):
b[i] ^= b[i - 1]
return b
# run the program and dump the memory buffer
def dump(n, out):
# get the input that will be [n..n+64] after washing_machine
arg = bytes(unwash(list(range(n, n + 64))))
try:
# run the program, break after the call to runnnn, then dump memory
p = gdb.debug([prog, arg], gdbscript=f'''
b *main+940
c
dump memory {out} &memory ((char*)&memory)+64
exit
''')
p.wait()
except EOFError:
pass
# the program segfaults if our input is too big, so we split the dump into 4 runs
for i in range(4):
dump(64 * i, f'/tmp/dump{i}.bin')
# get the mapping
mapping = []
for i in range(4):
with open(f'/tmp/dump{i}.bin', 'rb') as f:
mapping.extend(f.read())
# assert that we got a bijection
assert len(mapping) == 256 == len(set(mapping))
print(mapping)
# [14, 106, 19, 58, 0, 209, 250, 226, 228, 176, 142, 35, 100, 158, 93, 194, 224, 51, 112, 189, 39, 46, 6, 136, 132, 197, 78, 215, 192, 146, 30, 152, 185, 151, 67, 71, 96, 234, 252, 214, 143, 178, 175, 11, 188, 122, 230, 7, 104, 243, 22, 182, 187, 3, 193, 135, 212, 221, 54, 216, 222, 62, 26, 149, 5, 229, 75, 12, 248, 155, 17, 219, 47, 235, 110, 61, 164, 177, 118, 191, 84, 166, 114, 60, 150, 167, 91, 134, 37, 203, 156, 8, 43, 148, 179, 133, 169, 218, 144, 138, 55, 87, 4, 171, 139, 128, 119, 145, 206, 130, 85, 254, 236, 217, 196, 48, 80, 36, 50, 198, 65, 99, 69, 123, 227, 244, 29, 174, 40, 20, 113, 111, 159, 109, 180, 79, 86, 81, 208, 165, 246, 88, 53, 15, 245, 131, 207, 21, 66, 213, 33, 204, 52, 240, 163, 233, 251, 121, 34, 72, 31, 49, 108, 57, 25, 223, 160, 120, 23, 105, 201, 41, 172, 195, 92, 28, 70, 45, 68, 129, 102, 173, 82, 168, 210, 242, 74, 77, 232, 205, 73, 2, 24, 32, 44, 237, 42, 116, 140, 181, 16, 199, 13, 225, 137, 9, 76, 56, 117, 97, 190, 124, 18, 83, 184, 89, 115, 161, 38, 98, 90, 101, 147, 186, 231, 157, 253, 183, 200, 107, 125, 239, 127, 63, 211, 154, 247, 141, 94, 126, 103, 170, 162, 95, 1, 241, 153, 202, 27, 249, 64, 59, 10, 238, 220, 255]
```
It appears our assumption is correct since we successfully get a bijection from input bytes to output bytes.
The final part of `main` seems to just read each line of `hieroglyphs.txt`, iterate over each byte of `memory`, and print the corresponding line. To reverse this, we can add this to our Python script:
```python
with open('hieroglyphs.txt', 'r') as f:
h_mapping = f.read().split('\n')
with open('message.txt', 'r') as f:
msg = f.read()
memory = [h_mapping.index(c) for c in msg]
unwashed_memory = unwash(memory)
washed_input = [mapping.index(b) for b in unwashed_memory]
unwashed_input = unwash(washed_input)
print(bytes(unwashed_input).decode())
```
Running our script now gives us the flag: `csawctf{w41t_1_54w_7h353_5ymb0l5_47_7h3_m3t_71m3_70_r34d_b00k_0f_7h3_d34d}`.