Add binary diffing utilities found in several of my repositories.

This commit is contained in:
Jennifer Taylor 2021-06-27 16:37:32 +00:00
parent ffee6fcd1c
commit f43c46ef2f
3 changed files with 426 additions and 0 deletions

View File

@ -3,3 +3,52 @@
Collection of utilities written in Python for working with various arcade binaries.
This is mostly suited towards the separated formats found in MAME archival releases
but also works on a variety of binaries from basically anywhere.
## bindiff
Create a binary diff from two same-length binaries, or apply a previously created
diff to a binary to patch that binary. Run it like `./bindiff diff --help` to see
options for diffing, and `./bindiff patch --help` to see options for patching.
The patch format is simple. The number on the left of the colon is the hex offset where
the difference was found, and the numbers on the right are the hex values to find
and replace. A wildcard (`*`) can be substituted for a hex pair for any byte in
the before section if you do not care what the value is, but be aware that this will
make the patch non-reversible. Arbitrary comments are supported anywhere in the diff.
Start a line with the `#` character to create a comment. Special values are recognized
in comments. If you create a comment starting with `# File size:` then the base
file will be compared against the decimal number placed after the colon and any file
not matching that length will be rejected.
Some examples are as follows:
A simple patch changing a byte in a file at offset `0x256` from `0xAA` to `0xDD`:
```
256: AA -> DD
```
That same patch, but only for files that are exactly 1024 bytes long:
```
# File size: 1024
256: AA -> DD
```
A patch that does not care about one of the bytes it is patching. The byte at `0x513`
can be any value and the patch will still be applied, and altogether 4 bytes starting
at `0x512` will be changed to the hex values `0x00 0x11 0x22 0x33`:
```
512: AA * CC DD -> 00 11 22 33
```
A patch with multiple offsets, and helpful author descriptions for each section:
```
# This part of the patch fixes a sprite offset issue.
128: AA -> BB
# This part of the patch fixes sound playback issues.
256: 33 -> 44
```

263
arcadeutils/binary.py Normal file
View File

@ -0,0 +1,263 @@
from typing import List, Optional, Tuple, cast
from typing_extensions import Final
class BinaryDiffException(Exception):
    """Error raised when a diff cannot be created or a patch cannot be applied."""
class BinaryDiff:
    """Create and apply line-based textual patches for same-length binaries.

    Each patch line has the form ``OFFSET: BEFORE -> AFTER`` where ``OFFSET``
    is a hex offset and ``BEFORE``/``AFTER`` are equal-length, space-separated
    runs of hex bytes. A ``*`` in ``BEFORE`` is a wildcard matching any byte.
    Lines starting with ``#`` are comments; ``# File size: N`` and
    ``# Description: text`` comments carry metadata read by :meth:`size` and
    :meth:`description`.
    """

    # Number of bytes compared at once when scanning for differences.
    CHUNK_SIZE: Final[int] = 1024

    @staticmethod
    def _hex(val: int) -> str:
        """Return ``val`` as upper-case hex, zero-padded to at least two digits."""
        return f"{val:02X}"

    @staticmethod
    def diff(bin1: bytes, bin2: bytes) -> List[str]:
        """Return patch lines that transform ``bin1`` into ``bin2``.

        The result is empty when the inputs are identical. Otherwise it starts
        with a ``# File size:`` comment followed by one line per run of
        consecutive differing bytes.

        Raises:
            BinaryDiffException: if the two inputs differ in length.
        """
        binlength = len(bin1)
        if binlength != len(bin2):
            raise BinaryDiffException("Cannot diff different-sized binary blobs!")

        # First, get the list of differences as (offset, old byte, new byte).
        differences: List[Tuple[int, bytes, bytes]] = []

        # Compare whole chunks first, assuming files are usually about the
        # same, and only walk byte-by-byte inside chunks that differ.
        for offset in range(0, binlength, BinaryDiff.CHUNK_SIZE):
            # Clamp to the end of the data so a short final chunk is handled.
            end = min(offset + BinaryDiff.CHUNK_SIZE, binlength)
            if bin1[offset:end] != bin2[offset:end]:
                for i in range(offset, end):
                    byte1 = bin1[i]
                    byte2 = bin2[i]
                    if byte1 != byte2:
                        differences.append((i, bytes([byte1]), bytes([byte2])))

        # Don't bother with any combination work if we have nothing to do.
        if not differences:
            return []

        # Current run being accumulated; element 0 is the offset of the LAST
        # byte in the run, elements 1 and 2 are the old and new byte strings.
        cur_block: Tuple[int, bytes, bytes] = differences[0]
        ret: List[str] = []

        # Include the original byte size for later comparison/checks.
        ret.append(f"# File size: {len(bin1)}")

        def _hexrun(val: bytes) -> str:
            return " ".join(BinaryDiff._hex(v) for v in val)

        def _output(val: Tuple[int, bytes, bytes]) -> None:
            # Convert the stored last-byte offset back to the run's start.
            start = val[0] - len(val[1]) + 1
            ret.append(
                f"{BinaryDiff._hex(start)}: {_hexrun(val[1])} -> {_hexrun(val[2])}"
            )

        def _combine(val: Tuple[int, bytes, bytes]) -> None:
            nonlocal cur_block

            if cur_block[0] + 1 == val[0]:
                # This is a continuation of a run.
                cur_block = (
                    val[0],
                    cur_block[1] + val[1],
                    cur_block[2] + val[2],
                )
            else:
                # This is a new run.
                _output(cur_block)
                cur_block = val

        # Combine and output runs of differences.
        for difference in differences[1:]:
            _combine(difference)

        # Make sure we output the last difference.
        _output(cur_block)

        return ret

    @staticmethod
    def size(patchlines: List[str]) -> Optional[int]:
        """Return the size from the first ``# File size:`` comment, or None.

        The number is parsed with ``int()``, so a malformed value raises
        ``ValueError``.
        """
        for patch in patchlines:
            if patch.startswith('#'):
                # This is a comment, ignore it, unless it's a file-size comment.
                patch = patch[1:].strip().lower()
                if patch.startswith('file size:'):
                    return int(patch[10:].strip())
        return None

    @staticmethod
    def _convert(val: str) -> Optional[int]:
        """Parse one hex byte token; the wildcard ``*`` becomes None."""
        val = val.strip()
        if val == '*':
            return None
        return int(val, 16)

    @staticmethod
    def _gather_differences(patchlines: List[str], reverse: bool) -> List[Tuple[int, Optional[bytes], bytes]]:
        """Parse patch lines into per-byte ``(offset, old, new)`` tuples.

        ``old`` is None where the patch uses a wildcard. When ``reverse`` is
        True the old and new bytes are swapped so the patch can be undone.

        Raises:
            BinaryDiffException: if the before/after runs differ in length,
                are empty, the after side contains a wildcard, or a wildcard
                appears in a patch that is being reversed.
            ValueError: if a non-comment line lacks the ``:`` / ``->``
                separators or contains an invalid hex token.
        """
        # First, separate out into a list of offsets and old/new bytes.
        differences: List[Tuple[int, Optional[bytes], bytes]] = []

        for patch in patchlines:
            if patch.startswith('#'):
                # This is a comment, ignore it.
                continue

            start_offset, patch_contents = patch.split(':', 1)
            before, after = patch_contents.split('->')
            beforevals = [
                BinaryDiff._convert(x) for x in before.split(" ") if x.strip()
            ]
            aftervals = [
                BinaryDiff._convert(x) for x in after.split(" ") if x.strip()
            ]

            if len(beforevals) != len(aftervals):
                raise BinaryDiffException(
                    f"Patch before and after length mismatch at "
                    f"offset {start_offset}!"
                )
            if len(beforevals) == 0:
                raise BinaryDiffException(
                    f"Must have at least one byte to change at "
                    f"offset {start_offset}!"
                )

            offset = int(start_offset.strip(), 16)

            for i, (before_val, after_val) in enumerate(zip(beforevals, aftervals)):
                if after_val is None:
                    raise BinaryDiffException(
                        f"Cannot convert a location to a wildcard "
                        f"at offset {start_offset}"
                    )
                if before_val is None and reverse:
                    raise BinaryDiffException(
                        f"Patch offset {start_offset} specifies a wildcard and cannot "
                        f"be reversed!"
                    )
                differences.append(
                    (
                        offset + i,
                        bytes([before_val]) if before_val is not None else None,
                        bytes([after_val]),
                    )
                )

        # Now, if we're doing the reverse, just switch them.
        if reverse:
            # We cast here because mypy can't see that we have already asserted that x[1] will never
            # be None in the above loop if reverse is set to True.
            differences = [cast(Tuple[int, Optional[bytes], bytes], (x[0], x[2], x[1])) for x in differences]

        return differences

    @staticmethod
    def patch(
        binary: bytes,
        patchlines: List[str],
        *,
        reverse: bool = False,
    ) -> bytes:
        """Apply ``patchlines`` to ``binary`` and return the patched bytes.

        The result always has the same length as ``binary``. With
        ``reverse=True`` the patch is undone instead of applied.

        Raises:
            BinaryDiffException: if the patch declares a different file size,
                targets an offset outside the binary, or expects a byte that
                does not match the binary's current contents.
        """
        # First, check the declared size and grab the differences.
        file_size = BinaryDiff.size(patchlines)
        if file_size is not None and file_size != len(binary):
            raise BinaryDiffException(
                f"Patch is for binary of size {file_size} but binary is {len(binary)} "
                f"bytes long!"
            )
        differences: List[Tuple[int, Optional[bytes], bytes]] = sorted(
            BinaryDiff._gather_differences(patchlines, reverse),
            key=lambda difference: difference[0],
        )

        # Patch a mutable copy in place so the output length can never drift,
        # even if two patch lines touch the same offset.
        patched = bytearray(binary)

        for offset, old, new in differences:
            # An offset equal to the length is already one past the last byte.
            if len(binary) <= offset:
                raise BinaryDiffException(
                    f"Patch offset {BinaryDiff._hex(offset)} is beyond the end of "
                    f"the binary!"
                )
            # Expected bytes are always checked against the unpatched input.
            if old is not None and binary[offset:(offset + 1)] != old:
                raise BinaryDiffException(
                    f"Patch offset {BinaryDiff._hex(offset)} expecting {BinaryDiff._hex(old[0])} "
                    f"but found {BinaryDiff._hex(binary[offset])}!"
                )
            patched[offset] = new[0]

        return bytes(patched)

    @staticmethod
    def can_patch(
        binary: bytes,
        patchlines: List[str],
        *,
        reverse: bool = False,
        ignore_size_differences: bool = False,
    ) -> Tuple[bool, str]:
        """Check whether ``patchlines`` would apply cleanly to ``binary``.

        Returns ``(True, "")`` when the patch applies, otherwise
        ``(False, reason)``. The ``# File size:`` check is skipped when
        ``ignore_size_differences`` is True. Malformed patch lines still
        raise from the parser rather than being reported in the tuple.
        """
        if not ignore_size_differences:
            file_size = BinaryDiff.size(patchlines)
            if file_size is not None and file_size != len(binary):
                return (
                    False,
                    f"Patch is for binary of size {file_size} but binary is {len(binary)} "
                    f"bytes long!"
                )
        differences: List[Tuple[int, Optional[bytes], bytes]] = BinaryDiff._gather_differences(patchlines, reverse)

        # Now, verify the changes against the binary data.
        for offset, old, _ in differences:
            # An offset equal to the length is already one past the last byte.
            if len(binary) <= offset:
                return (
                    False,
                    f"Patch offset {BinaryDiff._hex(offset)} is beyond the end of "
                    f"the binary!"
                )
            if old is not None and binary[offset:(offset + 1)] != old:
                return (
                    False,
                    f"Patch offset {BinaryDiff._hex(offset)} expecting {BinaryDiff._hex(old[0])} "
                    f"but found {BinaryDiff._hex(binary[offset])}!"
                )

        # Didn't find any problems.
        return (True, "")

    @staticmethod
    def description(patchlines: List[str]) -> Optional[str]:
        """Return the text of the first ``# Description:`` comment, or None.

        The prefix is matched case-insensitively; the description text itself
        is returned with its original casing.
        """
        for patch in patchlines:
            if patch.startswith('#'):
                # This is a comment, ignore it, unless it's a description comment.
                patch = patch[1:].strip()
                if patch.lower().startswith('description:'):
                    return patch[12:].strip()
        return None

    @staticmethod
    def needed_amount(patchlines: List[str]) -> int:
        """Return the minimum binary length the patch touches (max offset + 1).

        Returns 0 when the patch contains no byte changes.
        """
        differences: List[Tuple[int, Optional[bytes], bytes]] = BinaryDiff._gather_differences(patchlines, False)

        # The highest patched offset plus one is the number of bytes we need.
        return max(offset for offset, _, _ in differences) + 1 if differences else 0

114
bindiff Executable file
View File

@ -0,0 +1,114 @@
#! /usr/bin/env python3
import argparse
import os
import sys
from arcadeutils.binary import BinaryDiff
def main() -> int:
    """Run the bindiff command-line tool.

    ``diff FILE1 FILE2`` prints (or writes to ``--patch-file``) the patch
    lines that turn FILE1 into FILE2. ``patch BIN OUT`` reads patch lines
    from stdin (or ``--patch-file``), applies them to BIN, optionally in
    ``--reverse``, and writes the result to OUT.

    Returns:
        The process exit status: 0 on success, 1 if diffing/patching failed
        or no valid command was given.
    """
    # Create the argument parser
    parser = argparse.ArgumentParser(
        description="Utilities for diffing or patching binary files.",
    )
    subparsers = parser.add_subparsers(help='commands', dest='command')

    # Parser for diffing two binary files
    diff_parser = subparsers.add_parser('diff', help='create a diff of two same-length binary files')
    diff_parser.add_argument(
        'file1',
        metavar='FILE1',
        type=str,
        help='the base file that we will output diffs relative to',
    )
    diff_parser.add_argument(
        'file2',
        metavar='FILE2',
        type=str,
        help='the file that we will compare against the base file to find diffs',
    )
    diff_parser.add_argument(
        '--patch-file',
        metavar='FILE',
        type=str,
        help='write patches to a file instead of stdout',
    )

    # Parser for patching a binary file
    patch_parser = subparsers.add_parser('patch', help='patch a binary file using a previously created diff')
    patch_parser.add_argument(
        'bin',
        metavar='BIN',
        type=str,
        help='the binary file we should patch',
    )
    patch_parser.add_argument(
        'out',
        metavar='OUT',
        type=str,
        help='the file we should write the patched binary to',
    )
    patch_parser.add_argument(
        '--patch-file',
        metavar='FILE',
        type=str,
        help='read patches from a file instead of stdin',
    )
    patch_parser.add_argument(
        '--reverse',
        action="store_true",
        help='perform the patch in reverse (undo the patch)',
    )

    # Grab what we're doing
    args = parser.parse_args()

    if args.command == 'diff':
        with open(args.file1, "rb") as fp:
            file1 = fp.read()
        with open(args.file2, "rb") as fp:
            file2 = fp.read()

        # Top-level boundary: report any failure to the user instead of
        # dumping a traceback.
        try:
            differences = BinaryDiff.diff(file1, file2)
        except Exception as e:
            print(f"Could not diff {args.file1} against {args.file2}: {str(e)}", file=sys.stderr)
            return 1

        if not args.patch_file:
            for line in differences:
                print(line)
        else:
            with open(args.patch_file, "w") as fp:
                # Text-mode files translate "\n" to the platform line ending
                # themselves, so os.linesep must not be used here.
                fp.write("\n".join(differences))
    elif args.command == 'patch':
        with open(args.bin, "rb") as fp:
            old = fp.read()

        if not args.patch_file:
            differences = sys.stdin.readlines()
        else:
            with open(args.patch_file, "r") as fp:
                differences = fp.readlines()
        # Drop blank lines and surrounding whitespace before parsing.
        differences = [d.strip() for d in differences if d.strip()]

        try:
            new = BinaryDiff.patch(old, differences, reverse=args.reverse)
        except Exception as e:
            print(f"Could not patch {args.bin}: {str(e)}", file=sys.stderr)
            return 1

        with open(args.out, "wb") as fp:
            fp.write(new)

        print(f"Patched {args.bin} and wrote to {args.out}.")
    else:
        # The trailing "\n" leaves a blank line before the help text; stderr
        # is a text stream, so it handles platform line endings itself.
        print("Please specify a valid command!\n", file=sys.stderr)
        parser.print_help()
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())