From f43c46ef2fa3e4175f85b3bfd7391324b0f9db6f Mon Sep 17 00:00:00 2001 From: Jennifer Taylor Date: Sun, 27 Jun 2021 16:37:32 +0000 Subject: [PATCH] Add binary diffing utilities found in several of my repositories. --- README.md | 49 ++++++++ arcadeutils/binary.py | 263 ++++++++++++++++++++++++++++++++++++++++++ bindiff | 114 ++++++++++++++++++ 3 files changed, 426 insertions(+) create mode 100644 arcadeutils/binary.py create mode 100755 bindiff diff --git a/README.md b/README.md index 19b6b99..63ad93c 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,52 @@ Collection of utilities written in Python for working with various arcade binaries. This is mostly suited towards the separated formats found in MAME archival releases but also work on a variety of binaries from basically anywhere. + +## bindiff + +Create a binary diff from two same-length binaries, or apply a previously created +diff to a binary to patch that binary. Run it like `./bindiff diff --help` to see +options for diffing, and `./bindiff patch --help` to see options for patching. + +The patch format is simple. The number on the left of the colon is the hex offset where +the difference was found, and the numbers on the right are the hex values to find +and replace. A wildcard (`*`) can be substituted for a hex pair for any byte in +the before section if you do not care what the value is, but be aware that this will +make the patch non-reversible. Arbitrary comments are supported anywhere in the diff. +Start a line with the `#` character to create a comment. Special values are recognized +in comments. If you create a comment starting with `# File size:` then the base +file will be compared against the decimal number placed after the colon and any file +not matching that length will be rejected. 
from typing import Iterable, List, Optional, Tuple, cast

try:
    from typing import Final
except ImportError:  # Python < 3.8 has no typing.Final
    from typing_extensions import Final


class BinaryDiffException(Exception):
    """Raised when two binaries cannot be diffed or a patch cannot be applied."""


class BinaryDiff:
    """Create and apply textual byte-level patches for same-length binaries.

    A patch is a list of text lines. Lines starting with ``#`` are comments;
    the comments ``# File size: N`` and ``# Description: text`` carry
    metadata. Every other non-blank line has the form
    ``OFFSET: AA BB -> CC DD`` where all numbers are hexadecimal and ``*`` in
    the "before" section matches any byte.
    """

    # Number of bytes compared at once by diff() before falling back to a
    # byte-by-byte scan of a chunk that is known to differ.
    CHUNK_SIZE: Final[int] = 1024

    @staticmethod
    def _hex(val: int) -> str:
        """Return *val* as upper-case hex, zero-padded to at least two digits."""
        return f"{val:02X}"

    @staticmethod
    def _comment_text(line: str) -> Optional[str]:
        """Return the text after ``#`` if *line* is a comment, else ``None``."""
        stripped = line.strip()
        if stripped.startswith('#'):
            return stripped[1:].strip()
        return None

    @staticmethod
    def diff(bin1: bytes, bin2: bytes) -> List[str]:
        """Return patch lines that turn *bin1* into *bin2*.

        The result is empty when the inputs are identical. Otherwise the
        first line is a ``# File size:`` comment followed by one line per run
        of consecutive differing bytes.

        Raises:
            BinaryDiffException: if the two inputs differ in length.
        """
        binlength = len(bin1)
        if binlength != len(bin2):
            raise BinaryDiffException("Cannot diff different-sized binary blobs!")

        # Collect the offset of every differing byte. Whole chunks are
        # compared first, since files are usually mostly the same.
        offsets: List[int] = []
        for start in range(0, binlength, BinaryDiff.CHUNK_SIZE):
            chunk1 = bin1[start:(start + BinaryDiff.CHUNK_SIZE)]
            chunk2 = bin2[start:(start + BinaryDiff.CHUNK_SIZE)]
            if chunk1 == chunk2:
                continue
            # zip() stops at the end of the (possibly short) final chunk, so
            # we never index past the end of the binaries.
            offsets.extend(
                start + i
                for i, (byte1, byte2) in enumerate(zip(chunk1, chunk2))
                if byte1 != byte2
            )

        # Don't bother with any combination if we have nothing to do.
        if not offsets:
            return []

        # Include the original byte size for later comparison/checks.
        ret: List[str] = [f"# File size: {binlength}"]

        def _hexrun(val: bytes) -> str:
            return " ".join(BinaryDiff._hex(v) for v in val)

        def _output(first: int, last: int) -> None:
            ret.append(
                f"{BinaryDiff._hex(first)}: {_hexrun(bin1[first:(last + 1)])} -> "
                f"{_hexrun(bin2[first:(last + 1)])}"
            )

        # Merge adjacent offsets into runs and emit one line per run.
        run_start = run_end = offsets[0]
        for offset in offsets[1:]:
            if offset == run_end + 1:
                run_end = offset
            else:
                _output(run_start, run_end)
                run_start = run_end = offset

        # Make sure we output the last run.
        _output(run_start, run_end)
        return ret

    @staticmethod
    def size(patchlines: List[str]) -> Optional[int]:
        """Return the value of the first ``# File size:`` comment, if any.

        Raises:
            ValueError: if the text after the colon is not a decimal integer.
        """
        for patch in patchlines:
            comment = BinaryDiff._comment_text(patch)
            if comment is not None and comment.lower().startswith('file size:'):
                return int(comment[10:].strip())
        return None

    @staticmethod
    def _convert(val: str) -> Optional[int]:
        """Parse one hex token; the wildcard ``*`` is returned as ``None``."""
        val = val.strip()
        if val == '*':
            return None
        return int(val, 16)

    @staticmethod
    def _gather_differences(patchlines: List[str], reverse: bool) -> List[Tuple[int, Optional[bytes], bytes]]:
        """Parse *patchlines* into ``(offset, old, new)`` single-byte tuples.

        ``old`` is ``None`` for a wildcard. When *reverse* is true the old and
        new bytes are swapped so the patch can be undone. Comment lines and
        blank lines are skipped.

        Raises:
            BinaryDiffException: on before/after length mismatch, an empty
                change, a wildcard in the "after" section, or a wildcard in a
                patch that is being reversed.
            ValueError: if a line is not of the form ``OFFSET: .. -> ..`` or
                contains a token that is not valid hex.
        """
        differences: List[Tuple[int, Optional[bytes], bytes]] = []

        for patch in patchlines:
            if not patch.strip() or BinaryDiff._comment_text(patch) is not None:
                # Blank line or comment, nothing to apply.
                continue

            start_offset, patch_contents = patch.split(':', 1)
            before, after = patch_contents.split('->')
            beforevals = [BinaryDiff._convert(x) for x in before.split()]
            aftervals = [BinaryDiff._convert(x) for x in after.split()]

            if len(beforevals) != len(aftervals):
                raise BinaryDiffException(
                    f"Patch before and after length mismatch at "
                    f"offset {start_offset}!"
                )
            if not beforevals:
                raise BinaryDiffException(
                    f"Must have at least one byte to change at "
                    f"offset {start_offset}!"
                )

            offset = int(start_offset.strip(), 16)

            for i, (old_val, new_val) in enumerate(zip(beforevals, aftervals)):
                if new_val is None:
                    raise BinaryDiffException(
                        f"Cannot convert a location to a wildcard "
                        f"at offset {start_offset}"
                    )
                if old_val is None and reverse:
                    raise BinaryDiffException(
                        f"Patch offset {start_offset} specifies a wildcard and cannot "
                        f"be reversed!"
                    )
                differences.append(
                    (
                        offset + i,
                        None if old_val is None else bytes([old_val]),
                        bytes([new_val]),
                    )
                )

        if reverse:
            # We cast here because mypy can't see that the loop above already
            # rejected every wildcard when reverse is set, so x[1] is never None.
            differences = [cast(Tuple[int, Optional[bytes], bytes], (x[0], x[2], x[1])) for x in differences]

        return differences

    @staticmethod
    def _find_problem(
        binary: bytes,
        differences: Iterable[Tuple[int, Optional[bytes], bytes]],
    ) -> Optional[str]:
        """Return a message for the first difference that cannot apply, else ``None``."""
        for offset, old, _ in differences:
            # An offset equal to the length is already one past the last byte.
            if offset >= len(binary):
                return (
                    f"Patch offset {BinaryDiff._hex(offset)} is beyond the end of "
                    f"the binary!"
                )
            if old is not None and binary[offset:(offset + 1)] != old:
                return (
                    f"Patch offset {BinaryDiff._hex(offset)} expecting {BinaryDiff._hex(old[0])} "
                    f"but found {BinaryDiff._hex(binary[offset])}!"
                )
        return None

    @staticmethod
    def patch(
        binary: bytes,
        patchlines: List[str],
        *,
        reverse: bool = False,
    ) -> bytes:
        """Apply *patchlines* to *binary* and return the patched bytes.

        With ``reverse=True`` the patch is undone instead of applied.

        Raises:
            BinaryDiffException: if the patch's file size does not match, an
                offset lies outside the binary, an expected byte does not
                match, or the patch itself is invalid.
        """
        file_size = BinaryDiff.size(patchlines)
        if file_size is not None and file_size != len(binary):
            raise BinaryDiffException(
                f"Patch is for binary of size {file_size} but binary is {len(binary)} "
                f"bytes long!"
            )

        # Sorted so the lowest offending offset is the one reported.
        differences = sorted(
            BinaryDiff._gather_differences(patchlines, reverse),
            key=lambda diff: diff[0],
        )
        problem = BinaryDiff._find_problem(binary, differences)
        if problem is not None:
            raise BinaryDiffException(problem)

        # Overwrite in place on a copy so the output always keeps the input
        # length, even if the patch mentions an offset more than once.
        patched = bytearray(binary)
        for offset, _, new in differences:
            patched[offset] = new[0]
        return bytes(patched)

    @staticmethod
    def can_patch(
        binary: bytes,
        patchlines: List[str],
        *,
        reverse: bool = False,
        ignore_size_differences: bool = False,
    ) -> Tuple[bool, str]:
        """Check whether *patchlines* would apply cleanly to *binary*.

        Returns ``(True, "")`` on success, otherwise ``(False, reason)``.
        The ``# File size:`` check is skipped when *ignore_size_differences*
        is true. An invalid patch still raises, as in :meth:`patch`.
        """
        if not ignore_size_differences:
            file_size = BinaryDiff.size(patchlines)
            if file_size is not None and file_size != len(binary):
                return (
                    False,
                    f"Patch is for binary of size {file_size} but binary is {len(binary)} "
                    f"bytes long!"
                )

        problem = BinaryDiff._find_problem(
            binary,
            BinaryDiff._gather_differences(patchlines, reverse),
        )
        if problem is not None:
            return (False, problem)

        # Didn't find any problems.
        return (True, "")

    @staticmethod
    def description(patchlines: List[str]) -> Optional[str]:
        """Return the text of the first ``# Description:`` comment, if any.

        The keyword is matched case-insensitively; the description itself is
        returned with its original capitalisation.
        """
        for patch in patchlines:
            comment = BinaryDiff._comment_text(patch)
            if comment is not None and comment.lower().startswith('description:'):
                return comment[12:].strip()
        return None

    @staticmethod
    def needed_amount(patchlines: List[str]) -> int:
        """Return the minimum binary length needed to apply *patchlines*.

        This is the highest patched offset plus one, or 0 for an empty patch.
        """
        differences = BinaryDiff._gather_differences(patchlines, False)
        return max((offset for offset, _, _ in differences), default=-1) + 1
#!/usr/bin/env python3
import argparse
import os
import sys

from arcadeutils.binary import BinaryDiff


def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser with its ``diff`` and ``patch`` sub-commands."""
    parser = argparse.ArgumentParser(
        description="Utilities for diffing or patching binary files.",
    )
    subparsers = parser.add_subparsers(help='commands', dest='command')

    # Parser for diffing two binary files
    diff_parser = subparsers.add_parser('diff', help='create a diff of two same-length binary files')
    diff_parser.add_argument(
        'file1',
        metavar='FILE1',
        type=str,
        help='the base file that we will output diffs relative to',
    )
    diff_parser.add_argument(
        'file2',
        metavar='FILE2',
        type=str,
        help='the file that we will compare against the base file to find diffs',
    )
    diff_parser.add_argument(
        '--patch-file',
        metavar='FILE',
        type=str,
        help='write patches to a file instead of stdout',
    )

    # Parser for patching a binary file
    patch_parser = subparsers.add_parser('patch', help='patch a binary file using a previously created diff')
    patch_parser.add_argument(
        'bin',
        metavar='BIN',
        type=str,
        help='the binary file we should patch',
    )
    patch_parser.add_argument(
        'out',
        metavar='OUT',
        type=str,
        help='the file we should write the patched binary to',
    )
    patch_parser.add_argument(
        '--patch-file',
        metavar='FILE',
        type=str,
        help='read patches from a file instead of stdin',
    )
    patch_parser.add_argument(
        '--reverse',
        action="store_true",
        help='perform the patch in reverse (undo the patch)',
    )

    return parser


def _run_diff(args: argparse.Namespace) -> int:
    """Diff ``args.file1`` against ``args.file2``; return the exit code."""
    with open(args.file1, "rb") as fp:
        file1 = fp.read()
    with open(args.file2, "rb") as fp:
        file2 = fp.read()

    try:
        differences = BinaryDiff.diff(file1, file2)
    except Exception as e:
        print(f"Could not diff {args.file1} against {args.file2}: {str(e)}", file=sys.stderr)
        return 1

    if not args.patch_file:
        for line in differences:
            print(line)
    else:
        with open(args.patch_file, "w") as fp:
            # Text mode already translates "\n" to the platform line ending;
            # joining with os.linesep would produce "\r\r\n" on Windows.
            fp.writelines(f"{line}\n" for line in differences)

    return 0


def _run_patch(args: argparse.Namespace) -> int:
    """Patch ``args.bin`` into ``args.out``; return the exit code."""
    with open(args.bin, "rb") as fp:
        old = fp.read()

    if not args.patch_file:
        differences = sys.stdin.readlines()
    else:
        with open(args.patch_file, "r") as fp:
            differences = fp.readlines()
    # Drop blank lines and surrounding whitespace before parsing.
    differences = [d.strip() for d in differences if d.strip()]

    try:
        new = BinaryDiff.patch(old, differences, reverse=args.reverse)
    except Exception as e:
        print(f"Could not patch {args.bin}: {str(e)}", file=sys.stderr)
        return 1

    with open(args.out, "wb") as fp:
        fp.write(new)

    print(f"Patched {args.bin} and wrote to {args.out}.")
    return 0


def main() -> int:
    """Entry point: parse the command line and run the chosen sub-command.

    Returns 0 on success and 1 when the command is missing or fails.
    """
    parser = _build_parser()
    args = parser.parse_args()

    if args.command == 'diff':
        return _run_diff(args)
    if args.command == 'patch':
        return _run_patch(args)

    print(f"Please specify a valid command!{os.linesep}", file=sys.stderr)
    parser.print_help()
    return 1


if __name__ == "__main__":
    sys.exit(main())