"""Encode and decode names with a variable-length 4/6-bit letter code.

The twelve most common letters get 4-bit codes (values 0-11).  The other
fourteen letters, plus period and space, get 6-bit codes whose top two bits
are always set (values 48-63), so a decoder can tell the two code sizes
apart by looking at the first two bits of each code.

Run as a script to encode every line of a names list, decode it again,
and report the encoded sizes:

    name_encoding.py path-to-names-list.txt [maximum-length]
"""
# see ../txt/character-encoding-idea.txt
#
# Recovered from the patch "6502_name_codec: add encoder"
# (commit 814cd64358e52bffd4fd2e701d034c5ccfd81e18, Connor Olding,
# Tue, 7 Jun 2022), which creates 6502_name_codec/name_encoding.py.

import sys

# TODO: replace this with a perfect hashing function.
# note that the order within each half doesn't actually matter,
# so long as the first and second halves are separate.
# ETAOINSRHLDC UMFPGWYBVKXJ QZ
# C gets demoted:
# ETAOINSRHLDU CMFPGWYBVKXJ QZ
# tested against: ~py/../txt/names-usa-2021-no.txt
# and also: ~/play/hash/pokemon8-alpha.txt
# mapping = "ETAOINSRHLDC" + "UMFPGWYBVKXJ" + "QZ"  # 718 bytes, 4151 poké (mayzner)
# mapping = "ETAOINSHRCLU" + "DMFPGWYBVKXJ" + "QZ"  # 725 bytes, 4132 poké (swap-D-U)
mapping = "ETAOINSHRDLU" + "CMFPGWYBVKXJ" + "QZ"  # 724 bytes, 4130 poké (classic)
# mapping = "AERNILHTSOYC" + "DMJBUGKPVWFX" + "ZQ"  # 724 bytes, 4181 poké (names-usa)
# mapping = "AEORILNTSUCM" + "DGPHBKYWFVZX" + "JQ"  # 736 bytes, 4113 poké (???)
# mapping = "EAIORTNSLCUP" + "MDHGYBVFKWZX" + "QJ"  # 738 bytes, 4108 poké (cain.txt)
# mapping = "EAIONRTLSUDG" + "BCMPFHVWYKJX" + "QZ"  # 732 bytes, 4124 poké (scrabble)
# shakespeare: ETOAHISNRLDUMYW,FCGBP.VK';:?!-JQXZ%
# mapping = "EAIORNTSLMDH" + "CUGKYPBFXZVW" + "JQ"  # (usernames)

# scrabble scores, for reference:
# 1 point: E ×12, A ×9, I ×9, O ×8, N ×6, R ×6, T ×6, L ×4, S ×4, U ×4
# 2 points: D ×4, G ×3
# 3 points: B ×2, C ×2, M ×2, P ×2
# 4 points: F ×2, H ×2, V ×2, W ×2, Y ×2
# 5 points: K ×1
# 8 points: J ×1, X ×1
# 10 points: Q ×1, Z ×1

# "numeric" mapping (not used by the encoder or decoder below):
# napping = "0123456789??" + "$€¥¢????????" + "??"  # NOTE: still two more slots!
napping = "0123456789-?" + "!@#$%^&*()_+" + "/="  # NOTE: still two more slots!
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# still needs mapping: `~ '" ;: <> [] {} \| (14 missing)

# unmapping = {i: c for i, c in enumerate(mapping)}
# code -> character; codes 26 and 27 are the period and the space.
unmapping = mapping + ". "
# convert keys to zero-based indices into the alphabet (A=0 ... Z=25),
# so that mapping[ord(letter) - ord("A")] is that letter's code.
mapping = {ord(c) - ord("A"): i for i, c in enumerate(mapping)}

_PERIOD_CODE = 26
_SPACE_CODE = 27

# When true, the script runs in a lenient mode: encode_name() is forced,
# and only names that survive the round trip (ignoring case, underscores
# and spaces) are printed.
dumb_hack = False


def _fail(where, message):
    """Print "<where>: <message>" to stderr and exit with status 2."""
    print(f"{where}: {message}", file=sys.stderr)
    sys.exit(2)


def encode_name(name, maxlen=None, force=False, *, where="<input>"):
    """Pack *name* into bytes using the 4/6-bit code described above.

    Case is not stored: an uppercase letter is only accepted at the start
    of the name or right after a space or period, and a lowercase letter
    only elsewhere.  The decoder reapplies the same rule.

    Args:
        name: text to encode (ASCII letters, space and period).
        maxlen: maximum encoded size in bytes, or None for no limit.  When
            given, a partially filled final byte is padded with 1 bits
            (which decode as nothing or as a trailing space); when None it
            is padded with 0 bits.
        force: lenient mode.  Case violations are ignored, unsupported
            characters are dropped, underscores become spaces, and
            over-long names are truncated to *maxlen* bytes instead of
            being fatal.
        where: location prefix (for example "names.txt:12") used in
            diagnostics.

    Returns:
        The encoded name as a bytes object.

    Raises:
        SystemExit: with status 2, after printing a diagnostic to stderr,
            when the name cannot be encoded and *force* is false.
    """
    bits = []
    upper = True  # is an uppercase letter expected next?
    was_replaced = -99  # index of the most recent underscore
    for i, c in enumerate(name):
        ci = ord(c)
        if 65 <= ci <= 90:  # uppercase
            if not upper and not force:
                _fail(where, "unexpected uppercase")
            bits.append(mapping[ci - 65])
            upper = False
        elif 97 <= ci <= 122:  # lowercase
            if upper and not force:
                _fail(where, "unexpected lowercase")
            bits.append(mapping[ci - 97])
            upper = False
        elif ci == 32:  # space
            bits.append(_SPACE_CODE)
            upper = True
        elif ci == 46:  # period
            bits.append(_PERIOD_CODE)
            upper = True
        elif ci in (39, 44):  # apostrophe, comma: dropped when forced
            if not force:
                _fail(where, f"unsupported character: {c}")
        elif force and ci == 95:  # underscore
            if i - 1 != was_replaced:
                bits.append(_SPACE_CODE)
            # NOTE(review): indentation was lost in the collapsed source;
            # updating this on every underscore collapses a run of
            # underscores into a single space -- confirm against upstream.
            was_replaced = i
        elif not force:
            _fail(where, f"unsupported character: {c}")

    packed = bytearray()
    remain = 0  # bits that have not been emitted yet
    length = 0  # how many bits of `remain` are valid
    for b in bits:
        if b < 12:
            remain = remain << 4 | b
            length += 4
        else:  # 12..27 become 0b110000..0b111111
            remain = remain << 6 | (b + (0b110000 - 12))
            length += 6
        # at most 7 bits are pending before a code is added and a code is
        # at most 6 bits, so one byte per iteration is always enough.
        if length >= 8:
            shift = length - 8
            packed.append(remain >> shift)
            remain &= (1 << shift) - 1
            length -= 8

    if length:
        shift = 8 - length
        padding = 0 if maxlen is None else (1 << shift) - 1
        packed.append(remain << shift | padding)
    # if maxlen is not None:
    #     while len(enc) < maxlen:
    #         enc += bytes((255,))

    enc = bytes(packed)
    if maxlen is not None and len(enc) > maxlen:
        print(f"{where}: name too long: {name}", file=sys.stderr)
        if not force:
            sys.exit(2)
        enc = enc[:maxlen]

    return enc


def decode_name(enc, maxlen=None):
    """Decode bytes produced by encode_name() back into a string.

    This is written as an explicit state machine; the state records where
    in the current byte the next code starts.

    Args:
        enc: the encoded bytes.
        maxlen: accepted for symmetry with encode_name(); not used.

    Returns:
        The decoded name.  The first letter, and each letter following a
        period or space, is uppercase; all others are lowercase.  Trailing
        spaces (which is what 1-bit padding can decode to) are stripped.
    """
    data = bytes(enc)

    # NOTE: this code assumes >> is a logical shift, not an arithmetic shift!
    dec = ""
    state = "empty"  # you would use an enum or separate code paths in low-level code
    it = iter(data)
    upper = True
    byte, hi, lo = None, None, None
    going = False

    def goto(state_):
        nonlocal going, state
        assert not going, "two gotos in a row (bug in code)"
        going, state = True, state_

    def unmap_common(b):
        # b is a complete 4-bit code.
        nonlocal upper, dec
        assert 0 <= b <= 11
        c = unmapping[b]
        dec += c if upper else c.lower()
        upper = False  # upper = b >= 14 (never true)

    def unmap_uncommon(b):
        # b is the low four bits of a 6-bit code.
        nonlocal upper, dec
        assert 0 <= b <= 15
        c = unmapping[b + 12]
        dec += c if upper else c.lower()
        upper = b >= 14  # upper = c == "." or c == " "

    def advance():
        # fetch the next byte and split it into nibbles; False at the end.
        nonlocal byte, hi, lo
        byte = next(it, None)
        if byte is None:
            hi, lo = None, None
            return False
        hi, lo = byte >> 4, byte & 15
        return True

    def split_middle(a, b):
        # four bits straddling two values: the low two of a, the high two
        # of the nibble b.
        # equivalent:
        # return (a << 2 | b >> 2) & 15
        return (a & 3) << 2 | b >> 2

    # Byte-fetching states and within-byte states strictly alternate, so
    # 2 * len + 1 steps always reach the end of the input.  The bound only
    # guards against an accidental infinite loop.
    for _ in range(2 * len(data) + 2):
        going = False  # should only be for this Python implementation

        if state == "empty":
            if not advance():
                break

            if hi >= 12:
                unmap_uncommon(split_middle(hi, lo))
                goto("have2bits")
            else:
                unmap_common(hi)
                goto("lower4")

        elif state == "have2bits":  # need 4 or 2 more
            prev = byte
            if not advance():
                break

            if prev & 3 == 3:
                unmap_uncommon(hi)
                goto("lower4")
            else:
                unmap_common(split_middle(prev, hi))
                goto("lower6")

        elif state == "have4bits":  # need 2 more
            prev = byte
            if not advance():
                break

            unmap_uncommon(split_middle(prev, hi))
            goto("lower6")

        elif state == "lower4":
            if lo >= 12:
                goto("have4bits")
            else:
                unmap_common(lo)
                goto("empty")

        elif state == "lower6":
            if hi & 3 == 3:
                unmap_uncommon(lo)
                goto("empty")
            else:
                unmap_common(split_middle(hi, lo))
                goto("have2bits")

        else:
            assert False, "impossible"

        assert going, "must go somewhere (bug in code)"

    return dec.rstrip(" ")  # don't forget to handle this in your code too!


def hexdump(b):
    """Return the values in *b* as uppercase two-digit hex, with no separators."""
    return "".join(f"{value:02X}" for value in b)


def main(argv=None):
    """Encode, decode and report on every line of a names list.

    Args:
        argv: full argument vector including the program name; defaults
            to sys.argv.

    Returns:
        The process exit status: 0 on success, 1 on a usage error.

    Raises:
        SystemExit: with status 2 when a name cannot be encoded.
        AssertionError: when a name does not survive the round trip.
    """
    argv = sys.argv if argv is None else argv
    program, args = argv[0], argv[1:]
    if not 1 <= len(args) <= 2:
        print(
            f"usage: {program} {{path-to-names-list.txt}} [maximum-length]",
            file=sys.stderr,
        )
        return 1

    fp = args[0]
    maxlen = int(args[1]) if len(args) == 2 else 6

    width = maxlen * 2
    fmt = f"{{}} {{:<{width}}} {{:<{width}}} {{:<{width}}}"
    total = 0
    with open(fp, "r") as names:
        for li, line in enumerate(names):
            name = line.rstrip()
            encoded = encode_name(
                name, maxlen, force=dumb_hack, where=f"{fp}:{li + 1}"
            )
            decoded = decode_name(encoded, maxlen)
            # print(f"{len(encoded)} {name:<12} {hexdump(encoded):<12} {decoded:<12}")
            if dumb_hack:
                if name.replace("_", "").lower() == decoded.replace(" ", "").lower():
                    print(decoded)
            else:
                print(fmt.format(len(encoded), name, hexdump(encoded), decoded))
                # explicit raise so the self-check survives `python -O`.
                if name != decoded:
                    raise AssertionError((name, decoded))
            total += len(encoded)
    print(f"Total: {total}", file=sys.stderr)
    return 0


if __name__ == "__main__":
    sys.exit(main())