backyard/6502_name_codec/name_encoding.py

267 lines
8.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# see ../txt/character-encoding-idea.txt
import sys
program, args = sys.argv[0], sys.argv[1:]
if len(args) == 0 or len(args) > 2:
print(
f"usage: {program} {{path-to-names-list.txt}} [maximum-length]", file=sys.stderr
)
sys.exit(1)
fp = args[0]
maxlen = int(args[1]) if len(args) == 2 else 6
# TODO: replace this with a perfect hashing function.
# note that the order within each half doesn't actually matter,
# so long as the first and second halves are separate.
# ETAOINSRHLDC UMFPGWYBVKXJ QZ
# C gets demoted:
# ETAOINSRHLDU CMFPGWYBVKXJ QZ
# tested against: ~py/../txt/names-usa-2021-no.txt
# and also: ~/play/hash/pokemon8-alpha.txt
# mapping = "ETAOINSRHLDC" + "UMFPGWYBVKXJ" + "QZ" # 718 bytes, 4151 poké (mayzner)
# mapping = "ETAOINSHRCLU" + "DMFPGWYBVKXJ" + "QZ" # 725 bytes, 4132 poké (swap-D-U)
mapping = "ETAOINSHRDLU" + "CMFPGWYBVKXJ" + "QZ" # 724 bytes, 4130 poké (classic)
# mapping = "AERNILHTSOYC" + "DMJBUGKPVWFX" + "ZQ" # 724 bytes, 4181 poké (names-usa)
# mapping = "AEORILNTSUCM" + "DGPHBKYWFVZX" + "JQ" # 736 bytes, 4113 poké (???)
# mapping = "EAIORTNSLCUP" + "MDHGYBVFKWZX" + "QJ" # 738 bytes, 4108 poké (cain.txt)
# mapping = "EAIONRTLSUDG" + "BCMPFHVWYKJX" + "QZ" # 732 bytes, 4124 poké (scrabble)
# shakespeare: ETOAHISNRLDUMYW,FCGBP.VK';:?!-JQXZ%
# mapping = "EAIORNTSLMDH" + "CUGKYPBFXZVW" + "JQ" # (usernames)
# scrabble scores, for reference:
# 1 point: E ×12, A ×9, I ×9, O ×8, N ×6, R ×6, T ×6, L ×4, S ×4, U ×4
# 2 points: D ×4, G ×3
# 3 points: B ×2, C ×2, M ×2, P ×2
# 4 points: F ×2, H ×2, V ×2, W ×2, Y ×2
# 5 points: K ×1
# 8 points: J ×1, X ×1
# 10 points: Q ×1, Z ×1
# "numeric" mapping:
napping = "0123456789??" + "$€¥¢????????" + "??" # NOTE: still two more slots!
napping = "0123456789-?" + "!@#$%^&*()_+" + "/=" # NOTE: still two more slots!
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# still needs mapping: `~ '" ;: <> [] {} \| (14 missing)
# unmapping = {i: c for i, c in enumerate(mapping)}
unmapping = mapping + ". "
# convert keys to one-based indices into the alphabet.
mapping = {ord(c) - ord("A"): i for i, c in enumerate(mapping)}
def encode_name(name, maxlen=None, force=False):
bits = []
upper = True
was_replaced = -99
for i, c in enumerate(name):
ci = ord(c)
if ci >= 65 and ci <= 90: # uppercase
if not upper and not force:
print(f"{fp}:{li + 1}: unexpected uppercase", file=sys.stderr)
sys.exit(2)
bits.append(mapping[ci - 65])
upper = False
elif ci >= 97 and ci <= 122: # lowercase
if upper and not force:
print(f"{fp}:{li + 1}: unexpected lowercase", file=sys.stderr)
sys.exit(2)
upper = False
bits.append(mapping[ci - 97])
elif ci in (32, 39, 44, 46): # space, apostrophe, comma, period
if ci == 32: # space
bits.append(27)
upper = True
elif ci == 46: # period
bits.append(26)
upper = True
elif not force:
print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr)
sys.exit(2)
elif force and ci in (95,): # underscore
if i - 1 != was_replaced:
bits.append(27)
was_replaced = i
elif not force:
print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr)
sys.exit(2)
enc = b""
everything = 0 # python version only, mostly for debugging
everything_length = 0
remain = 0
length = 0
for b in bits:
# remain = b if b < 12 else b - 12 + 48
# length = 4 if b < 12 else 6
if b < 12:
remain <<= 4
remain |= b
length += 4
# print(4, f"{bin(b)[2:]:>04}", file=sys.stderr)
everything <<= 4
everything |= b
everything_length += 4
else: # when b >= 12
v = b + (0b110000 - 12)
remain <<= 6
remain |= v
length += 6
# print(6, f"{bin(v)[2:]:>06}", file=sys.stderr)
everything <<= 6
everything |= v
everything_length += 6
if length >= 8:
shift = length - 8
enc += bytes((remain >> shift,))
remain ^= remain >> shift << shift
length -= 8
if length and maxlen is None:
shift = 8 - length
enc += bytes((remain << shift,))
elif length:
shift = 8 - length
ones = (1 << shift) - 1
enc += bytes((remain << shift | ones,))
# if maxlen is not None:
# while len(enc) < maxlen:
# enc += bytes((255,))
# print(everything_length, bin(everything), file=sys.stderr)
if maxlen is not None and len(enc) > maxlen:
print(f"{fp}:{li + 1}: name too long: {name}", file=sys.stderr)
if force:
enc = enc[:maxlen]
else:
sys.exit(2)
return enc
def decode_name(enc, maxlen=None):
if maxlen is None:
maxlen = len(enc)
# NOTE: this code assumes >> is a logical shift, not an arithmetic shift!
dec = ""
state = "empty" # you would use an enum or separate code paths in low-level code
it = iter(enc)
upper = True
byte, hi, lo = None, None, None
def goto(state_):
nonlocal going, state
assert not going, "two gotos in a row (bug in code)"
going, state = True, state_
def unmap_common(b):
nonlocal upper, dec
assert 0 <= b <= 11
c = unmapping[b]
dec += c if upper else c.lower()
upper = False # upper = b >= 14 (never true)
def unmap_uncommon(b):
nonlocal upper, dec
assert 0 <= b <= 15
c = unmapping[b + 12]
dec += c if upper else c.lower()
upper = b >= 14 # upper = c == "." or c == " "
def advance():
nonlocal byte, hi, lo
byte = next(it, None)
if byte is None:
hi, lo = None, None
return False
else:
hi, lo = byte >> 4, byte & 15
return True
def split_middle(a, b):
# equivalent:
# return (a << 2 | b >> 2) & 15
return (a & 3) << 2 | b >> 2
for _ in range(99): # some big number to prevent accidental self-denial-of-service
going = False # should only be for this Python implementation
if state == "empty":
if not advance():
break
if hi >= 12:
unmap_uncommon(split_middle(hi, lo))
goto("have2bits")
else:
unmap_common(hi)
goto("lower4")
elif state == "have2bits": # need 4 or 2 more
prev = byte
if not advance():
break
if prev & 3 == 3:
unmap_uncommon(hi)
goto("lower4")
else:
unmap_common(split_middle(prev, hi))
goto("lower6")
elif state == "have4bits": # need 2 more
prev = byte
if not advance():
break
unmap_uncommon(split_middle(prev, hi))
goto("lower6")
elif state == "lower4":
if lo >= 12:
goto("have4bits")
else:
unmap_common(lo)
goto("empty")
elif state == "lower6":
if hi & 3 == 3:
unmap_uncommon(lo)
goto("empty")
else:
unmap_common(split_middle(hi, lo))
goto("have2bits")
else:
assert False, "impossible"
assert going, "must go somewhere (bug in code)"
return dec.rstrip(" ") # don't forget to handle this in your code too!
def hexdump(b):
return ("{:02X}" * len(b)).format(*b)
dumb_hack = False
fmt = f"{{}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}}"
total = 0
for li, line in enumerate(open(fp, "r")):
name = line.rstrip()
encoded = encode_name(name, maxlen, force=dumb_hack)
decoded = decode_name(encoded, maxlen)
# print(f"{len(encoded)} {name:<12} {hexdump(encoded):<12} {decoded:<12}")
if dumb_hack:
if name.replace("_", "").lower() == decoded.replace(" ", "").lower():
print(decoded)
else:
print(fmt.format(len(encoded), name, hexdump(encoded), decoded))
if not dumb_hack:
assert name == decoded, (name, decoded)
total += len(encoded)
print(f"Total: {total}", file=sys.stderr)