backyard/6502_name_codec/name_encoding.py

267 lines
8.5 KiB
Python
Raw Normal View History

2022-06-06 19:09:48 -07:00
# see ../txt/character-encoding-idea.txt
import sys
# Command line: <path to names list> [maximum encoded length in bytes].
program = sys.argv[0]
args = sys.argv[1:]
if not 1 <= len(args) <= 2:
    usage = f"usage: {program} {{path-to-names-list.txt}} [maximum-length]"
    print(usage, file=sys.stderr)
    sys.exit(1)
# fp is the path of the names list; it is also used as the prefix of
# diagnostics printed by encode_name.
fp = args[0]
# The maximum length defaults to 6 when it is not given on the command line.
maxlen = 6 if len(args) < 2 else int(args[1])
# TODO: replace this with a perfect hashing function.
# A letter's position in the ordering below is its code: the first twelve
# letters receive codes 0-11 and every later letter a code of 12 or more.
# note that the order within each half doesn't actually matter,
# so long as the first and second halves are separate.
# ETAOINSRHLDC UMFPGWYBVKXJ QZ
# C gets demoted:
# ETAOINSRHLDU CMFPGWYBVKXJ QZ
# tested against: ~py/../txt/names-usa-2021-no.txt
# and also: ~/play/hash/pokemon8-alpha.txt
# mapping = "ETAOINSRHLDC" + "UMFPGWYBVKXJ" + "QZ" # 718 bytes, 4151 poké (mayzner)
# mapping = "ETAOINSHRCLU" + "DMFPGWYBVKXJ" + "QZ" # 725 bytes, 4132 poké (swap-D-U)
mapping = "ETAOINSHRDLU" + "CMFPGWYBVKXJ" + "QZ"  # 724 bytes, 4130 poké (classic)
# mapping = "AERNILHTSOYC" + "DMJBUGKPVWFX" + "ZQ" # 724 bytes, 4181 poké (names-usa)
# mapping = "AEORILNTSUCM" + "DGPHBKYWFVZX" + "JQ" # 736 bytes, 4113 poké (???)
# mapping = "EAIORTNSLCUP" + "MDHGYBVFKWZX" + "QJ" # 738 bytes, 4108 poké (cain.txt)
# mapping = "EAIONRTLSUDG" + "BCMPFHVWYKJX" + "QZ" # 732 bytes, 4124 poké (scrabble)
# shakespeare: ETOAHISNRLDUMYW,FCGBP.VK';:?!-JQXZ%
# mapping = "EAIORNTSLMDH" + "CUGKYPBFXZVW" + "JQ" # (usernames)
# scrabble scores, for reference:
# 1 point: E ×12, A ×9, I ×9, O ×8, N ×6, R ×6, T ×6, L ×4, S ×4, U ×4
# 2 points: D ×4, G ×3
# 3 points: B ×2, C ×2, M ×2, P ×2
# 4 points: F ×2, H ×2, V ×2, W ×2, Y ×2
# 5 points: K ×1
# 8 points: J ×1, X ×1
# 10 points: Q ×1, Z ×1
# "numeric" mapping:
# (earlier draft, kept as a commented-out alternative like the ones above)
# napping = "0123456789??" + "$€¥¢????????" + "??" # NOTE: still two more slots!
napping = "0123456789-?" + "!@#$%^&*()_+" + "/="  # NOTE: still two more slots!
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
# still needs mapping: `~ '" ;: <> [] {} \| (14 missing)
# unmapping = {i: c for i, c in enumerate(mapping)}
# Decoding table, indexed by code: the 26 letters, then "." (26) and " " (27).
unmapping = mapping + ". "
# Rebind `mapping` as the encoding table: the key is a letter's zero-based
# offset from "A" (A=0 ... Z=25) and the value is that letter's code.
mapping = {ord(c) - ord("A"): i for i, c in enumerate(mapping)}
def encode_name(name, maxlen=None, force=False):
    """Pack a name into the variable-width 4/6-bit encoding.

    Each character becomes a code: letters are looked up in the module-level
    ``mapping`` table, "." is code 26 and " " is code 27.  Codes below 12 are
    written as 4 bits; all other codes are written as 6 bits holding
    ``code + (0b110000 - 12)``.  Bits are packed most-significant first.

    Case is not stored.  Unless ``force`` is set, a name must start each word
    (the start of the name, or after a space or period) with an uppercase
    letter and continue in lowercase; anything else is reported on stderr and
    the process exits with status 2.  Diagnostics are prefixed with the
    module-level ``fp`` and ``li`` (file path and zero-based line index).

    Args:
        name: the text to encode.
        maxlen: maximum number of encoded bytes, or None for no limit.  It
            also selects the padding of a partial last byte: 0 bits when it
            is None, 1 bits otherwise.
        force: tolerate bad input.  Case violations are accepted, unsupported
            characters are dropped, each run of underscores becomes a single
            space code, and an over-long result is truncated to ``maxlen``
            bytes (after printing the diagnostic) instead of exiting.

    Returns:
        The encoded name as ``bytes``.
    """
    codes = []
    expect_capital = True
    last_underscore = -99  # position of the most recent underscore seen
    for pos, ch in enumerate(name):
        if "A" <= ch <= "Z":
            if not (expect_capital or force):
                print(f"{fp}:{li + 1}: unexpected uppercase", file=sys.stderr)
                sys.exit(2)
            codes.append(mapping[ord(ch) - 65])
            expect_capital = False
        elif "a" <= ch <= "z":
            if expect_capital and not force:
                print(f"{fp}:{li + 1}: unexpected lowercase", file=sys.stderr)
                sys.exit(2)
            expect_capital = False
            codes.append(mapping[ord(ch) - 97])
        elif ch == " ":
            codes.append(27)
            expect_capital = True
        elif ch == ".":
            codes.append(26)
            expect_capital = True
        elif force and ch == "_":
            # Only the first underscore of a consecutive run emits a space.
            if pos - 1 != last_underscore:
                codes.append(27)
            last_underscore = pos
        elif not force:
            # Apostrophes, commas, underscores and everything else land here;
            # with force set they are silently skipped instead.
            print(f"{fp}:{li + 1}: unsupported character: {c}".replace("{c}", "{c}") if False else f"{fp}:{li + 1}: unsupported character: {ch}", file=sys.stderr)
            sys.exit(2)

    # Accumulate the whole bit stream in one integer, then slice it to bytes.
    packed = 0
    nbits = 0
    for code in codes:
        if code < 12:
            packed = packed << 4 | code
            nbits += 4
        else:
            packed = packed << 6 | (code + (0b110000 - 12))
            nbits += 6

    pad = (-nbits) % 8
    if pad:
        packed <<= pad
        if maxlen is not None:
            packed |= (1 << pad) - 1  # fill the tail with 1 bits
    enc = packed.to_bytes((nbits + pad) // 8, "big")

    if maxlen is not None and len(enc) > maxlen:
        print(f"{fp}:{li + 1}: name too long: {name}", file=sys.stderr)
        if not force:
            sys.exit(2)
        enc = enc[:maxlen]
    return enc
def decode_name(enc, maxlen=None):
    """Decode bytes in the variable-width 4/6-bit encoding back into text.

    The input is read as a stream of codes, most-significant bit first.  A
    nibble below 12 is a 4-bit code (0-11); a nibble of 12 or more (top two
    bits set) starts a 6-bit code whose low four bits select one of the 16
    remaining entries.  Codes index the module-level ``unmapping`` string.
    Case is reconstructed: the first letter, and each letter following a
    period or space, is uppercase; all other letters are lowercase.

    The decoder is written as an explicit state machine so that it can be
    transcribed to low-level code.  The states describe which bits of the
    current byte are still unread:

    * ``empty``     - nothing pending; fetch a byte and read its high nibble.
    * ``lower4``    - the low nibble is unread.
    * ``lower6``    - the low six bits are unread.
    * ``have2bits`` - only the low two bits are unread; the rest of the code
      comes from the next byte.
    * ``have4bits`` - the low nibble started a 6-bit code; its last two bits
      come from the next byte.

    Decoding stops when a state needs another byte and none is left, so an
    incomplete trailing code is discarded.

    Args:
        enc: the encoded bytes (any iterable of integers 0-255).
        maxlen: accepted for symmetry with ``encode_name``; it does not
            affect decoding.

    Returns:
        The decoded text with trailing spaces removed.
    """
    # NOTE: this code assumes >> is a logical shift, not an arithmetic shift!
    data = list(enc)
    it = iter(data)
    pieces = []
    state = "empty"  # you would use an enum or separate code paths in low-level code
    upper = True
    going = False
    byte, hi, lo = None, None, None

    def goto(state_):
        # Switch state, checking that each iteration transitions exactly once.
        nonlocal going, state
        assert not going, "two gotos in a row (bug in code)"
        going, state = True, state_

    def unmap_common(b):
        # Emit the character for a 4-bit code.
        nonlocal upper
        assert 0 <= b <= 11
        c = unmapping[b]
        pieces.append(c if upper else c.lower())
        upper = False  # upper = b >= 14 (never true)

    def unmap_uncommon(b):
        # Emit the character for the low four bits of a 6-bit code.
        nonlocal upper
        assert 0 <= b <= 15
        c = unmapping[b + 12]
        pieces.append(c if upper else c.lower())
        upper = b >= 14  # upper = c == "." or c == " "

    def advance():
        # Fetch the next byte and split it into nibbles; False at end of input.
        nonlocal byte, hi, lo
        byte = next(it, None)
        if byte is None:
            hi, lo = None, None
            return False
        hi, lo = byte >> 4, byte & 15
        return True

    def split_middle(a, b):
        # Four bits straddling two values: low two bits of a, then the top
        # two bits of the nibble b.
        # equivalent:
        # return (a << 2 | b >> 2) & 15
        return (a & 3) << 2 | b >> 2

    # Every state that does not fetch a byte is immediately followed by one
    # that does, so there are at most two transitions per input byte, plus one
    # final failed fetch.  This bound guarantees termination without
    # truncating long inputs.
    for _ in range(2 * len(data) + 1):
        going = False  # should only be for this Python implementation
        if state == "empty":
            if not advance():
                break
            if hi >= 12:
                unmap_uncommon(split_middle(hi, lo))
                goto("have2bits")
            else:
                unmap_common(hi)
                goto("lower4")
        elif state == "have2bits":  # need 4 or 2 more
            prev = byte
            if not advance():
                break
            if prev & 3 == 3:
                unmap_uncommon(hi)
                goto("lower4")
            else:
                unmap_common(split_middle(prev, hi))
                goto("lower6")
        elif state == "have4bits":  # need 2 more
            prev = byte
            if not advance():
                break
            unmap_uncommon(split_middle(prev, hi))
            goto("lower6")
        elif state == "lower4":
            if lo >= 12:
                goto("have4bits")
            else:
                unmap_common(lo)
                goto("empty")
        elif state == "lower6":
            if hi & 3 == 3:
                unmap_uncommon(lo)
                goto("empty")
            else:
                unmap_common(split_middle(hi, lo))
                goto("have2bits")
        else:
            assert False, "impossible"
        assert going, "must go somewhere (bug in code)"
    return "".join(pieces).rstrip(" ")  # don't forget to handle this in your code too!
def hexdump(b):
    """Return the values in ``b`` as uppercase hex, two digits each, no separators."""
    return "".join("{:02X}".format(value) for value in b)
# Driver: encode and decode every line of the names list and report sizes.
# With dumb_hack enabled, encode_name runs in force mode and only the names
# that survive the round trip (ignoring case, underscores and spaces) are
# printed; otherwise every name is printed and must round-trip exactly.
dumb_hack = False
# Columns: encoded length, original name, hex dump, decoded name.
fmt = f"{{}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}}"
total = 0
# The file is opened in a `with` block so it is closed even if a name is
# rejected (encode_name exits the process) or the round-trip assertion fails.
with open(fp, "r") as names_file:
    # NOTE: `li` must remain a module-level name; encode_name reads it to
    # build the "file:line:" prefix of its diagnostics.
    for li, line in enumerate(names_file):
        name = line.rstrip()
        encoded = encode_name(name, maxlen, force=dumb_hack)
        decoded = decode_name(encoded, maxlen)
        # print(f"{len(encoded)} {name:<12} {hexdump(encoded):<12} {decoded:<12}")
        if dumb_hack:
            if name.replace("_", "").lower() == decoded.replace(" ", "").lower():
                print(decoded)
        else:
            print(fmt.format(len(encoded), name, hexdump(encoded), decoded))
            assert name == decoded, (name, decoded)
        total += len(encoded)
print(f"Total: {total}", file=sys.stderr)