267 lines
8.5 KiB
Python
267 lines
8.5 KiB
Python
|
# see ../txt/character-encoding-idea.txt
|
|||
|
|
|||
|
import sys
|
|||
|
|
|||
|
program, args = sys.argv[0], sys.argv[1:]
|
|||
|
if len(args) == 0 or len(args) > 2:
|
|||
|
print(
|
|||
|
f"usage: {program} {{path-to-names-list.txt}} [maximum-length]", file=sys.stderr
|
|||
|
)
|
|||
|
sys.exit(1)
|
|||
|
|
|||
|
fp = args[0]
|
|||
|
maxlen = int(args[1]) if len(args) == 2 else 6
|
|||
|
|
|||
|
# TODO: replace this with a perfect hashing function.
|
|||
|
# note that the order within each half doesn't actually matter,
|
|||
|
# so long as the first and second halves are separate.
|
|||
|
# ETAOINSRHLDC UMFPGWYBVKXJ QZ
|
|||
|
# C gets demoted:
|
|||
|
# ETAOINSRHLDU CMFPGWYBVKXJ QZ
|
|||
|
# tested against: ~py/../txt/names-usa-2021-no.txt
|
|||
|
# and also: ~/play/hash/pokemon8-alpha.txt
|
|||
|
# mapping = "ETAOINSRHLDC" + "UMFPGWYBVKXJ" + "QZ" # 718 bytes, 4151 poké (mayzner)
|
|||
|
# mapping = "ETAOINSHRCLU" + "DMFPGWYBVKXJ" + "QZ" # 725 bytes, 4132 poké (swap-D-U)
|
|||
|
mapping = "ETAOINSHRDLU" + "CMFPGWYBVKXJ" + "QZ" # 724 bytes, 4130 poké (classic)
|
|||
|
# mapping = "AERNILHTSOYC" + "DMJBUGKPVWFX" + "ZQ" # 724 bytes, 4181 poké (names-usa)
|
|||
|
# mapping = "AEORILNTSUCM" + "DGPHBKYWFVZX" + "JQ" # 736 bytes, 4113 poké (???)
|
|||
|
# mapping = "EAIORTNSLCUP" + "MDHGYBVFKWZX" + "QJ" # 738 bytes, 4108 poké (cain.txt)
|
|||
|
# mapping = "EAIONRTLSUDG" + "BCMPFHVWYKJX" + "QZ" # 732 bytes, 4124 poké (scrabble)
|
|||
|
# shakespeare: ETOAHISNRLDUMYW,FCGBP.VK';:?!-JQXZ%
|
|||
|
# mapping = "EAIORNTSLMDH" + "CUGKYPBFXZVW" + "JQ" # (usernames)
|
|||
|
|
|||
|
# scrabble scores, for reference:
|
|||
|
# 1 point: E ×12, A ×9, I ×9, O ×8, N ×6, R ×6, T ×6, L ×4, S ×4, U ×4
|
|||
|
# 2 points: D ×4, G ×3
|
|||
|
# 3 points: B ×2, C ×2, M ×2, P ×2
|
|||
|
# 4 points: F ×2, H ×2, V ×2, W ×2, Y ×2
|
|||
|
# 5 points: K ×1
|
|||
|
# 8 points: J ×1, X ×1
|
|||
|
# 10 points: Q ×1, Z ×1
|
|||
|
|
|||
|
# "numeric" mapping:
|
|||
|
napping = "0123456789??" + "$€¥¢????????" + "??" # NOTE: still two more slots!
|
|||
|
napping = "0123456789-?" + "!@#$%^&*()_+" + "/=" # NOTE: still two more slots!
|
|||
|
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
|
|||
|
# still needs mapping: `~ '" ;: <> [] {} \| (14 missing)
|
|||
|
|
|||
|
# unmapping = {i: c for i, c in enumerate(mapping)}
|
|||
|
unmapping = mapping + ". "
|
|||
|
# convert keys to one-based indices into the alphabet.
|
|||
|
mapping = {ord(c) - ord("A"): i for i, c in enumerate(mapping)}
|
|||
|
|
|||
|
|
|||
|
def encode_name(name, maxlen=None, force=False):
|
|||
|
bits = []
|
|||
|
upper = True
|
|||
|
was_replaced = -99
|
|||
|
for i, c in enumerate(name):
|
|||
|
ci = ord(c)
|
|||
|
if ci >= 65 and ci <= 90: # uppercase
|
|||
|
if not upper and not force:
|
|||
|
print(f"{fp}:{li + 1}: unexpected uppercase", file=sys.stderr)
|
|||
|
sys.exit(2)
|
|||
|
bits.append(mapping[ci - 65])
|
|||
|
upper = False
|
|||
|
elif ci >= 97 and ci <= 122: # lowercase
|
|||
|
if upper and not force:
|
|||
|
print(f"{fp}:{li + 1}: unexpected lowercase", file=sys.stderr)
|
|||
|
sys.exit(2)
|
|||
|
upper = False
|
|||
|
bits.append(mapping[ci - 97])
|
|||
|
elif ci in (32, 39, 44, 46): # space, apostrophe, comma, period
|
|||
|
if ci == 32: # space
|
|||
|
bits.append(27)
|
|||
|
upper = True
|
|||
|
elif ci == 46: # period
|
|||
|
bits.append(26)
|
|||
|
upper = True
|
|||
|
elif not force:
|
|||
|
print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr)
|
|||
|
sys.exit(2)
|
|||
|
elif force and ci in (95,): # underscore
|
|||
|
if i - 1 != was_replaced:
|
|||
|
bits.append(27)
|
|||
|
was_replaced = i
|
|||
|
elif not force:
|
|||
|
print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr)
|
|||
|
sys.exit(2)
|
|||
|
|
|||
|
enc = b""
|
|||
|
everything = 0 # python version only, mostly for debugging
|
|||
|
everything_length = 0
|
|||
|
remain = 0
|
|||
|
length = 0
|
|||
|
for b in bits:
|
|||
|
# remain = b if b < 12 else b - 12 + 48
|
|||
|
# length = 4 if b < 12 else 6
|
|||
|
if b < 12:
|
|||
|
remain <<= 4
|
|||
|
remain |= b
|
|||
|
length += 4
|
|||
|
# print(4, f"{bin(b)[2:]:>04}", file=sys.stderr)
|
|||
|
everything <<= 4
|
|||
|
everything |= b
|
|||
|
everything_length += 4
|
|||
|
else: # when b >= 12
|
|||
|
v = b + (0b110000 - 12)
|
|||
|
remain <<= 6
|
|||
|
remain |= v
|
|||
|
length += 6
|
|||
|
# print(6, f"{bin(v)[2:]:>06}", file=sys.stderr)
|
|||
|
everything <<= 6
|
|||
|
everything |= v
|
|||
|
everything_length += 6
|
|||
|
if length >= 8:
|
|||
|
shift = length - 8
|
|||
|
enc += bytes((remain >> shift,))
|
|||
|
remain ^= remain >> shift << shift
|
|||
|
length -= 8
|
|||
|
|
|||
|
if length and maxlen is None:
|
|||
|
shift = 8 - length
|
|||
|
enc += bytes((remain << shift,))
|
|||
|
elif length:
|
|||
|
shift = 8 - length
|
|||
|
ones = (1 << shift) - 1
|
|||
|
enc += bytes((remain << shift | ones,))
|
|||
|
# if maxlen is not None:
|
|||
|
# while len(enc) < maxlen:
|
|||
|
# enc += bytes((255,))
|
|||
|
|
|||
|
# print(everything_length, bin(everything), file=sys.stderr)
|
|||
|
|
|||
|
if maxlen is not None and len(enc) > maxlen:
|
|||
|
print(f"{fp}:{li + 1}: name too long: {name}", file=sys.stderr)
|
|||
|
if force:
|
|||
|
enc = enc[:maxlen]
|
|||
|
else:
|
|||
|
sys.exit(2)
|
|||
|
|
|||
|
return enc
|
|||
|
|
|||
|
|
|||
|
def decode_name(enc, maxlen=None):
|
|||
|
if maxlen is None:
|
|||
|
maxlen = len(enc)
|
|||
|
|
|||
|
# NOTE: this code assumes >> is a logical shift, not an arithmetic shift!
|
|||
|
dec = ""
|
|||
|
state = "empty" # you would use an enum or separate code paths in low-level code
|
|||
|
it = iter(enc)
|
|||
|
upper = True
|
|||
|
byte, hi, lo = None, None, None
|
|||
|
|
|||
|
def goto(state_):
|
|||
|
nonlocal going, state
|
|||
|
assert not going, "two gotos in a row (bug in code)"
|
|||
|
going, state = True, state_
|
|||
|
|
|||
|
def unmap_common(b):
|
|||
|
nonlocal upper, dec
|
|||
|
assert 0 <= b <= 11
|
|||
|
c = unmapping[b]
|
|||
|
dec += c if upper else c.lower()
|
|||
|
upper = False # upper = b >= 14 (never true)
|
|||
|
|
|||
|
def unmap_uncommon(b):
|
|||
|
nonlocal upper, dec
|
|||
|
assert 0 <= b <= 15
|
|||
|
c = unmapping[b + 12]
|
|||
|
dec += c if upper else c.lower()
|
|||
|
upper = b >= 14 # upper = c == "." or c == " "
|
|||
|
|
|||
|
def advance():
|
|||
|
nonlocal byte, hi, lo
|
|||
|
byte = next(it, None)
|
|||
|
if byte is None:
|
|||
|
hi, lo = None, None
|
|||
|
return False
|
|||
|
else:
|
|||
|
hi, lo = byte >> 4, byte & 15
|
|||
|
return True
|
|||
|
|
|||
|
def split_middle(a, b):
|
|||
|
# equivalent:
|
|||
|
# return (a << 2 | b >> 2) & 15
|
|||
|
return (a & 3) << 2 | b >> 2
|
|||
|
|
|||
|
for _ in range(99): # some big number to prevent accidental self-denial-of-service
|
|||
|
going = False # should only be for this Python implementation
|
|||
|
|
|||
|
if state == "empty":
|
|||
|
if not advance():
|
|||
|
break
|
|||
|
|
|||
|
if hi >= 12:
|
|||
|
unmap_uncommon(split_middle(hi, lo))
|
|||
|
goto("have2bits")
|
|||
|
else:
|
|||
|
unmap_common(hi)
|
|||
|
goto("lower4")
|
|||
|
|
|||
|
elif state == "have2bits": # need 4 or 2 more
|
|||
|
prev = byte
|
|||
|
if not advance():
|
|||
|
break
|
|||
|
|
|||
|
if prev & 3 == 3:
|
|||
|
unmap_uncommon(hi)
|
|||
|
goto("lower4")
|
|||
|
else:
|
|||
|
unmap_common(split_middle(prev, hi))
|
|||
|
goto("lower6")
|
|||
|
|
|||
|
elif state == "have4bits": # need 2 more
|
|||
|
prev = byte
|
|||
|
if not advance():
|
|||
|
break
|
|||
|
|
|||
|
unmap_uncommon(split_middle(prev, hi))
|
|||
|
goto("lower6")
|
|||
|
|
|||
|
elif state == "lower4":
|
|||
|
if lo >= 12:
|
|||
|
goto("have4bits")
|
|||
|
else:
|
|||
|
unmap_common(lo)
|
|||
|
goto("empty")
|
|||
|
|
|||
|
elif state == "lower6":
|
|||
|
if hi & 3 == 3:
|
|||
|
unmap_uncommon(lo)
|
|||
|
goto("empty")
|
|||
|
else:
|
|||
|
unmap_common(split_middle(hi, lo))
|
|||
|
goto("have2bits")
|
|||
|
|
|||
|
else:
|
|||
|
assert False, "impossible"
|
|||
|
|
|||
|
assert going, "must go somewhere (bug in code)"
|
|||
|
|
|||
|
return dec.rstrip(" ") # don't forget to handle this in your code too!
|
|||
|
|
|||
|
|
|||
|
def hexdump(b):
|
|||
|
return ("{:02X}" * len(b)).format(*b)
|
|||
|
|
|||
|
|
|||
|
dumb_hack = False
|
|||
|
fmt = f"{{}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}}"
|
|||
|
total = 0
|
|||
|
for li, line in enumerate(open(fp, "r")):
|
|||
|
name = line.rstrip()
|
|||
|
encoded = encode_name(name, maxlen, force=dumb_hack)
|
|||
|
decoded = decode_name(encoded, maxlen)
|
|||
|
# print(f"{len(encoded)} {name:<12} {hexdump(encoded):<12} {decoded:<12}")
|
|||
|
if dumb_hack:
|
|||
|
if name.replace("_", "").lower() == decoded.replace(" ", "").lower():
|
|||
|
print(decoded)
|
|||
|
else:
|
|||
|
print(fmt.format(len(encoded), name, hexdump(encoded), decoded))
|
|||
|
if not dumb_hack:
|
|||
|
assert name == decoded, (name, decoded)
|
|||
|
total += len(encoded)
|
|||
|
print(f"Total: {total}", file=sys.stderr)
|