# see ../txt/character-encoding-idea.txt import sys program, args = sys.argv[0], sys.argv[1:] if len(args) == 0 or len(args) > 2: print( f"usage: {program} {{path-to-names-list.txt}} [maximum-length]", file=sys.stderr ) sys.exit(1) fp = args[0] maxlen = int(args[1]) if len(args) == 2 else 6 # TODO: replace this with a perfect hashing function. # note that the order within each half doesn't actually matter, # so long as the first and second halves are separate. # ETAOINSRHLDC UMFPGWYBVKXJ QZ # C gets demoted: # ETAOINSRHLDU CMFPGWYBVKXJ QZ # tested against: ~py/../txt/names-usa-2021-no.txt # and also: ~/play/hash/pokemon8-alpha.txt # mapping = "ETAOINSRHLDC" + "UMFPGWYBVKXJ" + "QZ" # 718 bytes, 4151 poké (mayzner) # mapping = "ETAOINSHRCLU" + "DMFPGWYBVKXJ" + "QZ" # 725 bytes, 4132 poké (swap-D-U) mapping = "ETAOINSHRDLU" + "CMFPGWYBVKXJ" + "QZ" # 724 bytes, 4130 poké (classic) # mapping = "AERNILHTSOYC" + "DMJBUGKPVWFX" + "ZQ" # 724 bytes, 4181 poké (names-usa) # mapping = "AEORILNTSUCM" + "DGPHBKYWFVZX" + "JQ" # 736 bytes, 4113 poké (???) # mapping = "EAIORTNSLCUP" + "MDHGYBVFKWZX" + "QJ" # 738 bytes, 4108 poké (cain.txt) # mapping = "EAIONRTLSUDG" + "BCMPFHVWYKJX" + "QZ" # 732 bytes, 4124 poké (scrabble) # shakespeare: ETOAHISNRLDUMYW,FCGBP.VK';:?!-JQXZ% # mapping = "EAIORNTSLMDH" + "CUGKYPBFXZVW" + "JQ" # (usernames) # scrabble scores, for reference: # 1 point: E ×12, A ×9, I ×9, O ×8, N ×6, R ×6, T ×6, L ×4, S ×4, U ×4 # 2 points: D ×4, G ×3 # 3 points: B ×2, C ×2, M ×2, P ×2 # 4 points: F ×2, H ×2, V ×2, W ×2, Y ×2 # 5 points: K ×1 # 8 points: J ×1, X ×1 # 10 points: Q ×1, Z ×1 # "numeric" mapping: napping = "0123456789??" + "$€¥¢????????" + "??" # NOTE: still two more slots! napping = "0123456789-?" + "!@#$%^&*()_+" + "/=" # NOTE: still two more slots! # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ # still needs mapping: `~ '" ;: <> [] {} \| (14 missing) # unmapping = {i: c for i, c in enumerate(mapping)} unmapping = mapping + ". " # convert keys to one-based indices into the alphabet. mapping = {ord(c) - ord("A"): i for i, c in enumerate(mapping)} def encode_name(name, maxlen=None, force=False): bits = [] upper = True was_replaced = -99 for i, c in enumerate(name): ci = ord(c) if ci >= 65 and ci <= 90: # uppercase if not upper and not force: print(f"{fp}:{li + 1}: unexpected uppercase", file=sys.stderr) sys.exit(2) bits.append(mapping[ci - 65]) upper = False elif ci >= 97 and ci <= 122: # lowercase if upper and not force: print(f"{fp}:{li + 1}: unexpected lowercase", file=sys.stderr) sys.exit(2) upper = False bits.append(mapping[ci - 97]) elif ci in (32, 39, 44, 46): # space, apostrophe, comma, period if ci == 32: # space bits.append(27) upper = True elif ci == 46: # period bits.append(26) upper = True elif not force: print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr) sys.exit(2) elif force and ci in (95,): # underscore if i - 1 != was_replaced: bits.append(27) was_replaced = i elif not force: print(f"{fp}:{li + 1}: unsupported character: {c}", file=sys.stderr) sys.exit(2) enc = b"" everything = 0 # python version only, mostly for debugging everything_length = 0 remain = 0 length = 0 for b in bits: # remain = b if b < 12 else b - 12 + 48 # length = 4 if b < 12 else 6 if b < 12: remain <<= 4 remain |= b length += 4 # print(4, f"{bin(b)[2:]:>04}", file=sys.stderr) everything <<= 4 everything |= b everything_length += 4 else: # when b >= 12 v = b + (0b110000 - 12) remain <<= 6 remain |= v length += 6 # print(6, f"{bin(v)[2:]:>06}", file=sys.stderr) everything <<= 6 everything |= v everything_length += 6 if length >= 8: shift = length - 8 enc += bytes((remain >> shift,)) remain ^= remain >> shift << shift length -= 8 if length and maxlen is None: shift = 8 - length enc += bytes((remain << shift,)) elif length: shift = 8 - length ones = (1 << shift) - 1 enc += bytes((remain << shift | ones,)) # if maxlen is not None: # while len(enc) < maxlen: # enc += bytes((255,)) # print(everything_length, bin(everything), file=sys.stderr) if maxlen is not None and len(enc) > maxlen: print(f"{fp}:{li + 1}: name too long: {name}", file=sys.stderr) if force: enc = enc[:maxlen] else: sys.exit(2) return enc def decode_name(enc, maxlen=None): if maxlen is None: maxlen = len(enc) # NOTE: this code assumes >> is a logical shift, not an arithmetic shift! dec = "" state = "empty" # you would use an enum or separate code paths in low-level code it = iter(enc) upper = True byte, hi, lo = None, None, None def goto(state_): nonlocal going, state assert not going, "two gotos in a row (bug in code)" going, state = True, state_ def unmap_common(b): nonlocal upper, dec assert 0 <= b <= 11 c = unmapping[b] dec += c if upper else c.lower() upper = False # upper = b >= 14 (never true) def unmap_uncommon(b): nonlocal upper, dec assert 0 <= b <= 15 c = unmapping[b + 12] dec += c if upper else c.lower() upper = b >= 14 # upper = c == "." or c == " " def advance(): nonlocal byte, hi, lo byte = next(it, None) if byte is None: hi, lo = None, None return False else: hi, lo = byte >> 4, byte & 15 return True def split_middle(a, b): # equivalent: # return (a << 2 | b >> 2) & 15 return (a & 3) << 2 | b >> 2 for _ in range(99): # some big number to prevent accidental self-denial-of-service going = False # should only be for this Python implementation if state == "empty": if not advance(): break if hi >= 12: unmap_uncommon(split_middle(hi, lo)) goto("have2bits") else: unmap_common(hi) goto("lower4") elif state == "have2bits": # need 4 or 2 more prev = byte if not advance(): break if prev & 3 == 3: unmap_uncommon(hi) goto("lower4") else: unmap_common(split_middle(prev, hi)) goto("lower6") elif state == "have4bits": # need 2 more prev = byte if not advance(): break unmap_uncommon(split_middle(prev, hi)) goto("lower6") elif state == "lower4": if lo >= 12: goto("have4bits") else: unmap_common(lo) goto("empty") elif state == "lower6": if hi & 3 == 3: unmap_uncommon(lo) goto("empty") else: unmap_common(split_middle(hi, lo)) goto("have2bits") else: assert False, "impossible" assert going, "must go somewhere (bug in code)" return dec.rstrip(" ") # don't forget to handle this in your code too! def hexdump(b): return ("{:02X}" * len(b)).format(*b) dumb_hack = False fmt = f"{{}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}} {{:<{maxlen * 2}}}" total = 0 for li, line in enumerate(open(fp, "r")): name = line.rstrip() encoded = encode_name(name, maxlen, force=dumb_hack) decoded = decode_name(encoded, maxlen) # print(f"{len(encoded)} {name:<12} {hexdump(encoded):<12} {decoded:<12}") if dumb_hack: if name.replace("_", "").lower() == decoded.replace(" ", "").lower(): print(decoded) else: print(fmt.format(len(encoded), name, hexdump(encoded), decoded)) if not dumb_hack: assert name == decoded, (name, decoded) total += len(encoded) print(f"Total: {total}", file=sys.stderr)