diff --git a/6502_name_codec/README.md b/6502_name_codec/README.md
new file mode 100644
index 0000000..b42ee8f
--- /dev/null
+++ b/6502_name_codec/README.md
@@ -0,0 +1,7 @@
+squeeze simple north-american names into fewer bytes.
+
+i made this with NES/SNES games in mind i.e. for player-given character names in JRPGs.
+you will need [m6502.h](https://github.com/floooh/chips/blob/c011ef1/chips/m6502.h)
+to compile the test program. as of writing, v1 is the fastest, but v6 is the smallest.
+
+someday, i want to extend this to use a non-constant LUT. (hidden markov models?)
diff --git a/6502_name_codec/decode-idk.c b/6502_name_codec/decode-idk.c
new file mode 100644
index 0000000..798715f
--- /dev/null
+++ b/6502_name_codec/decode-idk.c
@@ -0,0 +1,200 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define CHIPS_IMPL
+#include "m6502.h"
+
+#define lament(...) fprintf(stderr, __VA_ARGS__)
+
+// setup 64 kBytes of memory: everything a 16-bit address can reach.
+#define MEMSIZE 65536
+static uint8_t mem[MEMSIZE] = {0};
+
+// mnemonics for all 256 opcodes, four bytes apiece (three letters and a NUL),
+// so the name of opcode N begins at offset N * 4.
+// NOTE: renamed KIL to JAM for consistency.
+static const char instrnames[] =
+"BRK\0ORA\0JAM\0SLO\0NOP\0ORA\0ASL\0SLO\0PHP\0ORA\0ASL\0ANC\0NOP\0ORA\0ASL\0SLO\0"
+"BPL\0ORA\0JAM\0SLO\0NOP\0ORA\0ASL\0SLO\0CLC\0ORA\0NOP\0SLO\0NOP\0ORA\0ASL\0SLO\0"
+"JSR\0AND\0JAM\0RLA\0BIT\0AND\0ROL\0RLA\0PLP\0AND\0ROL\0ANC\0BIT\0AND\0ROL\0RLA\0"
+"BMI\0AND\0JAM\0RLA\0NOP\0AND\0ROL\0RLA\0SEC\0AND\0NOP\0RLA\0NOP\0AND\0ROL\0RLA\0"
+"RTI\0EOR\0JAM\0SRE\0NOP\0EOR\0LSR\0SRE\0PHA\0EOR\0LSR\0ALR\0JMP\0EOR\0LSR\0SRE\0"
+"BVC\0EOR\0JAM\0SRE\0NOP\0EOR\0LSR\0SRE\0CLI\0EOR\0NOP\0SRE\0NOP\0EOR\0LSR\0SRE\0"
+"RTS\0ADC\0JAM\0RRA\0NOP\0ADC\0ROR\0RRA\0PLA\0ADC\0ROR\0ARR\0JMP\0ADC\0ROR\0RRA\0"
+"BVS\0ADC\0JAM\0RRA\0NOP\0ADC\0ROR\0RRA\0SEI\0ADC\0NOP\0RRA\0NOP\0ADC\0ROR\0RRA\0"
+"NOP\0STA\0NOP\0SAX\0STY\0STA\0STX\0SAX\0DEY\0NOP\0TXA\0XAA\0STY\0STA\0STX\0SAX\0"
+"BCC\0STA\0JAM\0AHX\0STY\0STA\0STX\0SAX\0TYA\0STA\0TXS\0TAS\0SHY\0STA\0SHX\0AHX\0"
+"LDY\0LDA\0LDX\0LAX\0LDY\0LDA\0LDX\0LAX\0TAY\0LDA\0TAX\0LAX\0LDY\0LDA\0LDX\0LAX\0"
+"BCS\0LDA\0JAM\0LAX\0LDY\0LDA\0LDX\0LAX\0CLV\0LDA\0TSX\0LAS\0LDY\0LDA\0LDX\0LAX\0"
+"CPY\0CMP\0NOP\0DCP\0CPY\0CMP\0DEC\0DCP\0INY\0CMP\0DEX\0AXS\0CPY\0CMP\0DEC\0DCP\0"
+"BNE\0CMP\0JAM\0DCP\0NOP\0CMP\0DEC\0DCP\0CLD\0CMP\0NOP\0DCP\0NOP\0CMP\0DEC\0DCP\0"
+"CPX\0SBC\0NOP\0ISC\0CPX\0SBC\0INC\0ISC\0INX\0SBC\0NOP\0SBC\0CPX\0SBC\0INC\0ISC\0"
+"BEQ\0SBC\0JAM\0ISC\0NOP\0SBC\0INC\0ISC\0SED\0SBC\0NOP\0ISC\0NOP\0SBC\0INC\0ISC";
+
+// one flag per opcode: nonzero prints as "..." in the trace, zero as "!!!".
+static const char documented[] = {
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+    1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+    0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
+    1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
+    1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
+    1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0
+};
+
+// fill all of mem from the file at path fp, which must supply MEMSIZE bytes.
+// returns 0 on success; otherwise complains on stderr and returns 66.
+static int loadmem(const char *fp) {
+    FILE *f = fopen(fp, "rb");
+    if (f == NULL) {
+        lament("Error opening file: %s: %s\n", fp, strerror(errno));
+        return 66; // EX_NOINPUT
+    }
+
+    errno = 0;
+    if (fread(mem, 1, MEMSIZE, f) != MEMSIZE) {
+        // running out of file is not an I/O error, so don't blame errno for it.
+        const char *why = ferror(f) ? strerror(errno) : "file is too short";
+        lament("Error reading %li bytes from file: %s: %s\n", (long)MEMSIZE, fp, why);
+        fclose(f); // don't leak the handle; we're already reporting a failure.
+        return 66; // EX_NOINPUT
+    }
+
+    if (fclose(f) != 0) {
+        lament("Error closing file: %s: %s\n", fp, strerror(errno));
+        return 66; // EX_NOINPUT
+    }
+    return 0;
+}
+
+// log the bus access that the CPU has just requested; called once per cycle.
+// instrs is the number of instructions begun so far.
+static void memdebug(const m6502_t *cpu, long instrs) {
+    uint64_t pins = cpu->PINS;
+    uint16_t pc = cpu->PC;
+    uint8_t instr = cpu->IR >> 3; // IR is split into the opcode (here) and...
+    int ic = cpu->IR & 7; // ...a step counter in its low three bits.
+    uint16_t addr = M6502_GET_ADDR(pins);
+    const char *mode = (pins & M6502_RW) ? "READ " : "WRITE ";
+    uint8_t value = (pins & M6502_RW) ? mem[addr] : M6502_GET_DATA(pins);
+    const char *instrname = instrnames + instr * 4;
+    const char *ok = documented[instr] ? "..." : "!!!";
+    lament("[%4li.%i:$%04X:$%02X (%s %s)] %s mem[0x%04X]=0x%02X;\n",
+        instrs, ic, pc, instr, instrname, ok, mode, addr, value);
+}
+
+// hex-dump length bytes beginning at start, sixteen per row, in the style of xxd.
+// start must point into mem, since offsets are printed relative to it.
+static void xxd(const uint8_t *start, int length) {
+    for (int row = 0; row < length; row += 16) {
+        int count = length - row < 16 ? length - row : 16;
+        printf("%08x: ", (unsigned int)(start - mem) + (unsigned int)row);
+        for (int i = 0; i < 16; i++) {
+            // blank out what's missing from a final, partial row to keep columns aligned.
+            if (i < count) {
+                printf("%02x", start[row + i]);
+            } else {
+                printf("  ");
+            }
+            if (i % 2 == 1) {
+                printf(" ");
+            }
+        }
+        printf(" ");
+        for (int i = 0; i < count; i++) {
+            uint8_t value = start[row + i];
+            if (value < 0x20 || value >= 0x7F) {
+                value = '.';
+            }
+            printf("%c", value);
+        }
+        printf("\n");
+    }
+}
+
+int main(int argc, char **argv) {
+    char *name = NULL;
+    long cycle = 0;
+    long oldcycle = 0;
+    long instrs = 0;
+    long instr_limit = 1000;
+    uint64_t pins;
+    m6502_t cpu;
+    m6502_desc_t desc = {0};
+
+    if (argc <= 0 || argv == NULL || argv[0] == NULL) {
+        lament("You've met with a terrible fate.\n");
+        return 64; // EX_USAGE
+    }
+
+    name = argv[0];
+    if (argc != 2 && argc != 3) {
+        lament("usage: %s {ram.bin} [instructions]\n", name);
+        return 64; // EX_USAGE
+    }
+
+    if (argc == 3) {
+        char *end = NULL;
+        errno = 0; // strtol never clears errno, so a stale value would look like a failure.
+        instr_limit = strtol(argv[2], &end, 0); // can be negative, i guess.
+        if (errno || end == argv[2] || *end != '\0') {
+            lament("%s: failed to parse integer: %s\n", name, argv[2]);
+            return 64; // EX_USAGE
+        }
+    }
+
+    {
+        int res = 0;
+        if ((res = loadmem(argv[1]))) return res;
+    }
+
+    // initialize the CPU
+    desc.bcd_disabled = true; // TODO: do this from instructions?
+    pins = m6502_init(&cpu, &desc);
+    for (;; cycle++) {
+        // run the CPU emulation for one tick
+        pins = m6502_tick(&cpu, pins);
+        // extract 16-bit address from pin mask
+        const uint16_t addr = M6502_GET_ADDR(pins);
+        // perform memory access
+        if (pins & M6502_RW) {
+            // a memory read
+            uint8_t value = mem[addr];
+            memdebug(&cpu, instrs);
+            M6502_SET_DATA(pins, value);
+        } else {
+            // a memory write
+            uint8_t value = M6502_GET_DATA(pins);
+            memdebug(&cpu, instrs);
+            mem[addr] = value;
+        }
+        if (cycle >= oldcycle + 8) {
+            lament("CPU is locked up!\n");
+            break;
+        }
+        if (pins & M6502_SYNC) {
+            instrs++;
+            oldcycle = cycle;
+            if (instrs >= instr_limit) break;
+        }
+    }
+
+    fflush(stdout);
+    fflush(stderr);
+    printf("cpu.PC=0x%04X, cpu.A=0x%02X, cpu.X=0x%02X, cpu.Y=0x%02X, cpu.S=0x%02X, cpu.P=0x%02X;\n",
+        cpu.PC, cpu.A, cpu.X, cpu.Y, cpu.S, cpu.P);
+    lament("exiting after %li instructions and %li cycles.\n", instrs, cycle + 1);
+    xxd(mem, 0x100);
+    return 0;
+}
diff --git a/6502_name_codec/decode.asm b/6502_name_codec/decode.asm
new file mode 100644
index 0000000..2c68310
--- /dev/null
+++
b/6502_name_codec/decode.asm @@ -0,0 +1,145 @@ +arch nes.cpu + +output "decode.bin", create +fill 65536 +origin 0 + +macro align(size) { // Align Byte Amount + while (pc() % {size}) { + db 0 + } +} + +macro nops(new_pc) { + if (pc() > {new_pc}) { + error "PC is already past the point specified" + } + while (pc() < {new_pc}) { + print "adding a byte of padding at " + print pc() + print "\n" + nop + } +} + +define version(6) + +fill 8, $02 // jams + +start: +cld // clear BCD flag +clc // clear carry +clv // clear overflow +// TODO: what i actually should be doing is an RTI +pla +pla +tax +tay +nop //php // push processor status just to advance stack a bit + +macro when_to_stop() { + if 1 { + lda $02 + cmp #(name1 & 0xFF) + beq decode_begin_next + cmp #(name2 & 0xFF) + beq decode_begin_next + cmp #(name3 & 0xFF) + beq decode_begin_next + cmp #(name4 & 0xFF) + beq decode_begin_next + cmp #(name5 & 0xFF) + beq decode_exit + } else { + // FIXME: doesn't work because output has already advanced to a null byte. + ldy #0 + lda #$20 // ascii space + eor ($00),y // load from output + beq decode_begin_next + lda $02 + cmp #(name5 & 0xFF) + beq decode_exit + } +} + +// decode subroutine arguments: +lda #(names_out) +sta $00 +lda #(names_out >> 8) +sta $01 +lda #(names) +sta $02 +lda #(names >> 8) +sta $03 +jsr decode +db $D2 // jam (D for Done, i guess) + +align(16) +// reorder() { python3 -c 's=__import__("sys").argv[1];print("".join(s[int(bin(len(s)|i)[-1:2:-1],2)] for i in range(len(s))))' "$@"; } +decode_lut0xxx: +db "ETAOINSH" +//db "EIASTNOH" +decode_lut10xx: +db "RDLU" +//db "RDLU" +if {version} >= 3 && {version} <= 6 { + db "????" +} +decode_lut11xxxx: +db "CMFPGWYBVKXJQZ. " +//db "CVGQFXY.MKWZPJB " + +//origin 0x100 - 6 * 5 * 2 +align(16) +names_out: +fill 6 * 5 * 2 + +db $FF + +origin 0x200 +names: +name0: +// FIXME: names are truncated unless they end in a padding byte! 
+db $0A,$4F,$4B,$70,$17,$FF // Elizabeth +name1: +db $C4,$A3,$6F,$FF,$FF,$FF // Mary 110001 00.10 1000 11.0110 +name2: +db $CC,$86,$13,$04,$2F,$FF // Patricia +name3: +db $EC,$B1,$06,$FF,$FF,$FF // James 111011 00.10 110001. 0000 0110. +name4: +db $83,$DC,$20,$7F,$FF,$FF // Robert +name5: + +origin 0x300 +if {version} == 1 { ; include "decode_v1.asm" +} else if {version} == 2 { ;include "decode_v2.asm" +} else if {version} == 3 { ;include "decode_v3.asm" +} else if {version} == 4 { ;include "decode_v4.asm" +} else if {version} == 5 { ;include "decode_v5.asm" +} else if {version} == 6 { ;include "decode_v6.asm" +} + +align(16) +db "DONE: " +dw done + +origin 0xFFFC +db start +db start >> 8 + +//macro revbit(variable x) { +// evaluate lo(((x&(1<<7))>>7)|((x&(1<<6))>>5)|((x&(1<<5))>>3)|((x&(1<<4))>>1)) +// evaluate hi(((x&(1<<3))<<1)|((x&(1<<2))<<3)|((x&(1<<1))<<5)|((x&(1<<0))<<7)) +// db {lo}|{hi} +//} +//macro makename(variable a, variable b, variable c, variable d, variable e) { +// revbit(a) +// revbit(b) +// revbit(c) +// revbit(d) +// revbit(e) +//} +//makename($01,$23,$45,$67,$89) + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v1.asm b/6502_name_codec/decode_v1.asm new file mode 100644 index 0000000..59ec894 --- /dev/null +++ b/6502_name_codec/decode_v1.asm @@ -0,0 +1,240 @@ +// included by decode.asm +// cycles on extra-padded "Elizabeth Mary Patricia James Robert": +// 3485-55=3430 (includes jsr and rts) +// program size: 0xFB +// instructions: 1248-17=1231 (includes jsr and rts) + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. + +decode00: // decode from offset 0, unknown code length (READS A BYTE) +ldy #$00 +lda ($02),y // load from input + +tax // stash for after branch +eor #$C0 // TODO: just use a cmp instruction? 
+and #$C0 +beq decode06 // branch when mask is fully set +// fallthru decode04 + +decode04: // decode from offset 0, 4-bit code (then 4 under) +txa +lsr +lsr +lsr +lsr +tay +lda decode_lut0xxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode40 // always branch (unless page boundary) +db $02 // jam + +decode06: // decode from offset 0, 6-bit code (then 2 under) +txa +lsr +lsr +and #$0F +tay +lda decode_lut11xxxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode60 // always branch (unless page boundary) +db $02 // jam + +decode20: // decode from offset 2, unknown code length +txa +eor #$30 +and #$30 +beq decode26 // branch when mask is fully set + +decode24: // decode from offset 2, 4-bit code (then 2 under) +txa +lsr +lsr +and #$0F +tay +lda decode_lut0xxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode60 // always branch (unless page boundary) +db $02 // jam + +decode26: // decode from offset 2, 6-bit code (then aligned) +txa +and #$0F +tay +lda decode_lut11xxxx,y +ldy #$00 +sta ($00),y // write to output + +//inc $02 // advance input +//beq die // never branch (unless page boundary) +jsr decode_advance_input + +inc $00 // advance output +//jsr decode_advance_output +bne decode00 // always branch (unless page boundary) +db $02 // jam + +decode40: // decode from offset 4, unknown code length +//inc $02 // advance input +//beq die // never branch (unless page boundary) +jsr decode_advance_input +txa +eor #$0C // TODO: just use a cmp instruction? 
+and #$0C +beq decode46 // branch when mask is fully set +// fallthru decode44 + +decode44: // decode from offset 4, 4-bit code (then aligned) +txa +and #$0F +tay +lda decode_lut0xxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode00 // always branch (unless page boundary) +db $02 // jam + +decode46: // decode from offset 4, 6-bit code (then 2 over) (READS A BYTE) +txa +and #$03 +asl +asl +sta $04 +ldy #$00 +lda ($02),y // load from input +tax +rol +rol +rol +and #$03 +ora $04 +tay +lda decode_lut11xxxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode20 // always branch (unless page boundary) +db $02 // jam + +decode60: // decode from offset 6, unknown code length (READS A BYTE) +//inc $02 // advance input +//beq die // never branch (unless page boundary) +jsr decode_advance_input +ldy #$00 +lda ($02),y // load from input +tay +txa +eor #$03 +and #$03 +beq decode66 // branch when mask is fully set +// fallthru decode64 + +decode64: // decode from offset 6, 4-bit code (then 2 over) +txa +and #$03 +asl +asl +sta $04 +tya // load in (restore) the new input +tax // and put it in X like the rest of the code expects +rol +rol +rol +and #$03 +ora $04 +tay +lda decode_lut0xxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode20_shim // always branch (unless page boundary) +db $02 // jam + +die: +db $F2 // um lammer jammy + +decode20_shim: +jmp decode20 + +decode66: // decode from offset 6, 6-bit code (then 4 over) +tya // load in (restore) the new input +tax // and put it in X like the rest of the code expects +lsr +lsr +lsr +lsr +tay +lda decode_lut11xxxx,y +ldy #$00 +sta ($00),y // write to output + +inc $00 // advance output +//jsr decode_advance_output +bne decode40 // always branch (unless page boundary) +db $02 // jam + +decode_advance_input: +// do not modify X or Y here +inc
$02 +beq die // never branch (unless page boundary) +when_to_stop() +rts + +if 0 { + decode_advance_output: + // do not modify X or Y here + lda $02 + cmp #(name1 & 0xFF) + beq decode_begin_next + cmp #(name2 & 0xFF) + beq decode_begin_next + cmp #(name3 & 0xFF) + beq decode_begin_next + cmp #(name4 & 0xFF) + beq decode_begin_next + cmp #(name5 & 0xFF) + beq decode_exit + inc $00 // advance output + //beq die // never branch (unless page boundary) + rts +} + +decode_begin_next: +//inc $00 // advance output +//beq die // never branch (unless page boundary) +pla +pla +jmp decode00 + +decode_exit: +pla +pla +rts +//db $D2 // jam (D is for Done, i guess) + +// // +done: +db $02 // jam +// // + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v2.asm b/6502_name_codec/decode_v2.asm new file mode 100644 index 0000000..288e85e --- /dev/null +++ b/6502_name_codec/decode_v2.asm @@ -0,0 +1,176 @@ +// included by decode.asm +// cycles on extra-padded "Elizabeth Mary Patricia James Robert": +// 4035-55=3980 (includes jsr and rts) +// program size: 0xCF +// instructions: 1358-17=1341 (includes jsr and rts) + +decode_advance_input: +// do not modify X or Y here +inc $02 +beq die // never branch (unless page boundary) +when_to_stop() +rts + +decode_exit: +pla +pla +rts +//db $D2 // jam (D is for Done, i guess) + +decode_common: +tay +lda decode_lut0xxx,y +ldy #$00 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +rts + +decode_uncommon: +tay +lda decode_lut11xxxx,y +ldy #$00 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +rts + +die: +db $F2 // um lammer jammy + +decode_begin_next: +//inc $00 // advance output +//beq die // never branch (unless page boundary) +pla +pla + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. 
+ +decode00: // decode from offset 0, unknown code length (READS A BYTE) +ldy #$00 +lda ($02),y // load from input + +tax // stash for after branch +eor #$C0 // TODO: just use a cmp instruction? +and #$C0 +beq decode06 // branch when mask is fully set +// fallthru decode04 + +decode04: // decode from offset 0, 4-bit code (then 4 under) +txa +lsr +lsr +lsr +lsr +jsr decode_common +bne decode40 // always branch (unless page boundary) + +decode06: // decode from offset 0, 6-bit code (then 2 under) +txa +lsr +lsr +and #$0F +jsr decode_uncommon +bne decode60 // always branch (unless page boundary) + +decode20: // decode from offset 2, unknown code length +txa +eor #$30 +and #$30 +beq decode26 // branch when mask is fully set + +decode24: // decode from offset 2, 4-bit code (then 2 under) +txa +lsr +lsr +and #$0F +jsr decode_common +bne decode60 // always branch (unless page boundary) + +decode26: // decode from offset 2, 6-bit code (then aligned) +jsr decode_advance_input +txa +and #$0F +jsr decode_uncommon +bne decode00 // always branch (unless page boundary) + +decode40: // decode from offset 4, unknown code length +//inc $02 // advance input +//beq die // never branch (unless page boundary) +jsr decode_advance_input +txa +eor #$0C // TODO: just use a cmp instruction? 
+and #$0C +beq decode46 // branch when mask is fully set +// fallthru decode44 + +decode44: // decode from offset 4, 4-bit code (then aligned) +txa +and #$0F +jsr decode_common +bne decode00 // always branch (unless page boundary) + +decode46: // decode from offset 4, 6-bit code (then 2 over) (READS A BYTE) +txa +and #$03 +asl +asl +sta $04 +ldy #$00 +lda ($02),y // load from input +tax +rol +rol +rol +and #$03 +ora $04 +jsr decode_uncommon +bne decode20 // always branch (unless page boundary) + +decode60: // decode from offset 6, unknown code length (READS A BYTE) +//inc $02 // advance input +//beq die // never branch (unless page boundary) +jsr decode_advance_input +ldy #$00 +lda ($02),y // load from input +tay +txa +eor #$03 +and #$03 +beq decode66 // branch when mask is fully set +// fallthru decode64 + +decode64: // decode from offset 6, 4-bit code (then 2 over) +txa +and #$03 +asl +asl +sta $04 +tya // load in (restore) the new input +tax // and put it in X like the rest of the code expects +rol +rol +rol +and #$03 +ora $04 +jsr decode_common +bne decode20 // always branch (unless page boundary) + +decode66: // decode from offset 6, 6-bit code (then 4 over) +tya // load in (restore) the new input +tax // and put it in X like the rest of the code expects +lsr +lsr +lsr +lsr +jsr decode_uncommon +bne decode40 // always branch (unless page boundary) + +// // +done: +db $02 // jam +// // + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v3.asm b/6502_name_codec/decode_v3.asm new file mode 100644 index 0000000..b9dbceb --- /dev/null +++ b/6502_name_codec/decode_v3.asm @@ -0,0 +1,210 @@ +// included by decode.asm +// cycles on extra-padded "Elizabeth Mary Patricia James Robert": +// with double-dec: (dec per bit) +// 7359-55=7304 (includes jsr and rts) +// program size: 0xE0 +// with single-dec: (dec per pair) +// 6759-55=6704 (includes jsr and rts) +// program size: 0xDA +// with tax reduction: +// program size: 0xD4 +// with single death: +// program
size: 0xD3 +// with TYA instead of LDA #0: +// program size: 0xD2 +// with reduced ORA: +// 6612-55=6557 (includes jsr and rts) +// program size: 0xD0 +// with stuff crammed into decode_advance: +// program size: 0xBE +// without a shim to jump back to decode_xx from decode_nextbytew: +// program size: 0xBD +// without any JMPs: +// 6564-55=6509 (includes jsr and rts) +// program size: 0xBC +// without any extraneous DBs: +// program size: 0xBB +// instructions: 2359-17=2342 (includes jsr and rts) +// with sty instead of tya + sta: +// cycles: 6466-55=6411 +// instrs: 2310-17=2293 +// program size: 0xBA (186, 22 of which are the end-of-string comparisons) + +decode_advance: +inc $02 +beq die // never branch (unless page boundary) + +when_to_stop() + +ldy #0 +lda ($02),y // load from input +tax // stash for after branch +lda #4 // pairs remaining +sta $04 // write pairs remaining +rts + +decode_exit: +pla +pla +rts + +decode_begin_next: +pla +pla + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. 
+ +lda #4 // pairs remaining +sta $04 // write pairs remaining + +ldy #0 +lda ($02),y // load from input +tax // stash for after branch + +decode_xx: // decode from offset 0, unknown code length +//tya // lda #0 +sty $05 // write data so far (nothing) + +txa +asl +bcs decode_1x + +decode_0x: +asl +tax +bcs decode_01 +// fallthru to decode_00 + +decode_00: +lda #%00000000 +bpl decode_read2 // always branch + +decode_01: +lda #%00000100 +bpl decode_read2 // always branch + +decode_1x: +asl +tax +bcs decode_11 +// fallthru to decode_10 + +decode_10: +lda #%00001000 +bpl decode_read2 // always branch + +decode_11: +lda #%00010000 +bpl decode_read4 // always branch + +die: +db $F2 + +decode_nextbytew: +jsr decode_advance +bpl decode_xx // always branch + +decode_write: +ora $05 +// decode_common stuff: +tay +lda decode_lut0xxx,y +ldy #0 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +dec $04 // decrement pairs remaining +bne decode_xx // branch if we're good, otherwise... 
+beq decode_nextbytew // (always) branch if we need more pairs + +decode_read2_and_ora: +ora $05 + +decode_read2: +sta $05 +dec $04 // decrement pairs remaining +beq decode_nextbyte2 + +decode_read2_again: +// we have at least one pair left to read from X +txa +asl +bcs decode_read2_1x + +decode_read2_0x: +asl +tax +bcs decode_read2_01 + +decode_read2_00: +lda #%00000000 +bpl decode_write // always branch + +decode_read2_01: +lda #%00000001 +bpl decode_write // always branch + +decode_read2_1x: +asl +tax +bcs decode_read2_11 + +decode_read2_10: +lda #%00000010 +bpl decode_write // always branch + +decode_read2_11: +lda #%00000011 +bpl decode_write // always branch + +decode_read4: +sta $05 +dec $04 // decrement pairs remaining +beq decode_nextbyte4 + +decode_read4_again: +// we have at least one pair left to read from X +txa +asl +bcs decode_read4_1x + +decode_read4_0x: +asl +tax +bcs decode_read4_01 + +decode_read4_00: +lda #%00000000 +bpl decode_read2_and_ora // always branch + +decode_read4_01: +lda #%00000100 +bpl decode_read2_and_ora // always branch + +decode_read4_1x: +asl +tax +bcs decode_read4_11 + +decode_read4_10: +lda #%00001000 +bpl decode_read2_and_ora // always branch + +decode_read4_11: +lda #%00001100 +bpl decode_read2_and_ora // always branch + +decode_nextbyte2: +jsr decode_advance +bpl decode_read2_again // always branch + +decode_nextbyte4: +jsr decode_advance +bpl decode_read4_again // always branch + +done: +db $F2 + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v4.asm b/6502_name_codec/decode_v4.asm new file mode 100644 index 0000000..84f0469 --- /dev/null +++ b/6502_name_codec/decode_v4.asm @@ -0,0 +1,191 @@ +// included by decode.asm +// cycles on extra-padded "Elizabeth Mary Patricia James Robert": +// without any interleaving of instructions and LUT: +// cycles: 7305-55=7250 +// instrs: 2665-17=2648 +// program size: 0x96 (150, 22 of which are the end-of-string comparisons) +// with JSRs branched-over instead of 
branched-to-and-back: +// cycles: 7293-55=7238 +// instrs: 2649-17=2632 +// program size: 0x92 (146, 22 of which are the end-of-string comparisons) +// with BPL instead of CLC+BCC: +// cycles: 7151-55=7096 +// instrs: 2578-17=2561 +// program size: 0x90 (144, 22 of which are the end-of-string comparisons) +// with interleaved instructions and LUT: +// cycles: 7151-55=7096 +// instrs: 2578-17=2561 +// program size: 0xC3 (note that this will *always* be 0xC3 with this method) + +// so like, same as v3, except +// instead of branching based on the high bits (through asl or ror), +// we mask out the two bits being used, and ORA them in through Absolute,X mode. + +// so that means, at these memory positions (possibly offset by X, on another page), +// we need to occupy a byte: +// actually, wait, in v3, the bits we branch on are always the top two. +// so what i could do is use X, both to offset to the end of the page, +// and also for each decoding case. +// X = A & %11000000 +// A = %00111100[X] + +align(0x100) +decode_ilut: // internal look-up table +db $00,$00,$00 + +decode_advance: +inc $02 // advance input +beq die // never branch (unless page boundary) +when_to_stop() +ldy #0 +lda ($02),y // load from input +tax // stash for after branch +lda #4 // pairs remaining +sta $04 // write pairs remaining +rts + +decode_exit: +pla +pla +rts + +nops(decode_ilut + 0x40) +db $04,$01,$04 + +decode_begin_next: +pla +pla + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. + +lda #4 // pairs remaining +sta $04 // write pairs remaining + +ldy #0 +lda ($02),y // load from input +tax // stash for after branch + +decode_xx: // decode from offset 0, unknown code length +// two things need to be done here: + +// 1. zp[0x05] |= {%0, %100, %1000, %10000}[A >> 6] +txa // TODO: unnecessary? 
+and #$C0 +tay +lda decode_ilut+0,y // would be ORA, but this is our first data point +sta $05 + +// 2. branch to decode_read4 if both bits were set, decode_read2 otherwise. +tya +eor #$C0 +beq decode_read4 +bne decode_read2 // always branch + +die: +db $F2 + +decode_nextbytew: +jsr decode_advance +bpl decode_xx // always branch + +decode_write: +lda $05 // TODO: unnecessary? +// decode_common stuff: +tay +lda decode_lut0xxx,y +ldy #0 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +txa +asl +asl +tax +dec $04 // decrement pairs remaining +bne decode_xx // branch if we're good, otherwise... +beq decode_nextbytew // (always) branch if we need more pairs + +nops(decode_ilut + 0x80) +db $08,$02,$08 + +decode_read2: +txa +asl +asl +tax + +dec $04 // decrement pairs remaining +bne + +jsr decode_advance ++; + +decode_read2_again: +// we have at least one pair left to read from X + +// zp[0x05] |= {%0, %1, %10, %11}[A >> 6] +txa // TODO: unnecessary? 
+and #$C0 +tay +lda $05 +ora decode_ilut+2,y +sta $05 + +bpl decode_read2 // always branch + +nops(decode_ilut + 0xC0) +db $10,$03,$0C + +done: +db $F2 + +if 0 { + align(0x100) + decode_ilut: // internal look-up table + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $04,$01, $04,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $08,$02, $08,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $10,$03, $0C,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 + db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00 +} + +// decode_ilut[0b00000000] = 0b00000000 +// decode_ilut[0b01000000] = 0b00000100 +// decode_ilut[0b10000000] = 0b00001000 +// decode_ilut[0b11000000] = 0b00010000 + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v5.asm b/6502_name_codec/decode_v5.asm new file mode 100644 index 0000000..fa09915 --- /dev/null +++ b/6502_name_codec/decode_v5.asm @@ -0,0 +1,264 @@ +// included by decode.asm +// stats on extra-padded "Elizabeth Mary Patricia James Robert": +// cycles: 7403-55=7348 +// instrs: 2702-17=2685 +// program size: 0xA3 +// with 
reorganization: (first and second program parts swapped, no need for long jump) +// cycles: 7391-55=7336 +// instrs: 2698-17=2681 +// program size: 0xA0 +// with inlined decode_nextbytew: +// 7358 2687 0xA0 +// with rearranged decode_write: +// exiting after 2678 instructions and 7351 cycles. 0xA0 +// with removed txa from decode_xx: +// exiting after 2629 instructions and 7253 cycles. 0xA0 +// with more txa/tax simplification: +// exiting after 2613 instructions and 7221 cycles. 0x9F +// without the lda $05 in decode_write: +// exiting after 2567 instructions and 7083 cycles. 0x9F +// without the sta $05 in decode_read2_again: +// exiting after 2521 instructions and 6945 cycles. 0x9F +// with one of the `txa; asl; asl; tax` blocks rearranged in branches: +// exiting after 2483 instructions and 6879 cycles. 0x9F +// with the other one too: (the one from decode_read4) +// exiting after 2461 instructions and 6841 cycles. 0xA1 +// with redundant taxes removed after those rearrangements: +// exiting after 2406 instructions and 6731 cycles. 0xA0 +// with merged decode_read2 and decode_read4: +// exiting after 2381 instructions and 6706 cycles. 
0x86 +// cycles: 6706-55=6651 +// instrs: 2381-17=2364 +// program size: 0x86 (134, 22 of which are the end-of-string comparisons) + +constant decode_alt(1) +constant very_alt(0) // see decode_v6.asm for an expansion of this idea + +align(0x100) +decode_ilut: // internal look-up table +if very_alt { + db $00 +} else { + db $00,$00,$00 +} + +// === BOUNDARY === + +decode_write: +// decode_common stuff: +tay // the finished code in A becomes the table index +lda decode_lut0xxx,y +ldy #0 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +dec $04 // decrement pairs remaining +beq + // branch if we need more pairs + +txa +asl +asl +bvc decode_xx // always branch (NOTE(review): relies on V being clear; no instruction visible here changes V, confirm when_to_stop() and the caller) + ++; +jsr decode_advance +bpl decode_xx // always branch + +// === BOUNDARY === + +if decode_alt { + decode_exit: + pla + pla + rts +} + +// === BOUNDARY === + +nops(decode_ilut + 0x20) // NOTE(review): presumably pads up to this address so the row below sits at offset $20 of the table; confirm against the nops macro +if very_alt { + db $04 +} else { + db $04,$01,$04 +} + +// === BOUNDARY === + +die: +db $F2 // $F2 is a JAM opcode on the 6502 + +// === BOUNDARY === + +decode_begin_next: +pla +pla + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. + +lda #4 // pairs remaining +sta $04 // write pairs remaining + +ldy #0 +lda ($02),y // load from input + +decode_xx: // decode from offset 0, unknown code length +// NOTE: Y is always 0 here, if that helps at all. +tax // stash for after branch + +// two things need to be done here: + +// 1. zp[0x05] |= {%0, %100, %1000, %10000}[A >> 6] +and #%11000000 +lsr // A is now $00, $20, $40 or $60: the row offset into decode_ilut +tay +lda decode_ilut+0,y // would be ORA, but this is our first data point + +if decode_alt { + bvc decode_read_either // always branch + + decode_read4_after: + if very_alt { + and #%00000011 + asl + asl + ora #%00010000 + bpl decode_read_either // always branch + } else { + and #%01111111 + // read4 + ora decode_ilut+2,y + bpl decode_read_either // always branch + } + +} else { + sta $05 // still part of 1. + + // 2. 
branch to decode_read4 if both bits were set, decode_read2 otherwise. + tya + eor #$60 + beq decode_read4 + bne decode_read2 // always branch +} + +// === BOUNDARY === + +nops(decode_ilut + 0x40) +if very_alt { + db $08 +} else { + db $08,$02,$08 +} + +// === BOUNDARY === + +macro decode_read_common() { // common between the subroutines, not WRT probability + dec $04 // decrement pairs remaining + bne + + jsr decode_advance + bpl ++ // always branch + + // we have at least one pair left to read from X + + +; + txa + asl + asl + + +; // decode_read2_again or decode_read4_again + tax // TODO: unnecessary? + if very_alt { + // zp[0x05] |= {%0, %1, %10, %11}[A >> 6] + rol + rol + rol + and #%00000011 + ora $05 + // then elsewhere, ora $05, instead of the lda $05 normally + // should be slightly faster for the read2 case + } else { + and #%11000000 + lsr + tay + lda $05 + } +} + +if decode_alt { + decode_read_either: + sta $05 // still part of 1. + + decode_read_common() + bmi decode_read4_after // taken when bit 7 of $05 is set, which only the $90 table entry provides + + // read2 + if very_alt { + ; + } else { + ora decode_ilut+1,y + } + bpl decode_write // always branch + +} else { + decode_read2: + decode_read_common() + ora decode_ilut+1,y + bpl decode_write // always branch +} + +// === BOUNDARY === + +if decode_alt { + ; // nothing else to add +} else { + decode_exit: + pla + pla + rts +} + +// === BOUNDARY === + +nops(decode_ilut + 0x60) +if decode_alt { + if very_alt { + db $90 + } else { + db $90,$03,$0C + } +} else { + db $10,$03,$0C +} + +// === BOUNDARY === + +if decode_alt { + ; // nothing else to do + +} else { + decode_read4: + decode_read_common() + ora decode_ilut+2,y + sta $05 + bpl decode_read2 // always branch +} + +// === BOUNDARY === + +decode_advance: +inc $02 // advance input +beq die // never branch (unless page boundary) +when_to_stop() +ldy #0 +lda ($02),y // load from input +ldx #4 // pairs remaining +stx $04 // write pairs remaining +rts // A = *input; X = 4; Y = 0; N is clear (last set by ldx #4), which is why a bpl after the jsr always branches + +// === BOUNDARY === + +done: +db $F2 // === 
BOUNDARY === + +// vim:ft=snes_bass diff --git a/6502_name_codec/decode_v6.asm b/6502_name_codec/decode_v6.asm new file mode 100644 index 0000000..7dbb340 --- /dev/null +++ b/6502_name_codec/decode_v6.asm @@ -0,0 +1,147 @@ +// included by decode.asm +// stats on extra-padded "Elizabeth Mary Patricia James Robert": +// exiting after 2456 instructions and 6714 cycles. 0x84 +// with a bunch of optimization: (yeah i forgot) +// exiting after 2483 instructions and 6599 cycles. 0x74 +// cycles: 6599-55=6544 +// instrs: 2483-17=2466 +// program size: 0x74 (116, 22 of which are the end-of-string comparisons) +// with decode_read4_after moved further down: +// exiting after 2459 instructions and 6527 cycles. 0x74 +// cycles: 6527-55=6472 +// instrs: 2459-17=2442 +// program size: 0x74 (116, 22 of which are the end-of-string comparisons) + +constant decode_internalize(1) + +if decode_internalize { + decode_ilut: + db "ETAOINSHRDLU????CMFPGWYBVKXJQZ. " // 32 entries: 0-11 are reached by two-pair codes, 12-15 are never indexed, 16-31 are reached by three-pair codes +} + +decode_begin_next: +pla +pla + +decode: +// NOTE: output/input pointers cannot cross page boundaries. +// that means the effective longest lengths of output/input are 256/192 bytes. + +lda #4 // pairs remaining +sta $04 // write pairs remaining + +ldy #0 +lda ($02),y // load from input + +decode_xx: +// NOTE: Y is always 0 here, if that helps at all. 
+tax // stash for after branch + +and #%11000000 +lsr +lsr +lsr +cmp #%00011000 // carry is set only when both top bits were set +ror // A = $00, $04, $08 or $8C; bit 7 flags a three-pair code +//bcc decode_read_either // always branch + +decode_read_either: +sta $05 + +dec $04 // decrement pairs remaining +bne + +jsr decode_advance +bpl decode_read_again // always branch + +// we have at least one pair left to read from X + ++; +txa +asl +asl + +decode_read_again: +tax + +rol +rol +rol +and #%00000011 // the top two bits of the byte are now in bits 0-1 +ora $05 // merge with the prefix stashed in $05 + +bmi decode_read4_after // bit 7 set: this code needs a third pair +//cmp #%00011100 +//bcs decode_read4_after + +// read2, nothing left to do, so just write it +//bpl decode_write // always branch + +decode_write: +// decode_common stuff: +tay +if decode_internalize { + lda decode_ilut,y +} else { + lda decode_lut0xxx,y +} +ldy #0 +sta ($00),y // write to output +inc $00 // advance output +beq die // never branch (unless page boundary) +dec $04 // decrement pairs remaining +beq + // branch if we need more pairs + +txa +asl +asl +bvc decode_xx // always branch (FIXME: might depend on output address) + ++; +if 1 { + jsr decode_advance + bpl decode_xx // always branch +} else { + // 1 cycle(?) slower and 1 byte larger + lda #(decode_xx - 1) >> 8 + pha + lda #(decode_xx - 1) & 0xFF + pha + // fallthru to decode_advance +} + +decode_read4_after: +and #%00000011 +asl +asl +ora #%00010000 // new prefix = $10 | (second pair << 2); bit 7 is clear again +bpl decode_read_either // always branch + +decode_advance: +inc $02 // advance input +beq die // never branch (unless page boundary) +when_to_stop() +ldy #0 +lda ($02),y // load from input +ldx #4 // pairs remaining +stx $04 // write pairs remaining +rts // A = *input; X = 4; Y = 0; N is clear (last set by ldx #4), which is why a bpl after the jsr always branches + +decode_exit: +if 1 { + // 4+4=8 cycles, 2 bytes: + pla + pla +} else { + // 3+2=5 cycles, 3 bytes, also penalty for having to set up $06 in the first place: + ldx $06 + txs +} +rts + +die: +db $F2 // $F2 is a JAM opcode on the 6502 + +done: +db $D2 // $D2 is also a JAM opcode, distinct from the one at die + +// vim:ft=snes_bass