add 6502_name_codec

This commit is contained in:
Connor Olding 2022-06-07 04:03:30 +02:00
parent 760e60f814
commit 5d16339d59
9 changed files with 1564 additions and 0 deletions

View file

@ -0,0 +1,7 @@
squeeze simple north-american names into fewer bytes.
i made this with NES/SNES games in mind i.e. for player-given character names in JRPGs.
you will need [m6502.h](https://github.com/floooh/chips/blob/c011ef1/chips/m6502.h)
to compile the test program. as of writing, v1 is the fastest, but v6 is the smallest.
someday, i want to extend this to use a non-constant LUT. (hidden markov models?)

View file

@ -0,0 +1,184 @@
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define CHIPS_IMPL
#include "m6502.h"
// print a printf-style diagnostic message to stderr.
#define lament(...) fprintf(stderr, __VA_ARGS__)
// report a failure and jump to the `error:` label of the enclosing function.
// fires when `cond` is true OR when errno is non-zero, so callers are expected
// to clear errno before the first check. the caller's message is followed by
// ": " and strerror(errno) on the same line.
#define error_when(cond, ...) do { \
if ((cond) || errno) { \
lament(__VA_ARGS__); \
lament(": %s\n", strerror(errno)); \
goto error; \
} \
} while (0)
// setup 64 kBytes of memory: one byte for every possible 16-bit bus address.
#define MEMSIZE 65536
// backing store for all emulated reads and writes; filled from a file by loadmem().
static uint8_t mem[MEMSIZE] = {0};
// mnemonics for all 256 opcodes, packed as fixed 4-byte records
// (three letters plus a NUL), so the name of opcode N starts at
// instrnames + N * 4 (this is how memdebug indexes it).
// each source row holds 16 opcodes: row = high nibble, column = low nibble.
// NOTE: renamed KIL to JAM for consistency.
static const char instrnames[] =
"BRK\0ORA\0JAM\0SLO\0NOP\0ORA\0ASL\0SLO\0PHP\0ORA\0ASL\0ANC\0NOP\0ORA\0ASL\0SLO\0"
"BPL\0ORA\0JAM\0SLO\0NOP\0ORA\0ASL\0SLO\0CLC\0ORA\0NOP\0SLO\0NOP\0ORA\0ASL\0SLO\0"
"JSR\0AND\0JAM\0RLA\0BIT\0AND\0ROL\0RLA\0PLP\0AND\0ROL\0ANC\0BIT\0AND\0ROL\0RLA\0"
"BMI\0AND\0JAM\0RLA\0NOP\0AND\0ROL\0RLA\0SEC\0AND\0NOP\0RLA\0NOP\0AND\0ROL\0RLA\0"
"RTI\0EOR\0JAM\0SRE\0NOP\0EOR\0LSR\0SRE\0PHA\0EOR\0LSR\0ALR\0JMP\0EOR\0LSR\0SRE\0"
"BVC\0EOR\0JAM\0SRE\0NOP\0EOR\0LSR\0SRE\0CLI\0EOR\0NOP\0SRE\0NOP\0EOR\0LSR\0SRE\0"
"RTS\0ADC\0JAM\0RRA\0NOP\0ADC\0ROR\0RRA\0PLA\0ADC\0ROR\0ARR\0JMP\0ADC\0ROR\0RRA\0"
"BVS\0ADC\0JAM\0RRA\0NOP\0ADC\0ROR\0RRA\0SEI\0ADC\0NOP\0RRA\0NOP\0ADC\0ROR\0RRA\0"
"NOP\0STA\0NOP\0SAX\0STY\0STA\0STX\0SAX\0DEY\0NOP\0TXA\0XAA\0STY\0STA\0STX\0SAX\0"
"BCC\0STA\0JAM\0AHX\0STY\0STA\0STX\0SAX\0TYA\0STA\0TXS\0TAS\0SHY\0STA\0SHX\0AHX\0"
"LDY\0LDA\0LDX\0LAX\0LDY\0LDA\0LDX\0LAX\0TAY\0LDA\0TAX\0LAX\0LDY\0LDA\0LDX\0LAX\0"
"BCS\0LDA\0JAM\0LAX\0LDY\0LDA\0LDX\0LAX\0CLV\0LDA\0TSX\0LAS\0LDY\0LDA\0LDX\0LAX\0"
"CPY\0CMP\0NOP\0DCP\0CPY\0CMP\0DEC\0DCP\0INY\0CMP\0DEX\0AXS\0CPY\0CMP\0DEC\0DCP\0"
"BNE\0CMP\0JAM\0DCP\0NOP\0CMP\0DEC\0DCP\0CLD\0CMP\0NOP\0DCP\0NOP\0CMP\0DEC\0DCP\0"
"CPX\0SBC\0NOP\0ISC\0CPX\0SBC\0INC\0ISC\0INX\0SBC\0NOP\0SBC\0CPX\0SBC\0INC\0ISC\0"
"BEQ\0SBC\0JAM\0ISC\0NOP\0SBC\0INC\0ISC\0SED\0SBC\0NOP\0ISC\0NOP\0SBC\0INC\0ISC";
// parallel to instrnames, indexed by opcode with the same 16-per-row layout:
// 1 marks an opcode as documented, 0 as undocumented (JAM, SLO, LAX, ...).
// memdebug prints "..." for a 1 and "!!!" for a 0.
static const char documented[] = {
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0
};
// fill `mem` with exactly MEMSIZE bytes read from the file at path `fp`.
// returns 0 on success. on any failure (open, short read, close, or a
// non-zero errno) a message is printed to stderr and 66 (EX_NOINPUT) is
// returned. the stream is closed on every path, including failed reads.
static int loadmem(const char *fp) {
    FILE *f = NULL;
    long size = MEMSIZE;

    errno = 0; // error_when() also trips on a stale errno, so start clean
    f = fopen(fp, "rb");
    error_when(f == NULL, "Error opening file: %s", fp);
    error_when(fread(mem, 1, (size_t)size, f) != (size_t)size,
               "Error reading %li bytes from file: %s", size, fp);
    {
        // the stream is unusable after fclose() whether or not it succeeds,
        // so forget the handle first to keep the error path from closing twice.
        FILE *closing = f;
        f = NULL;
        error_when(fclose(closing) != 0, "Error closing file: %s", fp);
    }
    return 0;

error:
    if (f != NULL) {
        fclose(f); // best effort; the failure has already been reported
    }
    //return 65; // EX_DATAERR
    return 66; // EX_NOINPUT
}
static void memdebug(const m6502_t cpu, long instrs) {
uint64_t pins = cpu.PINS;
uint16_t pc = cpu.PC;
uint8_t instr = cpu.IR >> 3;
int ic = cpu.IR & 7;
uint16_t addr = M6502_GET_ADDR(pins);
const char *mode = (pins & M6502_RW) ? "READ " : "WRITE ";
uint8_t value = (pins & M6502_RW) ? mem[addr] : M6502_GET_DATA(pins);
const char *instrname = instrnames + instr * 4;
const char *ok = documented[instr] ? "..." : "!!!";
lament("[%4li.%i:$%04X:$%02X (%s %s)] %s mem[0x%04X]=0x%02X;\n",
instrs, ic, pc, instr, instrname, ok, mode, addr, value);
}
// hex-dump `length` bytes starting at `start` to stdout, 16 bytes per row,
// in the style of xxd(1): an 8-digit offset, eight groups of two bytes in
// hex, then the same bytes as text with non-printable ones shown as '.'.
// `start` must point into `mem`; the printed offset is relative to `mem`.
// a final row shorter than 16 bytes is printed too, with its hex column
// padded so that the text column stays aligned.
// nothing is printed when `length` is zero or negative.
static void xxd(const uint8_t *start, int length) {
    for (int offset = 0; offset < length; offset += 16) {
        const uint8_t *row = start + offset;
        const int remaining = length - offset;
        const int count = remaining < 16 ? remaining : 16;

        printf("%08x: ", (unsigned int)(row - mem));
        for (int i = 0; i < 16; i++) {
            if (i < count) {
                printf("%02x", row[i]);
            } else {
                printf("  "); // stand-in for a missing byte
            }
            if (i & 1) {
                printf(" "); // a separator after every pair of bytes
            }
        }
        printf(" ");
        for (int i = 0; i < count; i++) {
            uint8_t value = row[i];
            if (value < 0x20 || value >= 0x7F) {
                value = '.';
            }
            printf("%c", value);
        }
        printf("\n");
    }
}
// usage: prog {ram.bin} [instructions]
// loads a 64 KiB memory image, runs it on the emulated 6502 while tracing
// every bus cycle to stderr, and stops when either the instruction limit
// (default 1000) is reached or the CPU stops fetching instructions.
// afterwards the registers and the first 0x100 bytes of memory are printed
// to stdout. returns 64 (EX_USAGE) for bad arguments, loadmem()'s code when
// the image cannot be loaded, and 0 otherwise.
int main(int argc, char **argv) {
    char *name = NULL;
    long cycle = 0;
    long oldcycle = 0;
    long instrs = 0;
    long instr_limit = 1000;
    uint64_t pins;
    m6502_t cpu;
    m6502_desc_t desc = {0};

    if (argc <= 0 || argv == NULL || argv[0] == NULL) {
        lament("You've met with a terrible fate.\n");
        return 64; // EX_USAGE
    }
    name = argv[0];

    if (argc != 2 && argc != 3) {
        lament("usage: %s {ram.bin} [instructions]\n", name);
        return 64; // EX_USAGE
    }

    if (argc == 3) {
        // strtol() only sets errno for out-of-range values, so also reject
        // empty input and trailing garbage through the end pointer.
        char *end = NULL;
        errno = 0;
        instr_limit = strtol(argv[2], &end, 0); // can be negative, i guess.
        if (errno || end == argv[2] || *end != '\0') {
            lament("%s: failed to parse integer: %s\n", name, argv[2]);
            return 64; // EX_USAGE
        }
    }

    {
        int res = loadmem(argv[1]);
        if (res) {
            return res;
        }
    }

    // initialize the CPU
    desc.bcd_disabled = true; // TODO: do this from instructions?
    pins = m6502_init(&cpu, &desc);

    for (;; cycle++) {
        // run the CPU emulation for one tick
        pins = m6502_tick(&cpu, pins);
        // extract 16-bit address from pin mask
        const uint16_t addr = M6502_GET_ADDR(pins);

        // perform memory access
        if (pins & M6502_RW) {
            // a memory read
            uint8_t value = mem[addr];
            memdebug(cpu, instrs);
            M6502_SET_DATA(pins, value);
        } else {
            // a memory write (traced before the byte lands in mem)
            uint8_t value = M6502_GET_DATA(pins);
            memdebug(cpu, instrs);
            mem[addr] = value;
        }

        // eight cycles without a SYNC means no new instruction has been
        // fetched for too long; treat that as a jammed CPU.
        if (cycle >= oldcycle + 8) {
            lament("CPU is locked up!\n");
            break;
        }

        if (pins & M6502_SYNC) {
            instrs++;
            oldcycle = cycle;
            if (instrs >= instr_limit) {
                break;
            }
        }
    }

    fflush(stdout);
    fflush(stderr);

    printf("cpu.PC=0x%04X, cpu.A=0x%02X, cpu.X=0x%02X, cpu.Y=0x%02X, cpu.S=0x%02X, cpu.P=0x%02X;\n",
           cpu.PC, cpu.A, cpu.X, cpu.Y, cpu.S, cpu.P);
    // the loop is always left by `break`, before its cycle++ runs, hence the +1.
    lament("exiting after %li instructions and %li cycles.\n", instrs, cycle + 1);
    xxd(mem, 0x100);
    return 0;
}

145
6502_name_codec/decode.asm Normal file
View file

@ -0,0 +1,145 @@
// assemble for the NES CPU into a freshly created decode.bin.
// the image is filled out to 65536 bytes (the amount the test program reads
// into its emulated memory) before assembly restarts at offset 0.
arch nes.cpu
output "decode.bin", create
fill 65536
origin 0
// emit zero bytes until pc() is a multiple of {size}.
macro align(size) { // Align Byte Amount
while (pc() % {size}) {
db 0
}
}
// pad with NOP instructions until pc() reaches {new_pc}, printing a message
// at assembly time for every byte of padding added.
// raises an assembly error if pc() is already past {new_pc}.
macro nops(new_pc) {
if (pc() > {new_pc}) {
error "PC is already past the point specified"
}
while (pc() < {new_pc}) {
print "adding a byte of padding at "
print pc()
print "\n"
nop
}
}
// selects which decode_v{N}.asm gets included further down (1 through 6).
define version(6)
// addresses 0-7 hold $02, a JAM opcode, so stray execution there halts.
fill 8, $02 // jams
// entry point: the reset vector written at the end of this file points here.
start:
cld // clear BCD flag
clc // clear carry
clv // clear overflow
// TODO: what i actually should be doing is an RTI
// pull two bytes off the stack; X and Y both receive the second one.
pla
pla
tax
tay
nop //php // push processor status just to advance stack a bit
// end-of-name / end-of-input check, expanded inside each decoder's
// input-advance routine right after the input pointer ($02) is incremented.
// enabled variant (if 1): compares the low byte of the input pointer with
// the low byte of each name label. landing on name1..name4 branches to
// decode_begin_next; landing on name5 (the end of the data) branches to
// decode_exit. otherwise execution falls out of the macro with A clobbered.
// both branch targets are defined by the included decode_v{N}.asm, and being
// relative branches they must lie within range of the expansion site.
macro when_to_stop() {
if 1 {
lda $02
cmp #(name1 & 0xFF)
beq decode_begin_next
cmp #(name2 & 0xFF)
beq decode_begin_next
cmp #(name3 & 0xFF)
beq decode_begin_next
cmp #(name4 & 0xFF)
beq decode_begin_next
cmp #(name5 & 0xFF)
beq decode_exit
} else {
// disabled variant: would detect the end of a name by looking for a
// decoded space in the output instead.
// FIXME: doesn't work because output has already advanced to a null byte.
ldy #0
lda #$20 // ascii space
eor ($00),y // load from output
beq decode_begin_next
lda $02
cmp #(name5 & 0xFF)
beq decode_exit
}
}
// decode subroutine arguments, passed through zero page:
// $00/$01 = output pointer (names_out), low byte then high byte
// $02/$03 = input pointer (names), low byte then high byte
// NOTE(review): the low-byte loads pass the whole label as an 8-bit
// immediate; this presumably relies on the assembler truncating it -- confirm.
lda #(names_out)
sta $00
lda #(names_out >> 8)
sta $01
lda #(names)
sta $02
lda #(names >> 8)
sta $03
jsr decode
// reached once decode returns; $D2 is another JAM opcode, so the CPU halts here.
db $D2 // jam (D for Done, i guess)
align(16)
// character tables, one per code prefix:
// 0xxx (4 bits) -> decode_lut0xxx, 8 entries
// 10xx (4 bits) -> decode_lut10xx, 4 entries, placed directly after the first
// table so a 4-bit code indexes both from decode_lut0xxx
// 11xxxx (6 bits) -> decode_lut11xxxx, 16 entries, indexed by the low 4 bits
// versions 3 through 6 build a single 5-bit index from decode_lut0xxx
// instead, so four filler bytes are inserted to put the 6-bit table at +16.
// reorder() { python3 -c 's=__import__("sys").argv[1];print("".join(s[int(bin(len(s)|i)[-1:2:-1],2)] for i in range(len(s))))' "$@"; }
decode_lut0xxx:
db "ETAOINSH"
//db "EIASTNOH"
decode_lut10xx:
db "RDLU"
//db "RDLU"
if {version} >= 3 && {version} <= 6 {
db "????"
}
decode_lut11xxxx:
db "CMFPGWYBVKXJQZ. "
//db "CVGQFXY.MKWZPJB "
//origin 0x100 - 6 * 5 * 2
align(16)
// output buffer: 60 bytes, followed by a single $FF byte.
names_out:
fill 6 * 5 * 2
db $FF
origin 0x200
// encoded input: five names of six bytes each, codes packed from the most
// significant bit down. $FF padding is the 6-bit code 111111, which the
// table above maps to a space.
// the name1..name5 labels double as end markers for when_to_stop().
names:
name0:
// FIXME: names are truncated unless they end in a padding byte!
db $0A,$4F,$4B,$70,$17,$FF // Elizabeth
name1:
db $C4,$A3,$6F,$FF,$FF,$FF // Mary 110001 00.10 1000 11.0110
name2:
db $CC,$86,$13,$04,$2F,$FF // Patricia
name3:
db $EC,$B1,$06,$FF,$FF,$FF // James 111011 00.10 110001. 0000 0110.
name4:
db $83,$DC,$20,$7F,$FF,$FF // Robert
name5:
// the selected decoder implementation is assembled at 0x300.
origin 0x300
if {version} == 1 { ; include "decode_v1.asm"
} else if {version} == 2 { ;include "decode_v2.asm"
} else if {version} == 3 { ;include "decode_v3.asm"
} else if {version} == 4 { ;include "decode_v4.asm"
} else if {version} == 5 { ;include "decode_v5.asm"
} else if {version} == 6 { ;include "decode_v6.asm"
}
// marker string plus the address of the decoder's `done` label, placed right
// after the decoder (presumably so its end is easy to find in a hex dump).
align(16)
db "DONE: "
dw done
// 6502 reset vector: address of `start`, low byte first.
origin 0xFFFC
db start
db start >> 8
//macro revbit(variable x) {
// evaluate lo(((x&(1<<7))>>7)|((x&(1<<6))>>5)|((x&(1<<5))>>3)|((x&(1<<4))>>1))
// evaluate hi(((x&(1<<3))<<1)|((x&(1<<2))<<3)|((x&(1<<1))<<5)|((x&(1<<0))<<7))
// db {lo}|{hi}
//}
//macro makename(variable a, variable b, variable c, variable d, variable e) {
// revbit(a)
// revbit(b)
// revbit(c)
// revbit(d)
// revbit(e)
//}
//makename($01,$23,$45,$67,$89)
// vim:ft=snes_bass

View file

@ -0,0 +1,240 @@
// included by decode.asm
// cycles on extra-padded "Elizabeth Mary Patricia James Robert":
// 3485-55=3430 (includes jsr and rts)
// program size: 0xFB
// instructions: 1248-17=1231 (includes jsr and rts)
// v1: one unrolled state per (bit offset, code length) combination.
// labels are decodeXY: X = bit offset of the next code within the current
// input byte (0, 2, 4, 6); Y = code length in bits (4 or 6), or 0 when it
// still has to be determined. the X register carries the current input byte
// from state to state. $00/$01 = output pointer, $02/$03 = input pointer.
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
decode00: // decode from offset 0, unknown code length (READS A BYTE)
ldy #$00
lda ($02),y // load from input
tax // stash for after branch
// A becomes zero only when bits 7 and 6 are both set, i.e. a 6-bit code.
eor #$C0 // TODO: just use a cmp instruction?
and #$C0
beq decode06 // branch when mask is fully set
// fallthru decode04
decode04: // decode from offset 0, 4-bit code (then 4 under)
txa
// the high nibble is the table index.
lsr
lsr
lsr
lsr
tay
lda decode_lut0xxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode40 // always branch (unless page boundary)
db $02 // jam
decode06: // decode from offset 0, 6-bit code (then 2 under)
txa
// bits 5..2 are the index; the leading 11 is dropped by the mask.
lsr
lsr
and #$0F
tay
lda decode_lut11xxxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode60 // always branch (unless page boundary)
db $02 // jam
// states for a code that begins at bit 5 of the byte held in X.
decode20: // decode from offset 2, unknown code length
txa
// zero only when bits 5 and 4 are both set, i.e. a 6-bit code.
eor #$30
and #$30
beq decode26 // branch when mask is fully set
decode24: // decode from offset 2, 4-bit code (then 2 under)
txa
// bits 5..2 are the index.
lsr
lsr
and #$0F
tay
lda decode_lut0xxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode60 // always branch (unless page boundary)
db $02 // jam
decode26: // decode from offset 2, 6-bit code (then aligned)
txa
// the low nibble is the index; the code ends exactly at the byte boundary.
and #$0F
tay
lda decode_lut11xxxx,y
ldy #$00
sta ($00),y // write to output
//inc $02 // advance input
//beq die // never branch (unless page boundary)
// may not return here: see when_to_stop() inside decode_advance_input.
jsr decode_advance_input
inc $00 // advance output
//jsr decode_advance_output
bne decode00 // always branch (unless page boundary)
db $02 // jam
// states for a code that begins at bit 3 of the byte held in X.
// the input pointer is advanced up front; X still holds the previous byte.
decode40: // decode from offset 4, unknown code length
//inc $02 // advance input
//beq die // never branch (unless page boundary)
jsr decode_advance_input
txa
// zero only when bits 3 and 2 are both set, i.e. a 6-bit code.
eor #$0C // TODO: just use a cmp instruction?
and #$0C
beq decode46 // branch when mask is fully set
// fallthru decode44
decode44: // decode from offset 4, 4-bit code (then aligned)
txa
and #$0F
tay
lda decode_lut0xxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode00 // always branch (unless page boundary)
db $02 // jam
decode46: // decode from offset 4, 6-bit code (then 2 over) (READS A BYTE)
txa
// index = (low two bits of the old byte << 2) | (top two bits of the new byte)
and #$03
asl
asl
sta $04
ldy #$00
lda ($02),y // load from input
tax
// three ROLs bring bits 7 and 6 around into bits 1 and 0.
rol
rol
rol
and #$03
ora $04
tay
lda decode_lut11xxxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode20 // always branch (unless page boundary)
db $02 // jam
// states for a code that begins at bit 1 of the byte held in X and therefore
// always continues into the next input byte. the next byte is parked in Y
// while the two bits left in X are examined.
decode60: // decode from offset 6, unknown code length (READS A BYTE)
//inc $02 // advance input
//beq die // never branch (unless page boundary)
jsr decode_advance_input
ldy #$00
lda ($02),y // load from input
tay
txa
// zero only when bits 1 and 0 are both set, i.e. a 6-bit code.
eor #$03
and #$03
beq decode66 // branch when mask is fully set
// fallthru decode64
decode64: // decode from offset 6, 4-bit code (then 2 over)
txa
// index = (low two bits of the old byte << 2) | (top two bits of the new byte)
and #$03
asl
asl
sta $04
tya // load in (restore) the new input
tax // and put it in X like the rest of the code expects
rol
rol
rol
and #$03
ora $04
tay
lda decode_lut0xxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode20_shim // always branch (unless page boundary)
db $02 // jam
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2 // um lammer jammy
// decode20 is reached through a JMP from here rather than a direct branch.
decode20_shim:
jmp decode20
decode66: // decode from offset 6, 6-bit code (then 4 over)
tya // load in (restore) the new input
tax // and put it in X like the rest of the code expects
// the prefix was in the old byte; the new byte's high nibble is the index.
lsr
lsr
lsr
lsr
tay
lda decode_lut11xxxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
//jsr decode_advance_output
bne decode40 // always branch (unless page boundary)
db $02 // jam
// step the input pointer to the next byte (low byte only), jamming if it
// wraps. when_to_stop() then either falls through to the RTS or leaves via
// decode_begin_next / decode_exit, which discard this routine's return address.
// clobbers A; X and Y are preserved.
decode_advance_input:
// do not modify X or Y here
inc $02
beq die // never branch (unless page boundary)
when_to_stop()
rts
// disabled experiment: the same end-of-name test, done when advancing the output.
if 0 {
decode_advance_output:
// do not modify X or Y here
lda $02
cmp #(name1 & 0xFF)
beq decode_begin_next
cmp #(name2 & 0xFF)
beq decode_begin_next
cmp #(name3 & 0xFF)
beq decode_begin_next
cmp #(name4 & 0xFF)
beq decode_begin_next
cmp #(name5 & 0xFF)
beq decode_exit
inc $00 // advance output
//beq die // never branch (unless page boundary)
rts
}
// a name boundary was reached: drop the return address pushed by
// `jsr decode_advance_input` and restart decoding at a byte boundary.
decode_begin_next:
//inc $00 // advance output
//beq die // never branch (unless page boundary)
pla
pla
jmp decode00
// end of input: drop the same return address, then return to decode's caller.
decode_exit:
pla
pla
rts
//db $D2 // jam (D is for Done, i guess)
// //
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $02 // jam
// vim:ft=snes_bass

View file

@ -0,0 +1,176 @@
// included by decode.asm
// cycles on extra-padded "Elizabeth Mary Patricia James Robert":
// 4035-55=3980 (includes jsr and rts)
// program size: 0xCF
// instructions: 1358-17=1341 (includes jsr and rts)
// v2: the same state machine as v1, with the table lookup and output write
// factored out into subroutines to save space.
// step the input pointer (low byte only), jamming if it wraps.
// when_to_stop() either falls through to the RTS or leaves via
// decode_begin_next / decode_exit. clobbers A; X and Y are preserved.
decode_advance_input:
// do not modify X or Y here
inc $02
beq die // never branch (unless page boundary)
when_to_stop()
rts
// end of input: drop decode_advance_input's return address, then return to
// decode's caller.
decode_exit:
pla
pla
rts
//db $D2 // jam (D is for Done, i guess)
// write the character for the 4-bit code in A (index into decode_lut0xxx)
// and advance the output pointer. returns with Y = 0 and the zero flag
// clear, which the callers' "always" BNEs depend on.
decode_common:
tay
lda decode_lut0xxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
rts
// same as decode_common, but A indexes the 6-bit code table.
decode_uncommon:
tay
lda decode_lut11xxxx,y
ldy #$00
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
rts
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2 // um lammer jammy
// a name boundary was reached: drop decode_advance_input's return address
// and fall through into decode to restart at a byte boundary.
decode_begin_next:
//inc $00 // advance output
//beq die // never branch (unless page boundary)
pla
pla
// entry point (also reached by falling through from decode_begin_next).
// labels are decodeXY: X = bit offset of the next code in the current input
// byte, Y = code length in bits, or 0 when still unknown. the X register
// carries the current input byte. $00/$01 = output, $02/$03 = input.
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
decode00: // decode from offset 0, unknown code length (READS A BYTE)
ldy #$00
lda ($02),y // load from input
tax // stash for after branch
// zero only when bits 7 and 6 are both set, i.e. a 6-bit code.
eor #$C0 // TODO: just use a cmp instruction?
and #$C0
beq decode06 // branch when mask is fully set
// fallthru decode04
decode04: // decode from offset 0, 4-bit code (then 4 under)
txa
lsr
lsr
lsr
lsr
jsr decode_common
bne decode40 // always branch (unless page boundary)
decode06: // decode from offset 0, 6-bit code (then 2 under)
txa
lsr
lsr
and #$0F
jsr decode_uncommon
bne decode60 // always branch (unless page boundary)
decode20: // decode from offset 2, unknown code length
txa
// zero only when bits 5 and 4 are both set.
eor #$30
and #$30
beq decode26 // branch when mask is fully set
decode24: // decode from offset 2, 4-bit code (then 2 under)
txa
lsr
lsr
and #$0F
jsr decode_common
bne decode60 // always branch (unless page boundary)
decode26: // decode from offset 2, 6-bit code (then aligned)
// the input is advanced before the character is written; X keeps the old byte.
jsr decode_advance_input
txa
and #$0F
jsr decode_uncommon
bne decode00 // always branch (unless page boundary)
// states for codes that start in the low nibble of the byte held in X.
// the input pointer is advanced up front; X still holds the previous byte.
decode40: // decode from offset 4, unknown code length
//inc $02 // advance input
//beq die // never branch (unless page boundary)
jsr decode_advance_input
txa
// zero only when bits 3 and 2 are both set.
eor #$0C // TODO: just use a cmp instruction?
and #$0C
beq decode46 // branch when mask is fully set
// fallthru decode44
decode44: // decode from offset 4, 4-bit code (then aligned)
txa
and #$0F
jsr decode_common
bne decode00 // always branch (unless page boundary)
decode46: // decode from offset 4, 6-bit code (then 2 over) (READS A BYTE)
txa
// index = (low two bits of the old byte << 2) | (top two bits of the new byte)
and #$03
asl
asl
sta $04
ldy #$00
lda ($02),y // load from input
tax
// three ROLs bring bits 7 and 6 around into bits 1 and 0.
rol
rol
rol
and #$03
ora $04
jsr decode_uncommon
bne decode20 // always branch (unless page boundary)
decode60: // decode from offset 6, unknown code length (READS A BYTE)
//inc $02 // advance input
//beq die // never branch (unless page boundary)
jsr decode_advance_input
ldy #$00
lda ($02),y // load from input
// park the new byte in Y while the last two bits of the old one are tested.
tay
txa
eor #$03
and #$03
beq decode66 // branch when mask is fully set
// fallthru decode64
decode64: // decode from offset 6, 4-bit code (then 2 over)
txa
and #$03
asl
asl
sta $04
tya // load in (restore) the new input
tax // and put it in X like the rest of the code expects
rol
rol
rol
and #$03
ora $04
jsr decode_common
bne decode20 // always branch (unless page boundary)
decode66: // decode from offset 6, 6-bit code (then 4 over)
tya // load in (restore) the new input
tax // and put it in X like the rest of the code expects
lsr
lsr
lsr
lsr
jsr decode_uncommon
bne decode40 // always branch (unless page boundary)
// //
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $02 // jam
// //
// vim:ft=snes_bass

View file

@ -0,0 +1,210 @@
// included by decode.asm
// cycles on extra-padded "Elizabeth Mary Patricia James Robert":
// with double-dec: (dec per bit)
// 7359-55=7304 (includes jsr and rts)
// program size: 0xE0
// with single-dec: (dec per pair)
// 6759-55=6704 (includes jsr and rts)
// program size: 0xDA
// with tax reduction:
// program size: 0xD4
// with single death:
// program size: 0xD3
// with TYA instead of LDA #0:
// program size: 0xD2
// with reduced ORA:
// 6612-55=6557 (includes jsr and rts)
// program size: 0xD0
// with stuff crammed into decode_advance:
// program size: 0xBE
// without a shim to jump back to decode_xx from decode_nextbytew:
// program size: 0xBD
// without any JMPs:
// 6564-55=6509 (includes jsr and rts)
// program size: 0xBC
// without any extraneous DBs:
// program size: 0xBB
// instructions: 2359-17=2342 (includes jsr and rts)
// with sty instead of tya + sta:
// cycles: 6466-55=6411
// instrs: 2310-17=2293
// program size: 0xBA (186, 22 of which are the end-of-string comparisons)
// v3: consumes the input two bits (one "pair") at a time.
// zero page: $00/$01 = output pointer, $02/$03 = input pointer,
// $04 = pairs left in the current byte, $05 = table index built so far.
// the X register holds the not-yet-consumed bits, shifted up to the top.
//
// fetch the next input byte: advances the input pointer (jamming if the low
// byte wraps), runs the end-of-name check, then reloads X and resets the pair
// counter. returns with A = 4, X = new byte, Y = 0; the callers' "always" BPLs
// rely on the flags left by the final LDA #4.
decode_advance:
inc $02
beq die // never branch (unless page boundary)
when_to_stop()
ldy #0
lda ($02),y // load from input
tax // stash for after branch
lda #4 // pairs remaining
sta $04 // write pairs remaining
rts
// end of input: drop decode_advance's return address, then return to
// decode's caller.
decode_exit:
pla
pla
rts
// a name boundary was reached: drop decode_advance's return address and fall
// through into decode to start the next name on a fresh byte.
decode_begin_next:
pla
pla
// entry point (also reached by falling through from decode_begin_next).
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
lda #4 // pairs remaining
sta $04 // write pairs remaining
ldy #0
lda ($02),y // load from input
tax // stash for after branch
// first pair of a code. the two ASLs push its bits into carry one at a time;
// X is updated with the shifted remainder. the pair selects the high part of
// the table index and how many more pairs to read:
// 00 -> %00000, 01 -> %00100, 10 -> %01000 (one more pair)
// 11 -> %10000 (two more pairs)
decode_xx: // decode from offset 0, unknown code length
//tya // lda #0
// Y is 0 on every path into here, so this clears the index.
sty $05 // write data so far (nothing)
txa
asl
bcs decode_1x
decode_0x:
asl
tax
bcs decode_01
// fallthru to decode_00
decode_00:
lda #%00000000
bpl decode_read2 // always branch
decode_01:
lda #%00000100
bpl decode_read2 // always branch
decode_1x:
asl
tax
bcs decode_11
// fallthru to decode_10
decode_10:
lda #%00001000
bpl decode_read2 // always branch
decode_11:
lda #%00010000
bpl decode_read4 // always branch
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2
// the byte ran out exactly at the end of a code: fetch the next byte and
// start a new code.
decode_nextbytew:
jsr decode_advance
bpl decode_xx // always branch
// final step of a code: A holds the last pair (0..3). merge it into the
// index, look the character up in the combined table, write it out, and
// account for the pair just consumed.
decode_write:
ora $05
// decode_common stuff:
tay
lda decode_lut0xxx,y
ldy #0
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
dec $04 // decrement pairs remaining
bne decode_xx // branch if we're good, otherwise...
beq decode_nextbytew // (always) branch if we need more pairs
// entry used after the middle pair of a 6-bit code: fold it into the index.
decode_read2_and_ora:
ora $05
// store the index so far, account for the pair just consumed, then read the
// final pair of the code (fetching a new byte first when none are left).
decode_read2:
sta $05
dec $04 // decrement pairs remaining
beq decode_nextbyte2
decode_read2_again:
// we have at least one pair left to read from X
txa
asl
bcs decode_read2_1x
decode_read2_0x:
asl
tax
bcs decode_read2_01
decode_read2_00:
lda #%00000000
bpl decode_write // always branch
decode_read2_01:
lda #%00000001
bpl decode_write // always branch
decode_read2_1x:
asl
tax
bcs decode_read2_11
decode_read2_10:
lda #%00000010
bpl decode_write // always branch
decode_read2_11:
lda #%00000011
bpl decode_write // always branch
// middle pair of a 6-bit code: store the index so far (%10000), account for
// the pair just consumed, then read a pair as bits 3..2 of the index.
decode_read4:
sta $05
dec $04 // decrement pairs remaining
beq decode_nextbyte4
decode_read4_again:
// we have at least one pair left to read from X
txa
asl
bcs decode_read4_1x
decode_read4_0x:
asl
tax
bcs decode_read4_01
decode_read4_00:
lda #%00000000
bpl decode_read2_and_ora // always branch
decode_read4_01:
lda #%00000100
bpl decode_read2_and_ora // always branch
decode_read4_1x:
asl
tax
bcs decode_read4_11
decode_read4_10:
lda #%00001000
bpl decode_read2_and_ora // always branch
decode_read4_11:
lda #%00001100
bpl decode_read2_and_ora // always branch
// the byte ran out in the middle of a code: fetch the next one and resume.
decode_nextbyte2:
jsr decode_advance
bpl decode_read2_again // always branch
decode_nextbyte4:
jsr decode_advance
bpl decode_read4_again // always branch
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $F2
// vim:ft=snes_bass

View file

@ -0,0 +1,191 @@
// included by decode.asm
// cycles on extra-padded "Elizabeth Mary Patricia James Robert":
// without any interleaving of instructions and LUT:
// cycles: 7305-55=7250
// instrs: 2665-17=2648
// program size: 0x96 (150, 22 of which are the end-of-string comparisons)
// with JSRs branched-over instead of branched-to-and-back:
// cycles: 7293-55=7238
// instrs: 2649-17=2632
// program size: 0x92 (146, 22 of which are the end-of-string comparisons)
// with BPL instead of CLC+BCC:
// cycles: 7151-55=7096
// instrs: 2578-17=2561
// program size: 0x90 (144, 22 of which are the end-of-string comparisons)
// with interleaved instructions and LUT:
// cycles: 7151-55=7096
// instrs: 2578-17=2561
// program size: 0xC3 (note that this will *always* be 0xC3 with this method)
// so like, same as v3, except
// instead of branching based on the high bits (through asl or ror),
// we mask out the two bits being used, and ORA them in through Absolute,X mode.
// so that means, at these memory positions (possibly offset by X, on another page),
// we need to occupy a byte:
// actually, wait, in v3, the bits we branch on are always the top two.
// so what i could do is use X, both to offset to the end of the page,
// and also for each decoding case.
// X = A & %11000000
// A = %00111100[X]
// v4: like v3, but the per-pair values come from a small table indexed by
// the pair itself (A & $C0), so its four 3-byte rows sit 0x40 bytes apart
// with code assembled into the gaps. row columns:
// +0 = contribution of a code's first pair
// +1 = contribution of a code's last pair
// +2 = contribution of the middle pair of a 6-bit code
// this is the row for pair %00.
align(0x100)
decode_ilut: // internal look-up table
db $00,$00,$00
// fetch the next input byte: advances the input pointer (jamming if the low
// byte wraps), runs the end-of-name check, then reloads X and resets the pair
// counter in $04. returns with A = 4, X = new byte, Y = 0.
decode_advance:
inc $02 // advance input
beq die // never branch (unless page boundary)
when_to_stop()
ldy #0
lda ($02),y // load from input
tax // stash for after branch
lda #4 // pairs remaining
sta $04 // write pairs remaining
rts
// end of input: drop decode_advance's return address, then return to
// decode's caller.
decode_exit:
pla
pla
rts
// table row for pair %01.
nops(decode_ilut + 0x40)
db $04,$01,$04
// a name boundary was reached: drop decode_advance's return address and fall
// through into decode to start the next name on a fresh byte.
decode_begin_next:
pla
pla
// entry point. $00/$01 = output pointer, $02/$03 = input pointer,
// $04 = pairs left in the current byte, $05 = table index built so far.
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
lda #4 // pairs remaining
sta $04 // write pairs remaining
ldy #0
lda ($02),y // load from input
tax // stash for after branch
decode_xx: // decode from offset 0, unknown code length
// two things need to be done here:
// 1. zp[0x05] |= {%0, %100, %1000, %10000}[A >> 6]
txa // TODO: unnecessary?
and #$C0
tay
lda decode_ilut+0,y // would be ORA, but this is our first data point
sta $05
// 2. branch to decode_read4 if both bits were set, decode_read2 otherwise.
tya
eor #$C0
beq decode_read4
bne decode_read2 // always branch
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2
// the byte ran out exactly at the end of a code: fetch the next byte and
// start a new code.
decode_nextbytew:
jsr decode_advance
bpl decode_xx // always branch
// final step of a code: look up the character for the index in $05, write it
// out, then shift the pair just consumed out of X.
decode_write:
lda $05 // TODO: unnecessary?
// decode_common stuff:
tay
lda decode_lut0xxx,y
ldy #0
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
txa
asl
asl
tax
dec $04 // decrement pairs remaining
bne decode_xx // branch if we're good, otherwise...
beq decode_nextbytew // (always) branch if we need more pairs
// table row for pair %10.
nops(decode_ilut + 0x80)
db $08,$02,$08
// read the last pair of a code: shift the previous pair out of X, fetch a new
// byte if that was the final pair of this one, then OR in column +1 of the
// table row selected by the new top pair.
decode_read2:
txa
asl
asl
tax
dec $04 // decrement pairs remaining
bne +
jsr decode_advance
+;
decode_read2_again:
// we have at least one pair left to read from X
// zp[0x05] |= {%0, %1, %10, %11}[A >> 6]
txa // TODO: unnecessary?
and #$C0
tay
lda $05
ora decode_ilut+1,y
sta $05
bpl decode_write // always branch
// read the middle pair of a 6-bit code: the same steps, using column +2.
decode_read4:
txa
asl
asl
tax
dec $04 // decrement pairs remaining
bne +
jsr decode_advance
+;
// we have at least one pair left to read from X
// zp[0x05] |= {%0, %100, %1000, %1100}[A >> 6]
txa // TODO: unnecessary?
and #$C0
tay
lda $05
ora decode_ilut+2,y
sta $05
bpl decode_read2 // always branch
// table row for pair %11.
nops(decode_ilut + 0xC0)
db $10,$03,$0C
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $F2
// disabled reference copy of decode_ilut as one plain 256-byte table: the
// non-zero entries sit at offsets 0x40, 0x80 and 0xC0, the same rows that the
// live code above interleaves with instructions.
if 0 {
align(0x100)
decode_ilut: // internal look-up table
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $04,$01, $04,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $08,$02, $08,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $10,$03, $0C,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
db $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00, $00,$00
}
// first-pair column of the table, by index:
// decode_ilut[0b00000000] = 0b00000000
// decode_ilut[0b01000000] = 0b00000100
// decode_ilut[0b10000000] = 0b00001000
// decode_ilut[0b11000000] = 0b00010000
// vim:ft=snes_bass

View file

@ -0,0 +1,264 @@
// included by decode.asm
// stats on extra-padded "Elizabeth Mary Patricia James Robert":
// cycles: 7403-55=7348
// instrs: 2702-17=2685
// program size: 0xA3
// with reorganization: (first and second program parts swapped, no need for long jump)
// cycles: 7391-55=7336
// instrs: 2698-17=2681
// program size: 0xA0
// with inlined decode_nextbytew:
// 7358 2687 0xA0
// with rearranged decode_write:
// exiting after 2678 instructions and 7351 cycles. 0xA0
// with removed txa from decode_xx:
// exiting after 2629 instructions and 7253 cycles. 0xA0
// with more txa/tax simplification:
// exiting after 2613 instructions and 7221 cycles. 0x9F
// without the lda $05 in decode_write:
// exiting after 2567 instructions and 7083 cycles. 0x9F
// without the sta $05 in decode_read2_again:
// exiting after 2521 instructions and 6945 cycles. 0x9F
// with one of the `txa; asl; asl; tax` blocks rearranged in branches:
// exiting after 2483 instructions and 6879 cycles. 0x9F
// with the other one too: (the one from decode_read4)
// exiting after 2461 instructions and 6841 cycles. 0xA1
// with redundant taxes removed after those rearrangements:
// exiting after 2406 instructions and 6731 cycles. 0xA0
// with merged decode_read2 and decode_read4:
// exiting after 2381 instructions and 6706 cycles. 0x86
// cycles: 6706-55=6651
// instrs: 2381-17=2364
// program size: 0x86 (134, 22 of which are the end-of-string comparisons)
// v5: v4 with the table index shifted right once, so the rows of the
// interleaved table sit 0x20 bytes apart instead of 0x40.
// decode_alt = 1 merges the read2/read4 paths into decode_read_either and
// marks 6-bit codes with bit 7 of the first-pair value ($90 instead of $10).
constant decode_alt(1)
constant very_alt(0) // see decode_v6.asm for an expansion of this idea
align(0x100)
// table row for pair %00 (columns: first pair, last pair, middle pair).
decode_ilut: // internal look-up table
if very_alt {
db $00
} else {
db $00,$00,$00
}
// === BOUNDARY ===
// final step of a code: A holds the finished table index. write the
// character, then either shift the consumed pair out of X (passing the
// result to decode_xx in A) or fetch the next input byte.
decode_write:
// decode_common stuff:
tay
lda decode_lut0xxx,y
ldy #0
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
dec $04 // decrement pairs remaining
beq + // branch if we need more pairs
txa
asl
asl
// NOTE(review): relies on the overflow flag still being clear from the CLV
// at start; nothing visible in this file sets it -- confirm.
bvc decode_xx // always branch
+;
jsr decode_advance
bpl decode_xx // always branch
// === BOUNDARY ===
// end of input: drop decode_advance's return address, then return to
// decode's caller.
if decode_alt {
decode_exit:
pla
pla
rts
}
// === BOUNDARY ===
// table row for pair %01.
nops(decode_ilut + 0x20)
if very_alt {
db $04
} else {
db $04,$01,$04
}
// === BOUNDARY ===
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2
// === BOUNDARY ===
// a name boundary was reached: drop decode_advance's return address and fall
// through into decode to start the next name on a fresh byte.
decode_begin_next:
pla
pla
// entry point. $00/$01 = output pointer, $02/$03 = input pointer,
// $04 = pairs left in the current byte, $05 = table index built so far.
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
lda #4 // pairs remaining
sta $04 // write pairs remaining
ldy #0
lda ($02),y // load from input
// first pair of a code; expects the unconsumed input bits in A.
decode_xx: // decode from offset 0, unknown code length
// NOTE: Y is always 0 here, if that helps at all.
tax // stash for after branch
// two things need to be done here:
// 1. zp[0x05] |= {%0, %100, %1000, %10000}[A >> 6]
and #%11000000
lsr
tay
lda decode_ilut+0,y // would be ORA, but this is our first data point
if decode_alt {
bvc decode_read_either // always branch
// reached from decode_read_either when bit 7 of the index marks a 6-bit
// code: clear the marker and add the middle pair's contribution.
decode_read4_after:
if very_alt {
and #%00000011
asl
asl
ora #%00010000
bpl decode_read_either // always branch
} else {
and #%01111111
// read4
ora decode_ilut+2,y
bpl decode_read_either // always branch
}
} else {
sta $05 // still part of 1.
// 2. branch to decode_read4 if both bits were set, decode_read2 otherwise.
tya
eor #$60
beq decode_read4
bne decode_read2 // always branch
}
// === BOUNDARY ===
// table row for pair %10.
nops(decode_ilut + 0x40)
if very_alt {
db $08
} else {
db $08,$02,$08
}
// === BOUNDARY ===
// move on to the next pair: fetch a new byte when the current one is used
// up, otherwise shift the consumed pair out. X is reloaded with the
// unconsumed bits. in the default (non-very_alt) form this ends with
// Y = table row offset for the new pair and A = index so far from $05.
macro decode_read_common() { // common between the subroutines, not WRT probability
dec $04 // decrement pairs remaining
bne +
jsr decode_advance
bpl ++ // always branch
// we have at least one pair left to read from X
+;
txa
asl
asl
+; // decode_read2_again or decode_read4_again
tax // TODO: unnecessary?
if very_alt {
// zp[0x05] |= {%0, %1, %10, %11}[A >> 6]
rol
rol
rol
and #%00000011
ora $05
// then elsewhere, ora $05, instead of the lda $05 normally
// should be slightly faster for the read2 case
} else {
and #%11000000
lsr
tay
lda $05
}
}
if decode_alt {
// merged read path: store the index so far and read the next pair. a set
// bit 7 means the middle pair of a 6-bit code is still due
// (decode_read4_after); otherwise this is the last pair and the character
// can be written.
decode_read_either:
sta $05 // still part of 1.
decode_read_common()
bmi decode_read4_after
// read2
if very_alt {
;
} else {
ora decode_ilut+1,y
}
bpl decode_write // always branch
} else {
decode_read2:
decode_read_common()
ora decode_ilut+1,y
bpl decode_write // always branch
}
// === BOUNDARY ===
// without decode_alt, decode_exit is placed here instead of near the top.
if decode_alt {
; // nothing else to add
} else {
decode_exit:
pla
pla
rts
}
// === BOUNDARY ===
// table row for pair %11. with decode_alt the first-pair entry is $90:
// index %10000 plus bit 7 as the "6-bit code" marker.
nops(decode_ilut + 0x60)
if decode_alt {
if very_alt {
db $90
} else {
db $90,$03,$0C
}
} else {
db $10,$03,$0C
}
// === BOUNDARY ===
if decode_alt {
; // nothing else to do
} else {
decode_read4:
decode_read_common()
ora decode_ilut+2,y
sta $05
bpl decode_read2 // always branch
}
// === BOUNDARY ===
// fetch the next input byte: advances the input pointer (jamming if the low
// byte wraps), runs the end-of-name check, then loads the byte into A and
// resets the pair counter in $04 through X.
decode_advance:
inc $02 // advance input
beq die // never branch (unless page boundary)
when_to_stop()
ldy #0
lda ($02),y // load from input
ldx #4 // pairs remaining
stx $04 // write pairs remaining
rts // A = *input; X = 4; Y = 0
// === BOUNDARY ===
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $F2 // === BOUNDARY ===
// vim:ft=snes_bass

View file

@ -0,0 +1,147 @@
// included by decode.asm
// stats on extra-padded "Elizabeth Mary Patricia James Robert":
// exiting after 2456 instructions and 6714 cycles. 0x84
// with a bunch of optimization: (yeah i forgot)
// exiting after 2483 instructions and 6599 cycles. 0x74
// cycles: 6599-55=6544
// instrs: 2483-17=2466
// program size: 0x74 (116, 22 of which are the end-of-string comparisons)
// with decode_read4_after moved further down:
// exiting after 2459 instructions and 6527 cycles. 0x74
// cycles: 6527-55=6472
// instrs: 2459-17=2442
// program size: 0x74 (116, 22 of which are the end-of-string comparisons)
// v6: computes the table index arithmetically instead of through a helper
// table. zero page: $00/$01 = output pointer, $02/$03 = input pointer,
// $04 = pairs left in the current byte, $05 = table index built so far.
// decode_internalize = 1 uses a private copy of the 32-entry character table.
constant decode_internalize(1)
if decode_internalize {
decode_ilut:
db "ETAOINSHRDLU????CMFPGWYBVKXJQZ. "
}
// a name boundary was reached: drop decode_advance's return address and fall
// through into decode to start the next name on a fresh byte.
decode_begin_next:
pla
pla
// entry point.
decode:
// NOTE: output/input pointers cannot cross page boundaries.
// that means the effective longest lengths of output/input are 256/192 bytes.
lda #4 // pairs remaining
sta $04 // write pairs remaining
ldy #0
lda ($02),y // load from input
// first pair of a code; expects the unconsumed input bits in A.
decode_xx:
// NOTE: Y is always 0 here, if that helps at all.
tax // stash for after branch
// pair -> 0, 8, 16 or 24. the CMP sets carry only for 24 (pair %11), and the
// ROR halves the value while moving that carry into bit 7:
// %00 -> $00, %01 -> $04, %10 -> $08, %11 -> $8C (bit 7 marks a 6-bit code)
and #%11000000
lsr
lsr
lsr
cmp #%00011000
ror
//bcc decode_read_either // always branch
// store the index so far, then move on to the next pair: fetch a new byte
// when the current one is used up, otherwise shift the consumed pair out.
decode_read_either:
sta $05
dec $04 // decrement pairs remaining
bne +
jsr decode_advance
bpl decode_read_again // always branch
// we have at least one pair left to read from X
+;
txa
asl
asl
decode_read_again:
tax
// three ROLs bring bits 7 and 6 around into bits 1 and 0.
rol
rol
rol
and #%00000011
ora $05
// bit 7 set: this was the middle pair of a 6-bit code, one more to read.
bmi decode_read4_after
//cmp #%00011100
//bcs decode_read4_after
// read2, nothing left to do, so just write it
//bpl decode_write // always branch
// final step of a code: A holds the finished table index. write the
// character, then either shift the consumed pair out of X (passing the
// result to decode_xx in A) or fetch the next input byte.
decode_write:
// decode_common stuff:
tay
if decode_internalize {
lda decode_ilut,y
} else {
lda decode_lut0xxx,y
}
ldy #0
sta ($00),y // write to output
inc $00 // advance output
beq die // never branch (unless page boundary)
dec $04 // decrement pairs remaining
beq + // branch if we need more pairs
txa
asl
asl
bvc decode_xx // always branch (FIXME: might depend on output address)
+;
if 1 {
jsr decode_advance
bpl decode_xx // always branch
} else {
// 1 cycle(?) slower and 1 byte larger
lda #(decode_xx - 1) >> 8
pha
lda #(decode_xx - 1) & 0xFF
pha
// fallthru to decode_advance
}
// middle pair of a 6-bit code: keep only the pair just read, move it to
// bits 3..2 and set bit 4, giving %1xx00 with the marker bit cleared; then
// go read the last pair.
decode_read4_after:
and #%00000011
asl
asl
ora #%00010000
bpl decode_read_either // always branch
// fetch the next input byte: advances the input pointer (jamming if the low
// byte wraps), runs the end-of-name check, then loads the byte into A and
// resets the pair counter in $04 through X.
decode_advance:
inc $02 // advance input
beq die // never branch (unless page boundary)
when_to_stop()
ldy #0
lda ($02),y // load from input
ldx #4 // pairs remaining
stx $04 // write pairs remaining
rts // A = *input; X = 4; Y = 0
// end of input: drop decode_advance's return address, then return to
// decode's caller.
decode_exit:
if 1 {
// 4+4=8 cycles, 2 bytes:
pla
pla
} else {
// 3+2=5 cycles, 3 bytes, also penalty for having to set up $06 in the first place:
ldx $06
txs
}
rts
// shared trap for pointer wrap-around ($F2 is a JAM opcode).
die:
db $F2
// end-of-decoder marker; decode.asm stores this address after "DONE: ".
done:
db $D2
// vim:ft=snes_bass