264 lines
5.3 KiB
NASM
264 lines
5.3 KiB
NASM
// included by decode.asm
|
|
// stats on extra-padded "Elizabeth Mary Patricia James Robert":
|
|
// cycles: 7403-55=7348
|
|
// instrs: 2702-17=2685
|
|
// program size: 0xA3
|
|
// with reorganization: (first and second program parts swapped, no need for long jump)
|
|
// cycles: 7391-55=7336
|
|
// instrs: 2698-17=2681
|
|
// program size: 0xA0
|
|
// with inlined decode_nextbytew:
|
|
// 7358 2687 0xA0
|
|
// with rearranged decode_write:
|
|
// exiting after 2678 instructions and 7351 cycles. 0xA0
|
|
// with removed txa from decode_xx:
|
|
// exiting after 2629 instructions and 7253 cycles. 0xA0
|
|
// with more txa/tax simplification:
|
|
// exiting after 2613 instructions and 7221 cycles. 0x9F
|
|
// without the lda $05 in decode_write:
|
|
// exiting after 2567 instructions and 7083 cycles. 0x9F
|
|
// without the sta $05 in decode_read2_again:
|
|
// exiting after 2521 instructions and 6945 cycles. 0x9F
|
|
// with one of the `txa; asl; asl; tax` blocks rearranged in branches:
|
|
// exiting after 2483 instructions and 6879 cycles. 0x9F
|
|
// with the other one too: (the one from decode_read4)
|
|
// exiting after 2461 instructions and 6841 cycles. 0xA1
|
|
// with redundant taxes removed after those rearrangements:
|
|
// exiting after 2406 instructions and 6731 cycles. 0xA0
|
|
// with merged decode_read2 and decode_read4:
|
|
// exiting after 2381 instructions and 6706 cycles. 0x86
|
|
// cycles: 6706-55=6651
|
|
// instrs: 2381-17=2364
|
|
// program size: 0x86 (134, 22 of which are the end-of-string comparisons)
|
|
|
|
// Build-time switches tested by the `if decode_alt { ... }` / `if very_alt { ... }`
// blocks throughout this file.
// decode_alt(1): assemble the merged decode_read_either path instead of the
// separate decode_read2 / decode_read4 routines.
constant decode_alt(1)
|
|
// very_alt(1) would shrink each decode_ilut entry to a single byte and derive
// the remaining bits with rol/and/asl instead of extra table columns.
constant very_alt(0) // see decode_v6.asm for an expansion of this idea
|
|
|
|
// decode_ilut is a four-entry table whose entries sit at offsets 0x00, 0x20,
// 0x40 and 0x60 from this label; code is assembled in the gaps between them.
// It is indexed with Y = (A & %11000000) >> 1, so the top two bits of the
// current input byte select the entry.
// Columns (when very_alt is 0):
//   +0  loaded by decode_xx for the first pair of a code
//   +1  ORed in for the last pair of a code (the "read2" step)
//   +2  ORed in for the middle pair of a three-pair code (the "read4" step)
// NOTE(review): the 0x100 alignment presumably keeps the indexed reads within
// one page — confirm against align()'s definition.
align(0x100)
|
|
decode_ilut: // internal look-up table
|
|
if very_alt {
|
|
db $00
|
|
} else {
|
|
db $00,$00,$00
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_write: emit one decoded byte and continue with the next code.
// In:  A = completed index into decode_lut0xxx (zp $05 ORed with a
//          decode_ilut+1 entry by the caller)
//      X = current input byte, with the pair just consumed in bits 7-6
// Out: branches to decode_xx with A = input byte whose next pair is in bits 7-6
//      and Y = 0.
// Uses zp $00 (output pointer, low byte incremented here) and zp $04
// (bit pairs remaining in the current input byte).
decode_write:
|
|
// decode_common stuff:
|
|
tay // use the finished code as the table index
|
|
lda decode_lut0xxx,y // fetch the output byte (table is defined outside this file section)
|
|
ldy #0
|
|
sta ($00),y // write to output
|
|
inc $00 // advance output
|
|
beq die // never branch (unless page boundary)
|
|
dec $04 // decrement pairs remaining
|
|
beq + // branch if we need more pairs
|
|
|
|
// pairs remain in this byte: move the next pair up into bits 7-6
txa
|
|
asl
|
|
asl
|
|
// NOTE(review): relies on the overflow flag being clear; no visible
// instruction in this file sets it — confirm for when_to_stop() and callers.
bvc decode_xx // always branch
|
|
|
|
+;
|
|
jsr decode_advance // returns A = next input byte, X = 4, Y = 0
|
|
// the last flag-setting instruction in decode_advance is `ldx #4`, so N is clear
bpl decode_xx // always branch
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_exit (decode_alt placement): assembled in the gap before the table
// entry at decode_ilut + 0x20.
// Discards two bytes from the stack and then returns.
// NOTE(review): not referenced in the visible source; presumably jumped to from
// when_to_stop() inside decode_advance, so the two discarded bytes would be
// decode_advance's return address and the rts leaves the decoder — TODO confirm.
if decode_alt {
|
|
decode_exit:
|
|
pla
|
|
pla
|
|
rts
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_ilut entry for Y = 0x20 (top bit pair = %01).
// NOTE(review): nops() is defined elsewhere; presumably it pads up to the given
// address so this entry lands exactly at decode_ilut + 0x20 — confirm.
nops(decode_ilut + 0x20)
|
|
if very_alt {
|
|
db $04
|
|
} else {
|
|
// columns: +0 first pair (%100), +1 last pair (%01), +2 middle pair (%0100)
db $04,$01,$04
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// die: target of the `beq die` checks in decode_write and decode_advance, taken
// when the low byte of the output or input pointer wraps to zero.
// NOTE(review): $F2 is an undocumented halt (JAM/KIL) opcode on NMOS 6502;
// presumably used to stop the run — confirm against the target CPU/emulator.
die:
|
|
db $F2
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_begin_next: discard two bytes from the stack, then fall through into
// decode to restart with a fresh pair counter.
// NOTE(review): not referenced in the visible source; presumably targeted by
// when_to_stop() from inside decode_advance, in which case the discarded bytes
// are decode_advance's return address — TODO confirm.
decode_begin_next:
|
|
pla
|
|
pla
|
|
|
|
// decode: entry point. Resets the pair counter in zp $04 to 4, loads the
// first input byte through the pointer at zp $02, and falls through into
// decode_xx with A = that byte and Y = 0.
decode:
|
|
// NOTE: output/input pointers cannot cross page boundaries.
|
|
// that means the effective longest lengths of output/input are 256/192 bytes.
|
|
|
|
lda #4 // pairs remaining
|
|
sta $04 // write pairs remaining
|
|
|
|
ldy #0
|
|
lda ($02),y // load from input
|
|
|
|
// decode_xx: start decoding a new code from the bit pair in A[7:6].
// In:  A = input byte with the pair to examine in bits 7-6.
// The byte is stashed in X, the pair is turned into a decode_ilut offset in Y
// (0x00/0x20/0x40/0x60), and the column-0 table entry becomes the initial
// value destined for zp $05.
// With decode_alt, control then goes to decode_read_either; decode_read4_after
// is also placed here and is entered from decode_read_either's `bmi`.
// Without decode_alt, the code stores zp $05 and dispatches to decode_read4
// (pair == %11) or decode_read2 (anything else).
decode_xx: // decode from offset 0, unknown code length
|
|
// NOTE: Y is always 0 here, if that helps at all.
|
|
tax // stash for after branch
|
|
|
|
// two things need to be done here:
|
|
|
|
// 1. zp[0x05] |= {%0, %100, %1000, %10000}[A >> 6]
|
|
and #%11000000 // isolate the top pair
|
|
lsr // 0x00/0x40/0x80/0xC0 -> 0x00/0x20/0x40/0x60
|
|
tay
|
|
lda decode_ilut+0,y // would be ORA, but this is our first data point
|
|
|
|
if decode_alt {
|
|
// NOTE(review): relies on the overflow flag being clear; no visible
// instruction in this file sets it — confirm for when_to_stop() and callers.
bvc decode_read_either // always branch
|
|
|
|
// decode_read4_after: entered when the value loaded from zp $05 has bit 7 set
// (the $90 table entry). A holds that value and Y the offset for the pair
// that decode_read_common just examined.
decode_read4_after:
|
|
if very_alt {
|
|
// here A = zp $05 ORed with the pair in bits 1-0 (see decode_read_common)
and #%00000011
|
|
asl
|
|
asl
|
|
ora #%00010000
|
|
bpl decode_read_either // always branch
|
|
} else {
|
|
and #%01111111 // drop the bit-7 marker
|
|
// read4
|
|
ora decode_ilut+2,y // merge the middle pair's bits
|
|
// bit 7 was just cleared and the column-2 entries are at most $0C
bpl decode_read_either // always branch
|
|
}
|
|
|
|
} else {
|
|
sta $05 // still part of 1.
|
|
|
|
// 2. branch to decode_read4 if both bits were set, decode_read2 otherwise.
|
|
tya
|
|
eor #$60 // zero only when Y == 0x60, i.e. the pair was %11
|
|
beq decode_read4
|
|
bne decode_read2 // always branch
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_ilut entry for Y = 0x40 (top bit pair = %10).
// NOTE(review): nops() is defined elsewhere; presumably it pads up to the given
// address so this entry lands exactly at decode_ilut + 0x40 — confirm.
nops(decode_ilut + 0x40)
|
|
if very_alt {
|
|
db $08
|
|
} else {
|
|
// columns: +0 first pair (%1000), +1 last pair (%10), +2 middle pair (%1000)
db $08,$02,$08
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_read_common: fetch the next bit pair and prepare registers for the
// caller's table lookup.
// In:  X = current input byte with the previously consumed pair in bits 7-6
//      zp $04 = pairs remaining, zp $05 = bits gathered so far
// Out (very_alt == 0): X = input byte with the new pair in bits 7-6,
//      Y = decode_ilut offset for that pair, A = zp $05 (N/Z reflect it).
// Out (very_alt == 1): X as above, A = zp $05 ORed with the pair in bits 1-0.
// Calls decode_advance when the current input byte is exhausted.
macro decode_read_common() { // common between the subroutines, not WRT probability
|
|
dec $04 // decrement pairs remaining
|
|
bne + // pairs remain: shift the stashed byte instead of reloading
|
|
jsr decode_advance // returns A = next input byte, X = 4, Y = 0
|
|
// decode_advance's last flag-setting instruction is `ldx #4`, so N is clear
bpl ++ // always branch
|
|
|
|
// we have at least one pair left to read from X
|
|
|
|
+;
|
|
txa
|
|
asl
|
|
asl
|
|
|
|
+; // decode_read2_again or decode_read4_again
|
|
tax // TODO: unnecessary?
|
|
if very_alt {
|
|
// zp[0x05] |= {%0, %1, %10, %11}[A >> 6]
|
|
// three rotates through carry bring the old bits 7-6 down to bits 1-0
rol
|
|
rol
|
|
rol
|
|
and #%00000011
|
|
ora $05
|
|
// then elsewhere, ora $05, instead of the lda $05 normally
|
|
// should be slightly faster for the read2 case
|
|
} else {
|
|
and #%11000000 // isolate the new top pair
|
|
lsr // -> decode_ilut offset 0x00/0x20/0x40/0x60
|
|
tay
|
|
lda $05 // gathered bits; also sets N for the caller's branch
|
|
}
|
|
}
|
|
|
|
// decode_alt build: decode_read_either stores A into zp $05, reads the next
// pair via decode_read_common, then either
//   - branches to decode_read4_after when bit 7 of the gathered value is set
//     (only the $90 table entry sets it), or
//   - ORs in the column-1 entry for the new pair and goes to decode_write.
// Non-alt build: decode_read2 reads one more pair, ORs in the column-1 entry
// and goes to decode_write.
if decode_alt {
|
|
decode_read_either:
|
|
sta $05 // still part of 1.
|
|
|
|
decode_read_common()
|
|
bmi decode_read4_after // bit 7 marks a code that needs a third pair
|
|
|
|
// read2
|
|
if very_alt {
|
|
;
|
|
} else {
|
|
ora decode_ilut+1,y
|
|
}
|
|
// bit 7 is clear on this path and the column-1 entries are at most $03
bpl decode_write // always branch
|
|
|
|
} else {
|
|
decode_read2:
|
|
decode_read_common()
|
|
ora decode_ilut+1,y
|
|
bpl decode_write // always branch
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_exit (non-alt placement): with decode_alt set this routine is instead
// assembled in the gap before decode_ilut + 0x20, so nothing is emitted here.
// Discards two bytes from the stack and then returns.
// NOTE(review): presumably jumped to from when_to_stop() inside decode_advance
// so that the rts leaves the decoder — TODO confirm.
if decode_alt {
|
|
; // nothing else to add
|
|
} else {
|
|
decode_exit:
|
|
pla
|
|
pla
|
|
rts
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_ilut entry for Y = 0x60 (top bit pair = %11).
// In the decode_alt build the column-0 value is $90 rather than $10: the extra
// bit 7 is what decode_read_either's `bmi decode_read4_after` tests, and
// decode_read4_after masks it off again (non-very_alt build).
// The non-alt build detects this case with `eor #$60` in decode_xx instead.
// NOTE(review): nops() is defined elsewhere; presumably it pads up to the given
// address so this entry lands exactly at decode_ilut + 0x60 — confirm.
nops(decode_ilut + 0x60)
|
|
if decode_alt {
|
|
if very_alt {
|
|
db $90
|
|
} else {
|
|
// columns: +0 first pair (%10000 plus bit-7 marker), +1 last pair (%11), +2 middle pair (%1100)
db $90,$03,$0C
|
|
}
|
|
} else {
|
|
db $10,$03,$0C
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_read4 (non-alt build only): handles codes whose first pair was %11.
// Reads the middle pair via decode_read_common, ORs its column-2 entry into
// the gathered bits, stores the result back to zp $05 and continues in
// decode_read2 to read the final pair.
// In the decode_alt build this work is done by decode_read4_after instead.
if decode_alt {
|
|
; // nothing else to do
|
|
|
|
} else {
|
|
decode_read4:
|
|
decode_read_common()
|
|
ora decode_ilut+2,y
|
|
sta $05
|
|
// zp $05 starts at $10 or less here and the column-2 entries are at most $0C, so N is clear
bpl decode_read2 // always branch
|
|
}
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// decode_advance: step the input pointer to the next byte and reload state.
// Out: A = byte at the new input position, X = 4, Y = 0, zp $04 = 4.
//      N is clear on return (last flag-setting instruction is `ldx #4`),
//      which the callers' "always branch" bpl instructions depend on.
// Jumps to die if the low byte of the input pointer wraps to zero.
decode_advance:
|
|
inc $02 // advance input
|
|
beq die // never branch (unless page boundary)
|
|
// NOTE(review): when_to_stop() is a macro defined outside this section;
// presumably it performs the end-of-string comparisons mentioned in the header
// stats and may leave via decode_exit / decode_begin_next / done — confirm.
when_to_stop()
|
|
ldy #0
|
|
lda ($02),y // load from input
|
|
ldx #4 // pairs remaining
|
|
stx $04 // write pairs remaining
|
|
rts // A = *input; X = 4; Y = 0
|
|
|
|
// === BOUNDARY ===
|
|
|
|
// done: emits the single byte $F2 at this label.
// NOTE(review): not referenced in the visible source; presumably the target
// when_to_stop() uses once decoding has finished. $F2 is an undocumented halt
// (JAM/KIL) opcode on NMOS 6502 — confirm against the target CPU/emulator.
done:
|
|
db $F2 // === BOUNDARY ===
|
|
|
|
// vim:ft=snes_bass
|