Improved memset/memcpy/memmove functions by Christian Krueger.

git-svn-id: svn://svn.cc65.org/cc65/trunk@4200 b7a2c559-68d2-44c3-8de9-860c34a00d81
This commit is contained in:
uz 2009-09-20 14:32:25 +00:00
parent 2153cc46db
commit 0e91f15ea8
3 changed files with 140 additions and 89 deletions

View file

@ -1,5 +1,7 @@
; ;
; Ullrich von Bassewitz, 2003-08-20 ; Ullrich von Bassewitz, 2003-08-20
; Performance increase (about 20%) by
; Christian Krueger, 2009-09-13
; ;
; void* __fastcall__ memcpy (void* dest, const void* src, size_t n); ; void* __fastcall__ memcpy (void* dest, const void* src, size_t n);
; ;
@ -10,61 +12,69 @@
.export _memcpy, memcpy_upwards, memcpy_getparams .export _memcpy, memcpy_upwards, memcpy_getparams
.import popax .import popax
.importzp ptr1, ptr2, ptr3, tmp1 .importzp sp, ptr1, ptr2, ptr3
; ---------------------------------------------------------------------- ; ----------------------------------------------------------------------
_memcpy: _memcpy:
jsr memcpy_getparams jsr memcpy_getparams
memcpy_upwards: memcpy_upwards: ; assert Y = 0
ldy #0 ldx ptr3+1 ; Get high byte of n
ldx ptr3 ; Get low counter byte beq L2 ; Jump if zero
; Copy loop L1: .repeat 2 ; Unroll this a bit to make it faster...
lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
.endrepeat
bne L1
inc ptr1+1
inc ptr2+1
dex ; Next 256 byte block
bne L1 ; Repeat if any
@L1: inx ; Bump low counter byte ; the following section could be 10% faster if we were able to copy
beq @L3 ; Jump on overflow ; back to front - unfortunately we are forced to copy strictly from
@L2: lda (ptr1),y ; low to high since this function is also used for
sta (ptr2),y ; memmove and blocks could be overlapping!
iny ; {
bne @L1 L2: ; assert Y = 0
inc ptr1+1 ; Bump pointers ldx ptr3 ; Get the low byte of n
inc ptr2+1 beq done ; nothing left to copy?
bne @L1 ; Branch always
@L3: inc ptr3+1 ; Bump high counter byte
bne @L2
; Done. The low byte of dest is still in ptr2 L3: lda (ptr1),Y ; copy a byte
sta (ptr2),Y
iny
dex
bne L3
done: lda ptr2 ; }
ldx tmp1 ; get function result (dest)
rts done: jmp popax ; Pop ptr and return as result
; ---------------------------------------------------------------------- ; ----------------------------------------------------------------------
; Get the parameters from stack as follows: ; Get the parameters from stack as follows:
; ;
; -(size-1) --> ptr3 ; size --> ptr3
; src --> ptr1 ; src --> ptr1
; dest --> ptr2 ; dest --> ptr2
; high(dest) --> tmp1 ; First argument (dest) will remain on stack and is returned in a/x!
;
; dest is returned in a/x.
memcpy_getparams: memcpy_getparams: ; IMPORTANT! Function has to leave with Y=0!
eor #$FF sta ptr3
sta ptr3 stx ptr3+1 ; save n to ptr3
txa
eor #$FF
sta ptr3+1 ; Save -(size-1)
jsr popax ; src jsr popax
sta ptr1 sta ptr1
stx ptr1+1 stx ptr1+1 ; save src to ptr1
jsr popax ; dest
sta ptr2
stx ptr2+1 ; Save work copy
stx tmp1 ; Save for function result
rts
; save dest to ptr2
ldy #1 ; (direct stack access is three cycles faster
; (total cycle count with return))
lda (sp),y
tax
stx ptr2+1 ; save high byte of ptr2
dey ; Y = 0
lda (sp),y ; Get ptr2 low
sta ptr2
rts

View file

@ -1,5 +1,7 @@
; ;
; Ullrich von Bassewitz, 2003-08-20 ; Ullrich von Bassewitz, 2003-08-20
; Performance increase (about 20%) by
; Christian Krueger, 2009-09-13
; ;
; void* __fastcall__ memmove (void* dest, const void* src, size_t size); ; void* __fastcall__ memmove (void* dest, const void* src, size_t size);
; ;
@ -7,7 +9,7 @@
; ;
.export _memmove .export _memmove
.import memcpy_getparams, memcpy_upwards .import memcpy_getparams, memcpy_upwards, popax
.importzp ptr1, ptr2, ptr3, ptr4, tmp1 .importzp ptr1, ptr2, ptr3, ptr4, tmp1
.macpack generic .macpack generic
@ -15,9 +17,6 @@
; ---------------------------------------------------------------------- ; ----------------------------------------------------------------------
_memmove: _memmove:
sta ptr4
stx ptr4+1 ; Size -> ptr4
jsr memcpy_getparams jsr memcpy_getparams
; Check for the copy direction. If dest < src, we must copy upwards (start at ; Check for the copy direction. If dest < src, we must copy upwards (start at
@ -33,35 +32,53 @@ _memmove:
; Copy downwards. Adjust the pointers to the end of the memory regions. ; Copy downwards. Adjust the pointers to the end of the memory regions.
lda ptr1+1 lda ptr1+1
add ptr4+1 add ptr3+1
sta ptr1+1 sta ptr1+1
lda ptr2+1 lda ptr2+1
add ptr4+1 add ptr3+1
sta ptr2+1 sta ptr2+1
; Load the low offset into Y, and the counter low byte into X. ; handle fractions of a page size first
ldy ptr4 ldy ptr3 ; count, low byte
ldx ptr3 bne @entry ; something to copy?
jmp @L2 beq PageSizeCopy ; here like bra...
; Copy loop @copyByte:
lda (ptr1),y
sta (ptr2),y
@entry:
dey
bne @copyByte
lda (ptr1),y ; copy remaining byte
sta (ptr2),y
@L1: dey PageSizeCopy: ; assert Y = 0
ldx ptr3+1 ; number of pages
beq done ; none? -> done
@initBase:
dec ptr1+1 ; adjust base...
dec ptr2+1
dey ; in entry case: 0 -> FF
lda (ptr1),y ; need to copy this 'intro byte'
sta (ptr2),y ; to 'land' later on Y=0! (as a result of the '.repeat'-block!)
dey ; FF ->FE
@copyBytes:
.repeat 2 ; Unroll this a bit to make it faster...
lda (ptr1),y lda (ptr1),y
sta (ptr2),y sta (ptr2),y
dey
@L2: inx ; Bump counter low byte .endrepeat
bne @L1 @copyEntry: ; in entry case: 0 -> FF
dec ptr1+1 bne @copyBytes
dec ptr2+1 lda (ptr1),y ; Y = 0, copy last byte
inc ptr3+1 ; Bump counter high byte sta (ptr2),y
bne @L1 dex ; one page to copy less
bne @initBase ; still a page to copy?
; Done, return dest ; Done, return dest
done: lda ptr2 done: jmp popax ; Pop ptr and return as result
ldx tmp1 ; get function result (dest)
rts

View file

@ -1,9 +1,11 @@
; ;
; void* memset (void* ptr, int c, size_t n); ; void* __fastcall__ memset (void* ptr, int c, size_t n);
; void* _bzero (void* ptr, size_t n); ; void* __fastcall__ _bzero (void* ptr, size_t n);
; void bzero (void* ptr, size_t n); ; void __fastcall__ bzero (void* ptr, size_t n);
; ;
; Ullrich von Bassewitz, 29.05.1998 ; Ullrich von Bassewitz, 29.05.1998
; Performance increase (about 20%) by
; Christian Krueger, 12.09.2009
; ;
; NOTE: bzero will return its first argument as memset does. It is no problem ; NOTE: bzero will return its first argument as memset does. It is no problem
; to declare the return value as void, since it may be ignored. _bzero ; to declare the return value as void, since it may be ignored. _bzero
@ -15,57 +17,79 @@
.export _memset, _bzero, __bzero .export _memset, _bzero, __bzero
.import popax .import popax
.importzp sp, ptr1, ptr2, ptr3, tmp1 .importzp sp, ptr1, ptr2, ptr3
_bzero: _bzero:
__bzero: __bzero:
sta ptr3 sta ptr3
stx ptr3+1 ; Save n stx ptr3+1 ; Save n
lda #0 ; Fill with zeros ldx #0 ; Fill with zeros
beq common beq common
_memset: _memset:
sta ptr3 ; Save n sta ptr3 ; Save n
stx ptr3+1 stx ptr3+1
jsr popax ; Get c jsr popax ; Get c
tax
; Common stuff for memset and bzero from here ; Common stuff for memset and bzero from here
common: sta tmp1 ; Save the fill value common: ; Fill value is in X!
ldy #1 ldy #1
lda (sp),y lda (sp),y
tax sta ptr1+1 ; save high byte of ptr
dey dey ; Y = 0
lda (sp),y ; Get ptr lda (sp),y ; Get ptr
sta ptr1 sta ptr1
stx ptr1+1 ; Save work copy
lda tmp1 ; Load fill value lsr ptr3+1 ; divide number of
ldy #0 ror ptr3 ; bytes by two to increase
bcc evenCount ; speed (ptr3 = ptr3/2)
oddCount:
; y is still 0 here
txa ; restore fill value
sta (ptr1),y ; save value and increase
inc ptr1 ; dest. pointer
bne evenCount
inc ptr1+1
evenCount:
lda ptr1 ; build second pointer section
clc
adc ptr3 ; ptr2 = ptr1 + (length/2) <- ptr3
sta ptr2
lda ptr1+1
adc ptr3+1
sta ptr2+1
txa ; restore fill value
ldx ptr3+1 ; Get high byte of n ldx ptr3+1 ; Get high byte of n
beq L2 ; Jump if zero beq L2 ; Jump if zero
; Set 256 byte blocks ; Set 256/512 byte blocks
; y is still 0 here
L1: .repeat 2 ; Unroll this a bit to make it faster L1: .repeat 2 ; Unroll this a bit to make it faster
sta (ptr1),y ; Set one byte sta (ptr1),y ; Set byte in lower section
iny sta (ptr2),y ; Set byte in upper section
iny
.endrepeat .endrepeat
bne L1 bne L1
inc ptr1+1 inc ptr1+1
inc ptr2+1
dex ; Next 256 byte block dex ; Next 256 byte block
bne L1 ; Repeat if any bne L1 ; Repeat if any
; Set the remaining bytes if any ; Set the remaining bytes if any
L2: ldx ptr3 ; Get the low byte of n L2: ldy ptr3 ; Get the low byte of n
beq L9 ; Low byte is zero bne L3 ; something to set?
jmp popax ; no -> Pop ptr and return as result
L3: sta (ptr1),y ; Set one byte L3a: sta (ptr1),y ; set bytes in low
iny sta (ptr2),y ; and high section
dex ; Done? L3: dey
bne L3 bne L3a
sta (ptr1),y ; Set remaining byte(s)
L9: jmp popax ; Pop ptr and return as result sta (ptr2),y
jmp popax ; Pop ptr and return as result