#
#  (c) Copyright 1999 -- Anders Torger
#
#  This software is free. You can redistribute it and/or modify it under the
#  terms of the GNU General Public License as published by the Free Software
#  Foundation.
#
################################################################################
#
# This code is an optimised assembler version of the function wfir_process_int
# found in wfir.c. That function can be used as a reference to understand this
# code.
#
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
#
# wfir_process_asm127 -- one warped FIR filter step, warp factor 127/128.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm127
	.type	 wfir_process_asm127,@function
wfir_process_asm127:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir127_done   # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1127:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 127 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $127, %ecx      #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $7, %eax        # eax = (state[i+1] - z) / 128
	subl %eax, %edx      # edx = state[i] + 127/128 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1127           # loop until the counter reaches zero

	.Lwfir127_done:      # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm127,.-wfir_process_asm127

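# Below are copies of the same function, differing only in the warp factor:
# wfir_process_asmN uses the factor N/(N+1), i.e. mask N and a shift by
# log2(N+1) bits in place of the division.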

# wfir_process_asm1 -- one warped FIR filter step, warp factor 1/2.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm1
	.type	 wfir_process_asm1,@function
wfir_process_asm1:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir1_done     # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1001:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 1 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $1, %ecx        #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $1, %eax        # eax = (state[i+1] - z) / 2
	subl %eax, %edx      # edx = state[i] + 1/2 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1001           # loop until the counter reaches zero

	.Lwfir1_done:        # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm1,.-wfir_process_asm1

# wfir_process_asm3 -- one warped FIR filter step, warp factor 3/4.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm3
	.type	 wfir_process_asm3,@function
wfir_process_asm3:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir3_done     # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1003:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 3 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $3, %ecx        #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $2, %eax        # eax = (state[i+1] - z) / 4
	subl %eax, %edx      # edx = state[i] + 3/4 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1003           # loop until the counter reaches zero

	.Lwfir3_done:        # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm3,.-wfir_process_asm3

# wfir_process_asm7 -- one warped FIR filter step, warp factor 7/8.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm7
	.type	 wfir_process_asm7,@function
wfir_process_asm7:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir7_done     # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1007:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 7 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $7, %ecx        #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $3, %eax        # eax = (state[i+1] - z) / 8
	subl %eax, %edx      # edx = state[i] + 7/8 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1007           # loop until the counter reaches zero

	.Lwfir7_done:        # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm7,.-wfir_process_asm7

# wfir_process_asm15 -- one warped FIR filter step, warp factor 15/16.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm15
	.type	 wfir_process_asm15,@function
wfir_process_asm15:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir15_done    # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1015:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 15 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $15, %ecx       #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $4, %eax        # eax = (state[i+1] - z) / 16
	subl %eax, %edx      # edx = state[i] + 15/16 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1015           # loop until the counter reaches zero

	.Lwfir15_done:       # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm15,.-wfir_process_asm15

# wfir_process_asm31 -- one warped FIR filter step, warp factor 31/32.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm31
	.type	 wfir_process_asm31,@function
wfir_process_asm31:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir31_done    # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1031:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 31 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $31, %ecx       #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $5, %eax        # eax = (state[i+1] - z) / 32
	subl %eax, %edx      # edx = state[i] + 31/32 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1031           # loop until the counter reaches zero

	.Lwfir31_done:       # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm31,.-wfir_process_asm31

# wfir_process_asm63 -- one warped FIR filter step, warp factor 63/64.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm63
	.type	 wfir_process_asm63,@function
wfir_process_asm63:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir63_done    # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1063:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 63 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $63, %ecx       #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $6, %eax        # eax = (state[i+1] - z) / 64
	subl %eax, %edx      # edx = state[i] + 63/64 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1063           # loop until the counter reaches zero

	.Lwfir63_done:       # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm63,.-wfir_process_asm63

# wfir_process_asm255 -- one warped FIR filter step, warp factor 255/256.
#
# Stack arguments (addressed relative to %ebp after the prologue):
#    8(%ebp)  input sample
#   12(%ebp)  pointer to the 32-bit coefficients, coeffs[0..len-1]
#   16(%ebp)  filter length (loop count)
#   20(%ebp)  pointer to the 32-bit state values; state[i+1] is read for
#             every i < len, so len+1 elements must be readable
# Result: %edx:%eax = 64-bit sum of the signed products coeffs[i] * state[i],
#         using the state values written by this call.
# %ebx, %esi and %edi are saved on entry and restored before returning.
#
# A length <= 0 skips the loop and returns 0 with the state untouched; the
# dec/jnz loop on its own would wrap the counter and run off the arrays.
# (Treating the length as signed is safe: a count of 2^31 or more 4-byte
# elements cannot exist in a 32-bit address space.)
#
# Locals:
#  -4(%ebp) accumulator low bits
#  -8(%ebp) accumulator high bits
# -12(%ebp) loop counter storage, to free up ecx
# -16(%ebp) buffer for new internal state value, to free up registers
.globl wfir_process_asm255
	.type	 wfir_process_asm255,@function
wfir_process_asm255:
	pushl %ebp           # set up stack and save registers
	movl %esp,%ebp
	subl $20,%esp
	pushl %ebx           # saved at -24(%ebp)
	pushl %esi           # saved at -28(%ebp)
	pushl %edi           # saved at -32(%ebp)
	xorl %eax, %eax      # set accumulator to zero
	movl %eax, -4(%ebp)
	movl %eax, -8(%ebp)
	movl 8(%ebp), %ebx   # ebx = z = input sample
	movl 20(%ebp), %esi  # esi = state pointer
	movl 16(%ebp), %ecx  # ecx = filter length (loop counter)
	movl 12(%ebp), %edi  # edi = coefficients pointer
	xorl %edx, %edx      # edx:eax = 0, the result if the loop is skipped
	testl %ecx, %ecx
	jle .Lwfir255_done   # nothing to do for length <= 0
	# loop: on entry ebx = z, the value that becomes state[i]
	.align 16            # 16 byte loop entry alignment
	.L1255:
	movl (%esi), %edx    # edx = old state[i]
	movl 4(%esi), %eax   # eax = state[i+1]
	subl %ebx, %eax      # eax = state[i+1] - z
	movl %ecx, -12(%ebp) # spill loop counter to free up ecx
	addl %eax, %edx      # edx = state[i] + state[i+1] - z
	movl %eax, %ecx      # if eax is negative add 255 first, so that the
	sarl $31, %ecx       #   arithmetic shift below rounds towards zero
	andl $255, %ecx      #   like a signed division would
	addl %ecx, %eax
	movl %ebx, (%esi)    # state[i] = z
	sarl $8, %eax        # eax = (state[i+1] - z) / 256
	subl %eax, %edx      # edx = state[i] + 255/256 * (state[i+1] - z)
	movl (%edi), %eax    # eax = coeffs[i]
	movl %edx, -16(%ebp) # spill edx: it is the z of the next iteration
	movl %eax, %ecx      # ecx = coeffs[i]

	# Signed 32x32->64 multiply built from the unsigned mull: subtract z
	# from the high word if coeffs[i] < 0, and coeffs[i] if z < 0.
	mull %ebx            # edx:eax = coeffs[i] * z (unsigned)
	sarl $31, %ecx       # ecx = -1 if coeffs[i] < 0, else 0
	imul %ebx, %ecx      # ecx = -z or 0
	addl %ecx, %edx      # correct the high order bits
	movl (%edi), %ecx    # ecx = coeffs[i]
	sarl $31, %ebx       # ebx = -1 if z < 0, else 0
	imul %ecx, %ebx      # ebx = -coeffs[i] or 0
	addl %ebx, %edx      # correct the high order bits
	addl $4, %edi        # advance coefficients pointer

	movl -4(%ebp), %ebx  # ebx = accumulator low order bits
	movl -8(%ebp), %ecx  # ecx = accumulator high order bits
	addl %ebx, %eax      # add low order bits
	movl -16(%ebp), %ebx # ebx = z for the next iteration (mov keeps CF)
	adcl %ecx, %edx      # add high order bits with carry
	movl %eax, -4(%ebp)  # store new accumulator low order bits
	movl -12(%ebp), %ecx # restore loop counter
	movl %edx, -8(%ebp)  # store new accumulator high order bits

	addl $4, %esi        # advance state pointer
	decl %ecx            # decrement loop counter
	jnz .L1255           # loop until the counter reaches zero

	.Lwfir255_done:      # edx:eax holds the accumulated sum here
	movl -24(%ebp),%ebx  # restore registers and return
	movl -28(%ebp),%esi
	movl -32(%ebp),%edi
	leave
	ret
	.size	 wfir_process_asm255,.-wfir_process_asm255
