; ZeroMemory implemented with SSE

      .686                      ; create 32 bit code
	  .mmx
	  .xmm                     
      .model flat, stdcall      ; 32 bit memory model
      option casemap :none      ; case sensitive

    .code

; ml -c /omf ad.asm
; Microsoft (R) Macro Assembler Version 10.00.30319.01
; Copyright (C) Microsoft Corporation.  All rights reserved.

;-----------------------------------------------------------------------
; void _align16_zero_memory(unsigned _byte_num, void *_source)
;
; Zero _byte_num bytes starting at _source using 16-byte SSE2 stores.
;
; ABI:   32-bit C calling convention; arguments are read straight from
;        the stack because the prologue/epilogue are disabled.
; In:    [esp+4] = _byte_num
;        [esp+8] = _source; must be 16-byte aligned whenever
;                  _byte_num >= 16, because movdqa faults on a
;                  misaligned address
; Out:   eax = 0
; Clobb: xmm0, flags (DF is cleared); edi and ecx are saved and restored
;
; Work is split in three stages: 128-byte blocks in main_loop, then
; 0..7 remaining 16-byte chunks through a jump table, then 0..15
; trailing bytes with rep stosb.
;-----------------------------------------------------------------------
_align16_zero_memory proc C  _byte_num, _source
		option prologue:none, epilogue:none

		mov eax, [esp+4]        ; eax = _byte_num
		push edi                ; preserve caller's edi
		mov edi, [esp+12]       ; edi = _source (one push moved it from +8 to +12)
		push ecx                ; preserve caller's ecx
		mov ecx, eax            ; ecx = _byte_num, kept intact until the remainder stage
		shr eax, 7              ; eax = number of full 128-byte blocks
		jz _remain_byte_deal    ; none: go straight to the remainder stage
		pxor xmm0, xmm0         ; xmm0 = 0
		align 16
	main_loop:

		movdqa [edi], xmm0
		movdqa [edi+010h], xmm0
		movdqa [edi+020h], xmm0
		movdqa [edi+030h], xmm0

		movdqa [edi+040h], xmm0
		movdqa [edi+050h], xmm0
		movdqa [edi+060h], xmm0
		movdqa [edi+070h], xmm0

		add edi, 128
		dec eax
		jnz main_loop
		align 16
	_remain_byte_deal:
		; Reached after the block loop and also directly when _byte_num < 128.
		; ecx holds _byte_num on both paths, so the chunk index must be derived
		; here, after the label, not before it.
		mov eax, ecx                  ; eax = _byte_num, source of the trailing count
		and ecx, 112                  ; ecx = bytes covered by whole 16-byte chunks
		shr ecx, 3                    ; ecx = 2 * chunk count -> index 0, 2, .., 14
		pxor xmm0, xmm0               ; xmm0 = 0 (not yet set on the short path)
		lea edi, [edi-070h+ecx*8]     ; bias edi so the last chunk lands at [edi+060h]
		jmp [RemainDQwordTable+ecx*4] ; enter the store chain at the right depth
		align 16
		; Fall-through chain: entering at SB(2k) executes exactly k stores.
	SB0E:
	SB0F:
		movdqa [edi], xmm0
	SB0C:
	SB0D:
		movdqa [edi+010h], xmm0
	SB0A:
	SB0B:
		movdqa [edi+020h], xmm0
	SB08:
	SB09:
		movdqa [edi+030h], xmm0
	SB06:
	SB07:
		movdqa [edi+040h], xmm0
	SB04:
	SB05:
		movdqa [edi+050h], xmm0
	SB02:
	SB03:
		movdqa [edi+060h], xmm0
	SB00:
	SB01:
		and eax, 15                   ; eax = trailing byte count (0..15)
		lea edi, [edi+070h]           ; undo the -070h bias: edi = first byte after the chunks
		mov ecx, eax                  ; rep count
		xor eax, eax                  ; al = 0, the fill value
		cld                           ; store forward
		rep stosb
		pop ecx
		pop edi
		ret
		; Jump table indexed by 2 * chunk count; only even entries are reached.
		RemainDQwordTable dd   SB00, SB01, SB02, SB03, SB04, SB05, SB06, SB07
						  dd   SB08, SB09, SB0A, SB0B, SB0C, SB0D, SB0E, SB0F
_align16_zero_memory endp

;-----------------------------------------------------------------------
; void SetUpZeroMem(unsigned _byte_num, void *_source)
;
; Zero _byte_num bytes starting at _source, whatever its alignment.
;
; ABI:   32-bit C calling convention; arguments are read straight from
;        the stack because the prologue/epilogue are disabled.
; In:    [esp+4] = _byte_num
;        [esp+8] = _source
; Saves: edi, ecx (pushed on entry, popped before every ret)
;
; _byte_num < 16 : cleared byte by byte with rep stosb.
; otherwise      : _ByteTable dispatches on (_source & 15); handler OPxx
;                  clears the (16 - xx) bytes up to the next 16-byte
;                  boundary with scalar stores, then OP00 passes the
;                  aligned remainder to _align16_zero_memory.
;-----------------------------------------------------------------------
SetUpZeroMem proc C  _byte_num, _source
		option prologue:none, epilogue:none

		mov eax, [esp+8]        ; eax = _source
		push edi                ; preserve caller's edi
		mov edi, eax            ; edi = write cursor
		push ecx                ; preserve caller's ecx
		mov ecx, [esp+12]       ; ecx = _byte_num (two pushes moved it from +4 to +12)
		cmp ecx, 16
		jb __miniature_copy     ; too short to be worth aligning
		and eax, 15             ; eax = misalignment of _source (0..15)
		jmp [eax*4+_ByteTable]
		align 16
	OP00:   ; edi is 16-byte aligned, ecx = bytes still to clear
		push edi                ; argument 2: address
		push ecx                ; argument 1: byte count
		call _align16_zero_memory
		add esp, 8              ; C convention: caller removes the arguments
		pop ecx
		pop edi
		ret
	OP01:   ; misaligned by 1: 15 bytes to the boundary (1+2+4+4+4)
		mov byte ptr [edi], 0
		mov word ptr [edi+1], 0
		mov dword ptr [edi+3], 0
		mov dword ptr [edi+7], 0
		mov dword ptr [edi+11], 0
		add edi, 15
		sub ecx, 15
		jmp OP00
	OP02:   ; misaligned by 2: 14 bytes (2+4+4+4)
		mov word ptr [edi], 0
		mov dword ptr [edi+2], 0
		mov dword ptr [edi+6], 0
		mov dword ptr [edi+10], 0
		add edi, 14
		sub ecx, 14
		jmp OP00
	OP03:   ; misaligned by 3: 13 bytes (1+4+4+4)
		mov byte ptr [edi], 0
		mov dword ptr [edi+1], 0
		mov dword ptr [edi+5], 0
		mov dword ptr [edi+9], 0
		add edi, 13
		sub ecx, 13
		jmp OP00
	OP04:   ; misaligned by 4: 12 bytes (4+4+4)
		mov dword ptr [edi], 0
		mov dword ptr [edi+4], 0
		mov dword ptr [edi+8], 0
		add edi, 12
		sub ecx, 12
		jmp OP00
	OP05:   ; misaligned by 5: 11 bytes (1+2+4+4)
		mov byte ptr [edi], 0
		mov word ptr [edi+1], 0
		mov dword ptr [edi+3], 0
		mov dword ptr [edi+7], 0
		add edi, 11
		sub ecx, 11
		jmp OP00
	OP06:   ; misaligned by 6: 10 bytes (2+4+4)
		mov word ptr [edi], 0
		mov dword ptr [edi+2], 0
		mov dword ptr [edi+6], 0
		add edi, 10
		sub ecx, 10
		jmp OP00
	OP07:   ; misaligned by 7: 9 bytes (1+4+4)
		mov byte ptr [edi], 0
		mov dword ptr [edi+1], 0
		mov dword ptr [edi+5], 0
		add edi, 9
		sub ecx, 9
		jmp OP00
	OP08:   ; misaligned by 8: 8 bytes (4+4)
		mov dword ptr [edi], 0
		mov dword ptr [edi+4], 0        ; second dword directly follows the first
		sub ecx, 8
		add edi, 8
		jmp OP00
	OP09:   ; misaligned by 9: 7 bytes (1+2+4)
		mov byte ptr [edi], 0
		mov word ptr [edi+1], 0
		mov dword ptr [edi+3], 0
		sub ecx, 7
		add edi, 7
		jmp OP00
	OP0A:   ; misaligned by 10: 6 bytes (2+4)
		mov word ptr [edi], 0
		mov dword ptr [edi+2], 0
		sub ecx, 6
		add edi, 6
		jmp OP00
	OP0B:   ; misaligned by 11: 5 bytes (1+4)
		mov byte ptr [edi], 0
		mov dword ptr [edi+1], 0
		sub ecx, 5
		add edi, 5
		jmp OP00
	OP0C:   ; misaligned by 12: 4 bytes
		mov dword ptr [edi], 0
		sub ecx, 4
		add edi, 4
		jmp OP00
	OP0D:   ; misaligned by 13: 3 bytes (1+2)
		mov byte ptr [edi], 0
		mov word ptr [edi+1], 0         ; word directly follows the byte
		sub ecx, 3
		add edi, 3
		jmp OP00
	OP0E:   ; misaligned by 14: 2 bytes
		mov word ptr [edi], 0
		sub ecx, 2
		add edi, 2
		jmp OP00
	OP0F:   ; misaligned by 15: 1 byte
		mov byte ptr [edi], 0
		dec ecx
		inc edi
		jmp OP00
	__miniature_copy:   ; fewer than 16 bytes: ecx = count, edi = address
		cld                     ; store forward
		xor eax, eax            ; al = 0, the fill value
		rep stosb
		pop ecx
		pop edi
		ret
		; Handlers indexed by (_source & 15).
		_ByteTable    dd   OP00, OP01, OP02, OP03, OP04, OP05, OP06, OP07
					  dd   OP08, OP09, OP0A, OP0B, OP0C, OP0D, OP0E, OP0F
SetUpZeroMem endp
		end 


猜你喜欢

转载自xuling1993728.iteye.com/blog/2207891
SSE
今日推荐