Results 1 - 10 of 16 for VMOVDQU (0.09 sec)

  1. src/runtime/memmove_amd64.s

    	// The tail will be copied into place after the main body copy.
    	// First handle the unaligned head.
    	VMOVDQU	(SI), Y4
    	// Adjust source address to point past head.
    	ADDQ	R11, SI
    	SUBQ	AX, BX
    	// Copy the aligned main body here.
    gobble_128_loop:
    	VMOVDQU	(SI), Y0
    	VMOVDQU	0x20(SI), Y1
    	VMOVDQU	0x40(SI), Y2
    	VMOVDQU	0x60(SI), Y3
    	ADDQ	AX, SI
    	VMOVDQA	Y0, (DI)
    	VMOVDQA	Y1, 0x20(DI)
    	VMOVDQA	Y2, 0x40(DI)
    - Last Modified: Sun Apr 10 15:52:08 UTC 2022
    - 12.5K bytes
  2. src/runtime/memclr_amd64.s

    	CMPQ    BX, $0x2000000
    	JAE	loop_preheader_avx2_huge
    
    loop_avx2:
    	VMOVDQU	Y0, 0(DI)
    	VMOVDQU	Y0, 32(DI)
    	VMOVDQU	Y0, 64(DI)
    	VMOVDQU	Y0, 96(DI)
    	SUBQ	$128, BX
    	ADDQ	$128, DI
    	CMPQ	BX, $128
    	JAE	loop_avx2
    	VMOVDQU  Y0, -32(DI)(BX*1)
    	VMOVDQU  Y0, -64(DI)(BX*1)
    	VMOVDQU  Y0, -96(DI)(BX*1)
    	VMOVDQU  Y0, -128(DI)(BX*1)
    	VZEROUPPER
    	RET
    
    loop_preheader_erms:
    #ifndef hasAVX2
    - Last Modified: Tue May 10 20:52:34 UTC 2022
    - 4.9K bytes
  3. src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s

    	VMOVDQU    CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
    	VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
    	VPXOR      (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
    - Last Modified: Wed Nov 29 21:28:33 UTC 2023
    - 105.6K bytes
  4. src/internal/bytealg/index_amd64.s

    _32_or_more:
    	CMPQ AX, $32
    	JA   _33_to_63
    	VMOVDQU (R8), Y1
    	LEAQ -31(DI)(DX*1), DX
    loop32:
    	VMOVDQU (DI), Y2
    	VPCMPEQB Y1, Y2, Y3
    	VPMOVMSKB Y3, SI
    	CMPL  SI, $0xffffffff
    	JE   success_avx2
    	ADDQ $1,DI
    	CMPQ DI,DX
    	JB loop32
    	JMP fail_avx2
    _33_to_63:
    	LEAQ 1(DI)(DX*1), DX
    	SUBQ AX, DX
    	VMOVDQU -32(R8)(AX*1), Y0
    	VMOVDQU (R8), Y1
    loop33to63:
    	VMOVDQU (DI), Y2
    	VPCMPEQB Y1, Y2, Y3
    - Last Modified: Mon Aug 07 00:20:48 UTC 2023
    - 5.1K bytes
  5. src/internal/bytealg/equal_amd64.s

    	JEQ	hugeloop
    	XORQ	AX, AX	// return 0
    	RET
    #endif
    
    	// 64 bytes at a time using ymm registers
    	PCALIGN $16
    hugeloop_avx2:
    	CMPQ	BX, $64
    	JB	bigloop_avx2
    	VMOVDQU	(SI), Y0
    	VMOVDQU	(DI), Y1
    	VMOVDQU	32(SI), Y2
    	VMOVDQU	32(DI), Y3
    	VPCMPEQB	Y1, Y0, Y4
    	VPCMPEQB	Y2, Y3, Y5
    	VPAND	Y4, Y5, Y6
    	VPMOVMSKB Y6, DX
    	ADDQ	$64, SI
    	ADDQ	$64, DI
    	SUBQ	$64, BX
    	CMPL	DX, $0xffffffff
    	JEQ	hugeloop_avx2
    - Last Modified: Fri Nov 17 16:34:40 UTC 2023
    - 2.8K bytes
  6. src/internal/bytealg/count_amd64.s

    	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
    	JNE sse
    #endif
    	MOVD AX, X0
    	LEAQ -64(SI)(BX*1), R11
    	LEAQ (SI)(BX*1), R13
    	VPBROADCASTB  X0, Y1
    	PCALIGN $32
    avx2_loop:
    	VMOVDQU (DI), Y2
    	VMOVDQU 32(DI), Y4
    	VPCMPEQB Y1, Y2, Y3
    	VPCMPEQB Y1, Y4, Y5
    	VPMOVMSKB Y3, DX
    	VPMOVMSKB Y5, CX
    	POPCNTL DX, DX
    	POPCNTL CX, CX
    	ADDQ DX, R12
    	ADDQ CX, R12
    	ADDQ $64, DI
    	CMPQ DI, R11
    - Last Modified: Fri Oct 06 20:54:43 UTC 2023
    - 4.7K bytes
  7. src/internal/bytealg/compare_amd64.s

    	SUBQ	$64, R8
    	CMPQ	R8, $64
    	JBE	loop
    	JMP	big_loop
    #endif
    
    	// Compare 64-bytes per loop iteration.
    	// Loop is unrolled and uses AVX2.
    big_loop_avx2:
    	VMOVDQU	(SI), Y2
    	VMOVDQU	(DI), Y3
    	VMOVDQU	32(SI), Y4
    	VMOVDQU	32(DI), Y5
    	VPCMPEQB Y2, Y3, Y0
    	VPMOVMSKB Y0, AX
    	XORL	$0xffffffff, AX
    	JNE	diff32_avx2
    	VPCMPEQB Y4, Y5, Y6
    	VPMOVMSKB Y6, AX
    	XORL	$0xffffffff, AX
    	JNE	diff64_avx2
    - Last Modified: Thu Aug 18 17:17:01 UTC 2022
    - 4.3K bytes
  8. src/crypto/sha256/sha256block_amd64.s

    	MOVL 24(CTX), g // g = H6
    	MOVL 28(CTX), h // h = H7
    
    avx2_loop0: // each iteration processes one 512-bit block
    
    	VMOVDQU (0*32)(INP), XTMP0
    	VMOVDQU (1*32)(INP), XTMP1
    	VMOVDQU (2*32)(INP), XTMP2
    	VMOVDQU (3*32)(INP), XTMP3
    
    	VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
    
    	// Apply Byte Flip Mask: LE -> BE
    	VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
    	VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
    - Last Modified: Mon Mar 04 17:29:44 UTC 2024
    - 47.3K bytes
  9. src/internal/bytealg/indexbyte_amd64.s

    	JNE sse
    #endif
    	MOVD AX, X0
    	LEAQ -32(SI)(BX*1), R11
    	VPBROADCASTB  X0, Y1
    
    	PCALIGN $32
    avx2_loop:
    	VMOVDQU (DI), Y2
    	VPCMPEQB Y1, Y2, Y3
    	VPTEST Y3, Y3
    	JNZ avx2success
    	ADDQ $32, DI
    	CMPQ DI, R11
    	JLT avx2_loop
    	MOVQ R11, DI
    	VMOVDQU (DI), Y2
    	VPCMPEQB Y1, Y2, Y3
    	VPTEST Y3, Y3
    	JNZ avx2success
    	VZEROUPPER
    	MOVQ $-1, (R8)
    	RET
    
    avx2success:
    - Last Modified: Wed Nov 01 19:06:01 UTC 2023
    - 3.1K bytes
  10. src/crypto/sha512/sha512block_amd64.s

    	MOVQ BX, R12
    	ANDQ CX, DI
    	ANDQ R8, R12
    	ADDQ R13, R15
    
    	ADDQ AX, DX
    	ORQ  R12, DI
    	ADDQ R14, AX
    
    	ADDQ R15, DX
    
    	ADDQ R15, AX
    
    	ADDQ DI, AX
    
    	VMOVDQU Y6, Y4
    	VMOVDQU Y7, Y5
    
    	SUBQ $1, frame_SRND(SP)
    	JNE  loop2
    
    	addm(8*0(SI),AX)
    	addm(8*1(SI),BX)
    	addm(8*2(SI),CX)
    	addm(8*3(SI),R8)
    	addm(8*4(SI),DX)
    	addm(8*5(SI),R9)
    - Last Modified: Mon Mar 04 17:29:44 UTC 2024
    - 27K bytes
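
The first two hits (src/runtime/memmove_amd64.s and src/runtime/memclr_amd64.s) are the runtime's bulk copy and clear routines. The sketch below is not part of the search results; it only illustrates, assuming an amd64 CPU that reports AVX2 support, how ordinary Go code reaches those routines. The dispatch onto the VMOVDQU code paths happens inside the runtime and cannot be selected explicitly.

    package main

    import "fmt"

    func main() {
    	src := make([]byte, 1<<20)
    	dst := make([]byte, 1<<20)

    	// copy() on large byte slices is serviced by runtime·memmove
    	// (result 1, src/runtime/memmove_amd64.s).
    	n := copy(dst, src)

    	// A plain clearing loop over a byte slice is typically lowered by the
    	// compiler to runtime·memclrNoHeapPointers (result 2,
    	// src/runtime/memclr_amd64.s).
    	for i := range dst {
    		dst[i] = 0
    	}

    	fmt.Println("copied", n, "bytes and cleared the destination")
    }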
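Results 4 through 7 and 9 are the internal/bytealg primitives that back the bytes and strings packages. The following example is only a hedged illustration: the mapping from exported functions to assembly files is inferred from the file names above, and the AVX2 loops run only when the detected CPU features allow it.

    package main

    import (
    	"bytes"
    	"fmt"
    )

    func main() {
    	haystack := bytes.Repeat([]byte("abcdefgh"), 4096)
    	needle := []byte("defg")

    	fmt.Println(bytes.Index(haystack, needle))      // index_amd64.s (result 4)
    	fmt.Println(bytes.Equal(haystack, haystack))    // equal_amd64.s (result 5)
    	fmt.Println(bytes.Count(haystack, []byte("a"))) // count_amd64.s (result 6)
    	fmt.Println(bytes.Compare(haystack, needle))    // compare_amd64.s (result 7)
    	fmt.Println(bytes.IndexByte(haystack, 'h'))     // indexbyte_amd64.s (result 9)
    }

On CPUs without AVX2, the same calls fall through to the SSE paths visible in the snippets above (the JNE sse branches).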
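Results 3, 8, and 10 are crypto inner loops (ChaCha20-Poly1305 and the SHA-256/SHA-512 block functions). The sketch below exercises them through their public entry points; it assumes the golang.org/x/crypto/chacha20poly1305 module is available (the hit above is the copy vendored into the standard library), and, as before, the AVX2 paths are chosen internally.

    package main

    import (
    	"crypto/rand"
    	"crypto/sha256"
    	"crypto/sha512"
    	"fmt"

    	"golang.org/x/crypto/chacha20poly1305"
    )

    func main() {
    	msg := make([]byte, 1<<16)

    	// Results 8 and 10: the SHA-256 and SHA-512 block functions.
    	fmt.Printf("%x\n", sha256.Sum256(msg))
    	fmt.Printf("%x\n", sha512.Sum512(msg))

    	// Result 3: the ChaCha20-Poly1305 seal/open code.
    	key := make([]byte, chacha20poly1305.KeySize)
    	if _, err := rand.Read(key); err != nil {
    		panic(err)
    	}
    	aead, err := chacha20poly1305.New(key)
    	if err != nil {
    		panic(err)
    	}
    	// A zero nonce is acceptable here only because the key is fresh and
    	// used for a single message; real code must never reuse a nonce.
    	nonce := make([]byte, chacha20poly1305.NonceSize)
    	sealed := aead.Seal(nil, nonce, msg, nil)
    	fmt.Println("sealed length:", len(sealed))
    }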