// Code generated by command: go run valid_asm.go -pkg utf8 -out ../utf8/valid_amd64.s -stubs ../utf8/valid_amd64.go. DO NOT EDIT.

//go:build !purego

#include "textflag.h"

// func validateAvx(p []byte) byte
// Requires: AVX, AVX2
//
// Validates UTF-8 in 32-byte AVX2 blocks using nibble lookup tables, in the
// style of Keiser & Lemire, "Validating UTF-8 In Less Than One Instruction
// Per Byte" (the table layout matches the simdjson lookup algorithm).
//
// Register roles in the main loop:
//   AX  - cursor into p
//   CX  - bytes remaining
//   DL  - "pure ASCII so far" flag: starts at 1, cleared on the first
//         non-ASCII block
//   Y8  - previous 32-byte block (zero on the first iteration)
//   Y9  - sticky error accumulator; any non-zero byte => invalid UTF-8
//   Y10 - non-zero where the previous block ended mid-sequence
//   Y11 - current 32-byte block
//
// Return byte: bit 0 set if p is valid UTF-8, bit 1 set if p is also valid
// ASCII (see the packing sequence before RET).
TEXT ·validateAvx(SB), NOSPLIT, $0-25
	MOVQ p_base+0(FP), AX
	MOVQ p_len+8(FP), CX
	MOVB $0x01, DL

	// Prepare the constant masks
	VMOVDQU incomplete_mask<>+0(SB), Y0
	VMOVDQU cont4_vec<>+0(SB), Y1
	VMOVDQU cont3_vec<>+0(SB), Y2

	// High nibble of current byte
	VMOVDQU nibble1_errors<>+0(SB), Y3

	// Low nibble of current byte
	VMOVDQU nibble2_errors<>+0(SB), Y4

	// High nibble of the next byte
	VMOVDQU nibble3_errors<>+0(SB), Y5

	// Nibble mask
	VMOVDQU nibble_mask<>+0(SB), Y6

	// MSB mask
	VMOVDQU msb_mask<>+0(SB), Y7

	// For the first pass, set the previous block as zero.
	VXORPS Y8, Y8, Y8

	// Zeroes the error vector.
	VXORPS Y9, Y9, Y9

	// Zeroes the "previous block was incomplete" vector.
	VXORPS Y10, Y10, Y10

	// Top of the loop.
check_input:
	// if bytes left >= 32
	CMPQ CX, $0x20

	// go process the next block
	JGE process

	// If < 32 bytes left
	// Fast exit if done
	CMPQ CX, $0x00
	JE   end

	// If 0 < bytes left < 32: load the last 32 bytes of p (the load ends
	// exactly at p_base+p_len), then shift/clear so only the CX valid bytes
	// remain, left-aligned with trailing zeros (zero bytes are ASCII and
	// cannot introduce errors).
	// BX = 32 - CX; AX -= BX so (AX) reads the final 32 bytes.
	// NOTE(review): this reads up to 32-CX bytes before the current cursor;
	// the labels below claim page-boundary safety for each shift variant —
	// confirm the invariants against the Go stub that calls this function.
	VPXOR Y12, Y12, Y12
	MOVQ  $0x0000000000000020, BX
	SUBQ  CX, BX
	SUBQ  BX, AX
	VMOVDQU (AX), Y11
	CMPQ  CX, $0x10
	JA    tail_load_large

	// Shift right that works if remaining bytes <= 16, safe next to a page boundary
	// Move the high 128-bit lane (which holds the valid tail bytes) into the
	// low lane, zeroing the rest via Y12.
	VPERM2I128 $0x03, Y11, Y12, Y11

	// After the two ADDs and the SUB, BX = (32-CX) + 2*CX - 32 = CX, so
	// SI = &shuffle_clear_mask[16-CX]: a PSHUFB control whose first CX lanes
	// select the valid bytes and whose remaining lanes are 0xff (PSHUFB
	// zeroes output lanes whose control byte has its MSB set).
	LEAQ shuffle_clear_mask<>+16(SB), SI
	ADDQ CX, BX
	ADDQ CX, BX
	SUBQ $0x20, BX
	SUBQ BX, SI
	VMOVDQU (SI), Y13
	VPSHUFB Y13, Y11, Y11
	XORQ CX, CX
	JMP  loaded

	// Shift right that works if remaining bytes >= 16, safe next to a page boundary
tail_load_large:
	// Entering with BX = 32-CX; after the arithmetic BX = CX - 16.
	// Both 128-bit lanes are shifted with the same PSHUFB control, then the
	// lane-crossed copy and the in-lane copy are blended to emulate a full
	// 256-bit byte shift (VPSHUFB cannot cross 128-bit lanes on its own).
	ADDQ CX, BX
	ADDQ CX, BX
	SUBQ $0x30, BX
	LEAQ shuffle_mask<>+16(SB), SI
	SUBQ BX, SI
	VMOVDQU (SI), Y13
	VPSHUFB Y13, Y11, Y14
	VPERM2I128 $0x03, Y11, Y12, Y11
	VPSHUFB Y13, Y11, Y11

	// Per-lane blend control picked from blend_mask at the same offset.
	LEAQ blend_mask<>+16(SB), CX
	SUBQ BX, CX
	VBROADCASTF128 (CX), Y12
	VPBLENDVB Y12, Y14, Y11, Y11
	XORQ CX, CX
	JMP  loaded

	// Process one 32B block of data
process:
	// Load the next block of bytes
	VMOVDQU (AX), Y11
	SUBQ    $0x20, CX
	ADDQ    $0x20, AX

loaded:
	// Fast check to see if ASCII: VPMOVMSKB gathers the MSB of all 32 bytes;
	// a zero mask means every byte is < 0x80.
	VPMOVMSKB Y11, BX
	CMPL      BX, $0x00
	JNZ       non_ascii

	// If this whole block is ASCII, there is nothing to do, and it is an
	// error if any of the previous code point was incomplete.
	VPOR Y9, Y10, Y9
	JMP  check_input

non_ascii:
	// The input is not pure ASCII; clear the ASCII flag.
	XORB DL, DL

	// Prepare intermediate vector for push operations: pairs the previous
	// block's high lane with the current block's low lane so the VPALIGNRs
	// below can shift bytes in across the block boundary.
	VPERM2I128 $0x03, Y8, Y11, Y8

	// Check errors on the high nibble of the previous byte
	// (Y10 = each byte's 1-byte predecessor; table lookup via PSHUFB).
	VPALIGNR $0x0f, Y8, Y11, Y10
	VPSRLW   $0x04, Y10, Y12
	VPAND    Y12, Y6, Y12
	VPSHUFB  Y12, Y3, Y12

	// Check errors on the low nibble of the previous byte
	VPAND   Y10, Y6, Y10
	VPSHUFB Y10, Y4, Y10
	VPAND   Y10, Y12, Y12

	// Check errors on the high nibble on the current byte
	// Y12 now holds the AND of the three table lookups: non-zero bits mark
	// invalid two-byte patterns and (bit 0x80) expected continuation bytes.
	VPSRLW  $0x04, Y11, Y10
	VPAND   Y10, Y6, Y10
	VPSHUFB Y10, Y5, Y10
	VPAND   Y10, Y12, Y12

	// Find 3 bytes continuations: saturating-subtract 0xdf from each byte's
	// 2-byte predecessor — non-zero iff that byte is >= 0xe0 (3/4-byte lead).
	VPALIGNR $0x0e, Y8, Y11, Y10
	VPSUBUSB Y2, Y10, Y10

	// Find 4 bytes continuations: same with 0xef against the 3-byte
	// predecessor — non-zero iff it is >= 0xf0 (4-byte lead).
	VPALIGNR $0x0d, Y8, Y11, Y8
	VPSUBUSB Y1, Y8, Y8

	// Combine them to have all continuations
	VPOR Y10, Y8, Y8

	// Perform a byte-sized signed comparison with zero to turn any non-zero
	// bytes into 0xFF.
	VXORPS   Y10, Y10, Y10
	VPCMPGTB Y10, Y8, Y8

	// Find bytes that are continuations by looking at their most significant bit.
	VPAND Y7, Y8, Y8

	// Find mismatches between expected and actual continuation bytes
	VPXOR Y8, Y12, Y8

	// Store result in sticky error
	VPOR Y9, Y8, Y9

	// Prepare for next iteration: incomplete_mask leaves Y10 non-zero only
	// where the block's last 3 bytes start a sequence that cannot finish
	// inside this block (last byte >= 0xc0, 2nd-last >= 0xe0, 3rd-last >= 0xf0).
	VPSUBUSB Y0, Y11, Y10

	// Current block becomes the previous block.
	VMOVDQU Y11, Y8

	// End of loop
	JMP check_input

end:
	// If the previous block was incomplete, this is an error.
	VPOR Y10, Y9, Y9

	// Return whether any error bit was set
	VPTEST Y9, Y9
	SETEQ  AL

	// Bit 0 tells if the input is valid utf8, bit 1 tells if it's valid ascii
	// (ASCII flag only counts if the input is also valid: AL & DL).
	ANDB AL, DL
	SHLB $0x01, DL
	ORB  DL, AL
	MOVB AL, ret+24(FP)
	VZEROUPPER
	RET

// Saturating-subtraction thresholds for the "block ends mid-sequence" check:
// 0xff everywhere except the last three bytes (0xef, 0xdf, 0xbf).
DATA incomplete_mask<>+0(SB)/8, $0xffffffffffffffff
DATA incomplete_mask<>+8(SB)/8, $0xffffffffffffffff
DATA incomplete_mask<>+16(SB)/8, $0xffffffffffffffff
DATA incomplete_mask<>+24(SB)/8, $0xbfdfefffffffffff
GLOBL incomplete_mask<>(SB), RODATA|NOPTR, $32

// Threshold 0xef: bytes > 0xef (i.e. >= 0xf0) start 4-byte sequences.
DATA cont4_vec<>+0(SB)/8, $0xefefefefefefefef
DATA cont4_vec<>+8(SB)/8, $0xefefefefefefefef
DATA cont4_vec<>+16(SB)/8, $0xefefefefefefefef
DATA cont4_vec<>+24(SB)/8, $0xefefefefefefefef
GLOBL cont4_vec<>(SB), RODATA|NOPTR, $32

// Threshold 0xdf: bytes > 0xdf (i.e. >= 0xe0) start 3- or 4-byte sequences.
DATA cont3_vec<>+0(SB)/8, $0xdfdfdfdfdfdfdfdf
DATA cont3_vec<>+8(SB)/8, $0xdfdfdfdfdfdfdfdf
DATA cont3_vec<>+16(SB)/8, $0xdfdfdfdfdfdfdfdf
DATA cont3_vec<>+24(SB)/8, $0xdfdfdfdfdfdfdfdf
GLOBL cont3_vec<>(SB), RODATA|NOPTR, $32

// The three 16-entry PSHUFB tables below classify byte pairs by (high nibble
// of previous byte, low nibble of previous byte, high nibble of current
// byte); ANDing the three lookups leaves error bits only for invalid
// combinations (simdjson-style tables — each lane repeats per 128-bit lane).
DATA nibble1_errors<>+0(SB)/8, $0x0202020202020202
DATA nibble1_errors<>+8(SB)/8, $0x4915012180808080
DATA nibble1_errors<>+16(SB)/8, $0x0202020202020202
DATA nibble1_errors<>+24(SB)/8, $0x4915012180808080
GLOBL nibble1_errors<>(SB), RODATA|NOPTR, $32

DATA nibble2_errors<>+0(SB)/8, $0xcbcbcb8b8383a3e7
DATA nibble2_errors<>+8(SB)/8, $0xcbcbdbcbcbcbcbcb
DATA nibble2_errors<>+16(SB)/8, $0xcbcbcb8b8383a3e7
DATA nibble2_errors<>+24(SB)/8, $0xcbcbdbcbcbcbcbcb
GLOBL nibble2_errors<>(SB), RODATA|NOPTR, $32

DATA nibble3_errors<>+0(SB)/8, $0x0101010101010101
DATA nibble3_errors<>+8(SB)/8, $0x01010101babaaee6
DATA nibble3_errors<>+16(SB)/8, $0x0101010101010101
DATA nibble3_errors<>+24(SB)/8, $0x01010101babaaee6
GLOBL nibble3_errors<>(SB), RODATA|NOPTR, $32

// 0x0f in every byte: isolates a nibble before the PSHUFB table lookups.
DATA nibble_mask<>+0(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA nibble_mask<>+8(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA nibble_mask<>+16(SB)/8, $0x0f0f0f0f0f0f0f0f
DATA nibble_mask<>+24(SB)/8, $0x0f0f0f0f0f0f0f0f
GLOBL nibble_mask<>(SB), RODATA|NOPTR, $32

// 0x80 in every byte: the continuation-marker bit used by the tables.
DATA msb_mask<>+0(SB)/8, $0x8080808080808080
DATA msb_mask<>+8(SB)/8, $0x8080808080808080
DATA msb_mask<>+16(SB)/8, $0x8080808080808080
DATA msb_mask<>+24(SB)/8, $0x8080808080808080
GLOBL msb_mask<>(SB), RODATA|NOPTR, $32

// Sliding windows into these 48-byte tables (indexed by tail length) build
// the PSHUFB/blend controls for the partial-block loads above.
// Identity byte indices 0..15 repeated three times.
DATA shuffle_mask<>+0(SB)/8, $0x0706050403020100
DATA shuffle_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908
DATA shuffle_mask<>+16(SB)/8, $0x0706050403020100
DATA shuffle_mask<>+24(SB)/8, $0x0f0e0d0c0b0a0908
DATA shuffle_mask<>+32(SB)/8, $0x0706050403020100
DATA shuffle_mask<>+40(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL shuffle_mask<>(SB), RODATA|NOPTR, $48

// Identity indices followed by 0xff lanes (PSHUFB zeroes lanes whose
// control byte has the MSB set).
DATA shuffle_clear_mask<>+0(SB)/8, $0x0706050403020100
DATA shuffle_clear_mask<>+8(SB)/8, $0x0f0e0d0c0b0a0908
DATA shuffle_clear_mask<>+16(SB)/8, $0xffffffffffffffff
DATA shuffle_clear_mask<>+24(SB)/8, $0xffffffffffffffff
DATA shuffle_clear_mask<>+32(SB)/8, $0xffffffffffffffff
DATA shuffle_clear_mask<>+40(SB)/8, $0xffffffffffffffff
GLOBL shuffle_clear_mask<>(SB), RODATA|NOPTR, $48

// VPBLENDVB control window: 16 bytes of 0xff, 16 of 0x00, 16 of 0xff.
DATA blend_mask<>+0(SB)/8, $0xffffffffffffffff
DATA blend_mask<>+8(SB)/8, $0xffffffffffffffff
DATA blend_mask<>+16(SB)/8, $0x0000000000000000
DATA blend_mask<>+24(SB)/8, $0x0000000000000000
DATA blend_mask<>+32(SB)/8, $0xffffffffffffffff
DATA blend_mask<>+40(SB)/8, $0xffffffffffffffff
GLOBL blend_mask<>(SB), RODATA|NOPTR, $48