/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

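/*
 * Calling convention (ELFv1/ELFv2): r3 = s1, r4 = s2, r5 = n, and the result
 * is returned in r3.  As a rough, illustrative C-level sketch of the contract
 * this routine implements (not the code path actually taken):
 *
 *	int memcmp(const void *s1, const void *s2, size_t n)
 *	{
 *		const unsigned char *p = s1, *q = s2;
 *
 *		for (; n; n--, p++, q++)
 *			if (*p != *q)
 *				return *p < *q ? -1 : 1;
 *		return 0;
 *	}
 *
 * The optimized paths below return -1/0/1 (or a raw byte difference from the
 * short loop), which satisfies the same sign contract.
 */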
#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
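/*
 * On little-endian the byte-reversing loads (lhbrx/lwbrx/ldbrx) are used so
 * that the byte at the lowest address lands in the most significant bits,
 * just as a big-endian load would place it.  A single unsigned doubleword
 * compare then yields the lexicographic (memory-order) result memcmp()
 * requires.  Roughly, LD followed by cmpld behaves like this illustrative C
 * (not code from this file):
 *
 *	u64 a = be64_to_cpu(*(__be64 *)p);
 *	u64 b = be64_to_cpu(*(__be64 *)q);
 *	if (a != b)
 *		return a > b ? 1 : -1;
 *
 * Similarly, LVS/VPERM are swapped so that LD_VSR_CROSS16B below assembles
 * the unaligned 16 bytes in memory order on either endianness.
 */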

#define VMX_THRESH 4096
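/*
 * ENTER_VMX_OPS/EXIT_VMX_OPS wrap the C helpers enter_vmx_ops() and
 * exit_vmx_ops(): the live argument registers (r3/r4/r5) and LR are saved,
 * a stack frame is created for the call, and everything is restored
 * afterwards.  enter_vmx_ops() reports in r3 whether VMX may be used right
 * now; ENTER_VMX_OPS latches that answer in cr1 so callers can fall back to
 * the scalar path with a single "beq cr1,...".
 */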
#define ENTER_VMX_OPS	\
	mflr    r0;	\
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      enter_vmx_ops; \
	cmpwi   cr1,r3,0; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

#define EXIT_VMX_OPS \
	mflr    r0; \
	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std     r0,16(r1); \
	stdu    r1,-STACKFRAMESIZE(r1); \
	bl      exit_vmx_ops; \
	ld      r0,STACKFRAMESIZE+16(r1); \
	ld      r3,STK_REG(R31)(r1); \
	ld      r4,STK_REG(R30)(r1); \
	ld      r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr    r0

/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                                 ^
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 *                                 ^
 *                                _vaddr
 *
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded,
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded,
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res,
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx     _v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
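/*
 * Net effect: _v_res receives the 16 bytes that start at the unaligned
 * _vaddr, i.e. roughly memcpy(&_v_res, _vaddr, 16), rebuilt from the two
 * aligned quadwords that straddle it (the first one must already be in
 * _v1st_qw).
 */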

/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to the 8-byte boundary. The
 *    handlers are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets relative to the 8-byte boundary. The
 *    handlers are named .Ldiffoffset_xxxx.
 */
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0

	/* Use the short loop if the src/dst addresses do not have the same
	 * offset relative to the 8-byte alignment boundary.
	 */
	xor	r6,r3,r4
	andi.	r6,r6,7

	/* Fall back to the short loop when comparing fewer than 8 bytes at
	 * aligned addresses.
	 */
	cmpdi   cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

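/*
 * .Lshort: byte-at-a-time compare, unrolled 4x.  CTR holds the number of
 * bytes left; each lbz pair is followed by a bdz (or the closing bdnz) so we
 * never read past the end of either buffer.
 */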
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start


.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that the
	 * rest of the comparison can run on 8-byte aligned addresses.
	 */
	andi.   r6,r3,7

	/* Try to compare the first doubleword, which is not 8-byte aligned:
	 * load the doubleword at (src & ~7UL) and shift left by the
	 * appropriate number of bits before the comparison.
	 */
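	/* rlwinm below computes r6 = (r3 & 7) << 3, i.e. the bit offset of the
	 * start address within its doubleword; the bytes that precede the
	 * buffer are shifted out to the left before the compare.
	 */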
	rlwinm  r6,r3,3,26,28
	beq     .Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight
	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes remain to be compared.
	 */
	cmpdi   cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* Compare 1 to 31 bytes; at least the r3 address is 8-byte aligned now. */
	cmpdi   cr5,r5,7
	srdi    r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes */
	clrldi  r5,r5,61
	mtctr   r0
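	/* CTR now holds r5 / 8 (full doublewords to compare) and r5 holds the
	 * trailing 0-7 bytes that .Lcmp_rest_lt8bytes will finish off.
	 */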
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi   r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/* Here we have fewer than 8 bytes left to compare; at least the s1
	 * address is 8-byte aligned.
	 * The next doublewords are loaded and shifted right by the
	 * appropriate number of bits.
	 */
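	/* Equivalently: with n = r5 remaining bytes, both doublewords are
	 * shifted right by (8 - n) * 8 bits so that only the n valid bytes
	 * take part in the compare.
	 */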
	subfic  r6,r5,8
	slwi	r6,r6,3
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC
	blr

.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try to use the VMX loop if the length is 4K or more. */
	cmpldi  cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif
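	/*
	 * Main loop: compare 32 bytes per iteration as four doubleword pairs.
	 * The loads for the next iteration are issued while the compares of
	 * the previous one are still pending in cr0/cr1/cr6/cr7.  rD-rH are
	 * mapped to the non-volatile registers r27-r31, so they are saved to
	 * the stack below and restored in .Ltail/.Lout.
	 */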
	/* At least s1 addr is aligned with 8 bytes */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5
	mtctr	r0
	andi.	r5,r5,31
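	/* CTR = number of 32-byte blocks; r5 keeps the 0-31 leftover bytes,
	 * which .Ltail finishes via the .Lshort byte loop.
	 */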

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:   /* skip non-volatile GPR restore */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter here when the src/dst addrs have the same offset relative to
	 * the 8-byte alignment boundary.
	 *
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail (find a difference) early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the penalty of saving
	 * and restoring the 32 x 128-bit VMX register file, we compare the
	 * first 32 bytes so that we can catch the ~80% of calls that fail
	 * there.
	 */

	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Llong_novmx_cmp

3:
	/* Need to check whether r4 has the same offset as r3 relative to the
	 * 16-byte boundary.
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is no less than 4KB. We need to align further, to a
	 * 16-byte boundary.
	 */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f
	/* Save and restore cr0: exit_vmx_ops() is a function call and may
	 * clobber the volatile cr0 field that .LcmpAB_lightweight needs, so
	 * stash it in r5 (no longer needed) with mfocrf/mtocrf.
	 */
	mfocrf  r5,128
	EXIT_VMX_OPS
	mtocrf  128,r5
	b	.LcmpAB_lightweight

4:
	/* Compare 32 bytes in each loop iteration. */
	srdi	r0,r5,5
	mtctr	r0
	clrldi  r5,r5,59
	li	off16,16

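	/* Each iteration compares two 16-byte vectors.  vcmpequd. sets the
	 * "all elements equal" bit in cr6, so "bnl cr6" is taken as soon as a
	 * difference is found within a 16-byte chunk.
	 */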
	.balign 16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* The difference is within the 16 bytes at r3/r4; redo the compare
	 * with scalar loads to compute the return value.
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 with 8 bytes */
	rlwinm  r6,r3,3,26,28
	beq     .Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4  /* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic  r6,r6,8
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6
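	/* Note: r3 was rounded down to its aligned doubleword, so the addi
	 * above steps it exactly past the bytes just compared; r4 was never
	 * rounded, so it advances only by the r6 bytes actually consumed.
	 */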

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned with 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Only do VMX ops when the size is 4K bytes or more. */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi   cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Perform a 32-byte pre-check before enabling VMX operations. */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne     cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq     cr1,.Ldiffoffset_novmx_cmp

.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to a 16-byte boundary. */
	andi.   r6,r3,0xf
	li	off16,16
	beq     .Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx     v5,0,r3
	lvx     v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic  r6,r6,16
	subf    r5,r6,r5
	add     r3,r3,r6
	add     r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx     v6,0,r4
	LVS	v4,0,r4
	srdi	r6,r5,5  /* loop handles 32 bytes per iteration */
	clrldi  r5,r5,59
	mtctr	r6
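	/*
	 * Each iteration compares 2 x 16 bytes: s1 is loaded with aligned lvx,
	 * while s2 is reassembled from two aligned quadwords by
	 * LD_VSR_CROSS16B.  The "vor v6,v8,v8" copies carry the most recently
	 * loaded s2 quadword forward as the first half of the next unaligned
	 * load.
	 */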

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* The difference lies within the next 16 bytes at r3/r4; fall back to
	 * the scalar path to locate it and compute the return value.
	 */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)