OpenJDK / portola / portola
changeset 50546:a6a44177f99c
8201193: Use XMM/YMM for objects initialization
Reviewed-by: jrose, kvn
Contributed-by: rohitarulraj@gmail.com
author | kvn |
---|---|
date | Tue, 12 Jun 2018 21:29:47 -0700 |
parents | 7c5fbc953121 |
children | e1b3def12624 |
files | src/hotspot/cpu/x86/globals_x86.hpp src/hotspot/cpu/x86/macroAssembler_x86.cpp src/hotspot/cpu/x86/macroAssembler_x86.hpp src/hotspot/cpu/x86/vm_version_x86.cpp src/hotspot/cpu/x86/x86_32.ad src/hotspot/cpu/x86/x86_64.ad |
diffstat | 6 files changed, 206 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/src/hotspot/cpu/x86/globals_x86.hpp Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/globals_x86.hpp Tue Jun 12 21:29:47 2018 -0700 @@ -150,6 +150,9 @@ product(bool, UseUnalignedLoadStores, false, \ "Use SSE2 MOVDQU instruction for Arraycopy") \ \ + product(bool, UseXMMForObjInit, false, \ + "Use XMM/YMM MOVDQU instruction for Object Initialization") \ + \ product(bool, UseFastStosb, false, \ "Use fast-string operation for zeroing: rep stosb") \ \
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp Tue Jun 12 21:29:47 2018 -0700 @@ -6777,7 +6777,59 @@ } -void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, bool is_large) { +// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers +void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) { + // cnt - number of qwords (8-byte words). + // base - start address, qword aligned. + Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end; + if (UseAVX >= 2) { + vpxor(xtmp, xtmp, xtmp, AVX_256bit); + } else { + pxor(xtmp, xtmp); + } + jmp(L_zero_64_bytes); + + BIND(L_loop); + if (UseAVX >= 2) { + vmovdqu(Address(base, 0), xtmp); + vmovdqu(Address(base, 32), xtmp); + } else { + movdqu(Address(base, 0), xtmp); + movdqu(Address(base, 16), xtmp); + movdqu(Address(base, 32), xtmp); + movdqu(Address(base, 48), xtmp); + } + addptr(base, 64); + + BIND(L_zero_64_bytes); + subptr(cnt, 8); + jccb(Assembler::greaterEqual, L_loop); + addptr(cnt, 4); + jccb(Assembler::less, L_tail); + // Copy trailing 32 bytes + if (UseAVX >= 2) { + vmovdqu(Address(base, 0), xtmp); + } else { + movdqu(Address(base, 0), xtmp); + movdqu(Address(base, 16), xtmp); + } + addptr(base, 32); + subptr(cnt, 4); + + BIND(L_tail); + addptr(cnt, 4); + jccb(Assembler::lessEqual, L_end); + decrement(cnt); + + BIND(L_sloop); + movq(Address(base, 0), xtmp); + addptr(base, 8); + decrement(cnt); + jccb(Assembler::greaterEqual, L_sloop); + BIND(L_end); +} + +void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) { // cnt - number of qwords (8-byte words). // base - start address, qword aligned. // is_large - if optimizers know cnt is larger than InitArrayShortSize @@ -6789,7 +6841,9 @@ Label DONE; - xorptr(tmp, tmp); + if (!is_large || !UseXMMForObjInit) { + xorptr(tmp, tmp); + } if (!is_large) { Label LOOP, LONG; @@ -6815,6 +6869,9 @@ if (UseFastStosb) { shlptr(cnt, 3); // convert to number of bytes rep_stosb(); + } else if (UseXMMForObjInit) { + movptr(tmp, base); + xmm_clear_mem(tmp, cnt, xtmp); } else { NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM rep_stos();
--- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp Tue Jun 12 21:29:47 2018 -0700 @@ -1578,7 +1578,10 @@ // clear memory of size 'cnt' qwords, starting at 'base'; // if 'is_large' is set, do not try to produce short loop - void clear_mem(Register base, Register cnt, Register rtmp, bool is_large); + void clear_mem(Register base, Register cnt, Register rtmp, XMMRegister xtmp, bool is_large); + + // clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers + void xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp); #ifdef COMPILER2 void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Tue Jun 12 21:29:47 2018 -0700 @@ -1396,6 +1396,16 @@ FLAG_SET_DEFAULT(UseFastStosb, false); } + // Use XMM/YMM MOVDQU instruction for Object Initialization + if (!UseFastStosb && UseSSE >= 2 && UseUnalignedLoadStores) { + if (FLAG_IS_DEFAULT(UseXMMForObjInit)) { + UseXMMForObjInit = true; + } + } else if (UseXMMForObjInit) { + warning("UseXMMForObjInit requires SSE2 and unaligned load/stores. Feature is switched off."); + FLAG_SET_DEFAULT(UseXMMForObjInit, false); + } + #ifdef COMPILER2 if (FLAG_IS_DEFAULT(AlignVector)) { // Modern processors allow misaligned memory operations for vectors.
--- a/src/hotspot/cpu/x86/x86_32.ad Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/x86_32.ad Tue Jun 12 21:29:47 2018 -0700 @@ -11482,10 +11482,10 @@ // ======================================================================= // fast clearing of an array -instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ +instruct rep_stos(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ predicate(!((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); format %{ $$template $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" @@ -11502,6 +11502,32 @@ if (UseFastStosb) { $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t" $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"MOV RDI,RAX\n\t" + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t" + $$emit$$"JMPQ L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t" + $$emit$$"ADD 0x40,RAX\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"SUB 0x8,RCX\n\t" + $$emit$$"JGE L_loop\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JL L_tail\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"ADD 0x20,RAX\n\t" + $$emit$$"SUB 0x4,RCX\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JLE L_end\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"VMOVQ XMM0,(RAX)\n\t" + $$emit$$"ADD 0x8,RAX\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"JGE L_sloop\n\t" + $$emit$$"# L_end:\n\t" } else { $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t" $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t" @@ -11509,28 +11535,57 @@ $$emit$$"# DONE" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); - %} - ins_pipe( pipe_slow ); -%} - -instruct rep_stos_large(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, false); + %} + ins_pipe( pipe_slow ); +%} + +instruct rep_stos_large(eCXRegI cnt, eDIRegP base, regD tmp, eAXRegI zero, Universe dummy, eFlagsReg cr) %{ predicate(((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); format %{ $$template - $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" if (UseFastStosb) { + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" $$emit$$"SHL ECX,3\t# Convert doublewords to bytes\n\t" $$emit$$"REP STOSB\t# store EAX into [EDI++] while ECX--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"MOV RDI,RAX\t# ClearArray:\n\t" + $$emit$$"VPXOR YMM0,YMM0,YMM0\n\t" + $$emit$$"JMPQ L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"VMOVDQU YMM0,0x20(RAX)\n\t" + $$emit$$"ADD 0x40,RAX\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"SUB 0x8,RCX\n\t" + $$emit$$"JGE L_loop\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JL L_tail\n\t" + $$emit$$"VMOVDQU YMM0,(RAX)\n\t" + $$emit$$"ADD 0x20,RAX\n\t" + $$emit$$"SUB 0x4,RCX\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"ADD 0x4,RCX\n\t" + $$emit$$"JLE L_end\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"VMOVQ XMM0,(RAX)\n\t" + $$emit$$"ADD 0x8,RAX\n\t" + $$emit$$"DEC RCX\n\t" + $$emit$$"JGE L_sloop\n\t" + $$emit$$"# L_end:\n\t" } else { + $$emit$$"XOR EAX,EAX\t# ClearArray:\n\t" $$emit$$"SHL ECX,1\t# Convert doublewords to words\n\t" $$emit$$"REP STOS\t# store EAX into [EDI++] while ECX--\n\t" } $$emit$$"# DONE" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, true); %} ins_pipe( pipe_slow ); %}
--- a/src/hotspot/cpu/x86/x86_64.ad Mon Jun 11 14:06:50 2018 -0700 +++ b/src/hotspot/cpu/x86/x86_64.ad Tue Jun 12 21:29:47 2018 -0700 @@ -10770,12 +10770,12 @@ // ======================================================================= // fast clearing of an array -instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, - rFlagsReg cr) +instruct rep_stos(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, + Universe dummy, rFlagsReg cr) %{ predicate(!((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); format %{ $$template $$emit$$"xorq rax, rax\t# ClearArray:\n\t" @@ -10791,35 +10791,90 @@ if (UseFastStosb) { $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t" $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--\n\t" + } else if (UseXMMForObjInit) { + $$emit$$"mov rdi,rax\n\t" + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t" + $$emit$$"jmpq L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t" + $$emit$$"add 0x40,rax\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"sub 0x8,rcx\n\t" + $$emit$$"jge L_loop\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jl L_tail\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"add 0x20,rax\n\t" + $$emit$$"sub 0x4,rcx\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jle L_end\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"vmovq xmm0,(rax)\n\t" + $$emit$$"add 0x8,rax\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"jge L_sloop\n\t" + $$emit$$"# L_end:\n\t" } else { $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--\n\t" } $$emit$$"# DONE" %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, false); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, false); %} ins_pipe(pipe_slow); %} -instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy, - rFlagsReg cr) +instruct rep_stos_large(rcx_RegL cnt, rdi_RegP base, regD tmp, rax_RegI zero, + Universe dummy, rFlagsReg cr) %{ predicate(((ClearArrayNode*)n)->is_large()); match(Set dummy (ClearArray cnt base)); - effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr); + effect(USE_KILL cnt, USE_KILL base, TEMP tmp, KILL zero, KILL cr); format %{ $$template - $$emit$$"xorq rax, rax\t# ClearArray:\n\t" if (UseFastStosb) { + $$emit$$"xorq rax, rax\t# ClearArray:\n\t" $$emit$$"shlq rcx,3\t# Convert doublewords to bytes\n\t" $$emit$$"rep stosb\t# Store rax to *rdi++ while rcx--" + } else if (UseXMMForObjInit) { + $$emit$$"mov rdi,rax\t# ClearArray:\n\t" + $$emit$$"vpxor ymm0,ymm0,ymm0\n\t" + $$emit$$"jmpq L_zero_64_bytes\n\t" + $$emit$$"# L_loop:\t# 64-byte LOOP\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"vmovdqu ymm0,0x20(rax)\n\t" + $$emit$$"add 0x40,rax\n\t" + $$emit$$"# L_zero_64_bytes:\n\t" + $$emit$$"sub 0x8,rcx\n\t" + $$emit$$"jge L_loop\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jl L_tail\n\t" + $$emit$$"vmovdqu ymm0,(rax)\n\t" + $$emit$$"add 0x20,rax\n\t" + $$emit$$"sub 0x4,rcx\n\t" + $$emit$$"# L_tail:\t# Clearing tail bytes\n\t" + $$emit$$"add 0x4,rcx\n\t" + $$emit$$"jle L_end\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"# L_sloop:\t# 8-byte short loop\n\t" + $$emit$$"vmovq xmm0,(rax)\n\t" + $$emit$$"add 0x8,rax\n\t" + $$emit$$"dec rcx\n\t" + $$emit$$"jge L_sloop\n\t" + $$emit$$"# L_end:\n\t" } else { + $$emit$$"xorq rax, rax\t# ClearArray:\n\t" $$emit$$"rep stosq\t# Store rax to *rdi++ while rcx--" } %} ins_encode %{ - __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, true); + __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register, + $tmp$$XMMRegister, true); %} ins_pipe(pipe_slow); %}