alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Java example source code file (x86_32.ad)

This example Java source code file (x86_32.ad) is included in the alvinalexander.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Learn more about this Java project at its project page.

Java - Java tags/keywords

address, binary, esp, fld, high_from_low, kill, mov, opcode, opcp, register, regmem, usesse, xmmregister

The x86_32.ad Java example source code

//
// Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//
//

// X86 Architecture Description File

//----------REGISTER DEFINITION BLOCK------------------------------------------
// This information is used by the matcher and the register allocator to
// describe individual registers and classes of registers within the target
// archtecture.

register %{
//----------Architecture Description Register Definitions----------------------
// General Registers
// "reg_def"  name ( register save type, C convention save type,
//                   ideal register type, encoding );
// Register Save Types:
//
// NS  = No-Save:       The register allocator assumes that these registers
//                      can be used without saving upon entry to the method, &
//                      that they do not need to be saved at call sites.
//
// SOC = Save-On-Call:  The register allocator assumes that these registers
//                      can be used without saving upon entry to the method,
//                      but that they must be saved at call sites.
//
// SOE = Save-On-Entry: The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, but they do not need to be saved at call
//                      sites.
//
// AS  = Always-Save:   The register allocator assumes that these registers
//                      must be saved before using them upon entry to the
//                      method, & that they must be saved at call sites.
//
// Ideal Register Type is used to determine how to save & restore a
// register.  Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get
// spilled with LoadP/StoreP.  If the register supports both, use Op_RegI.
//
// The encoding number is the actual bit-pattern placed into the opcodes.

// General Registers
// Previously set EBX, ESI, and EDI as save-on-entry for java code
// Turn off SOE in java-code due to frequent use of uncommon-traps.
// Now that allocator is better, turn on ESI and EDI as SOE registers.

reg_def EBX(SOC, SOE, Op_RegI, 3, rbx->as_VMReg());
reg_def ECX(SOC, SOC, Op_RegI, 1, rcx->as_VMReg());
reg_def ESI(SOC, SOE, Op_RegI, 6, rsi->as_VMReg());
reg_def EDI(SOC, SOE, Op_RegI, 7, rdi->as_VMReg());
// now that adapter frames are gone EBP is always saved and restored by the prolog/epilog code
reg_def EBP(NS, SOE, Op_RegI, 5, rbp->as_VMReg());
reg_def EDX(SOC, SOC, Op_RegI, 2, rdx->as_VMReg());
reg_def EAX(SOC, SOC, Op_RegI, 0, rax->as_VMReg());
reg_def ESP( NS,  NS, Op_RegI, 4, rsp->as_VMReg());

// Float registers.  We treat TOS/FPR0 special.  It is invisible to the
// allocator, and only shows up in the encodings.
reg_def FPR0L( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
reg_def FPR0H( SOC, SOC, Op_RegF, 0, VMRegImpl::Bad());
// Ok so here's the trick FPR1 is really st(0) except in the midst
// of emission of assembly for a machnode. During the emission the fpu stack
// is pushed making FPR1 == st(1) temporarily. However at any safepoint
// the stack will not have this element so FPR1 == st(0) from the
// oopMap viewpoint. This same weirdness with numbering causes
// instruction encoding to have to play games with the register
// encode to correct for this 0/1 issue. See MachSpillCopyNode::implementation
// where it does flt->flt moves to see an example
//
reg_def FPR1L( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg());
reg_def FPR1H( SOC, SOC, Op_RegF, 1, as_FloatRegister(0)->as_VMReg()->next());
reg_def FPR2L( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg());
reg_def FPR2H( SOC, SOC, Op_RegF, 2, as_FloatRegister(1)->as_VMReg()->next());
reg_def FPR3L( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg());
reg_def FPR3H( SOC, SOC, Op_RegF, 3, as_FloatRegister(2)->as_VMReg()->next());
reg_def FPR4L( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg());
reg_def FPR4H( SOC, SOC, Op_RegF, 4, as_FloatRegister(3)->as_VMReg()->next());
reg_def FPR5L( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg());
reg_def FPR5H( SOC, SOC, Op_RegF, 5, as_FloatRegister(4)->as_VMReg()->next());
reg_def FPR6L( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg());
reg_def FPR6H( SOC, SOC, Op_RegF, 6, as_FloatRegister(5)->as_VMReg()->next());
reg_def FPR7L( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg());
reg_def FPR7H( SOC, SOC, Op_RegF, 7, as_FloatRegister(6)->as_VMReg()->next());

// Specify priority of register selection within phases of register
// allocation.  Highest priority is first.  A useful heuristic is to
// give registers a low priority when they are required by machine
// instructions, like EAX and EDX.  Registers which are used as
// pairs must fall on an even boundary (witness the FPR#L's in this list).
// For the Intel integer registers, the equivalent Long pairs are
// EDX:EAX, EBX:ECX, and EDI:EBP.
alloc_class chunk0( ECX,   EBX,   EBP,   EDI,   EAX,   EDX,   ESI, ESP,
                    FPR0L, FPR0H, FPR1L, FPR1H, FPR2L, FPR2H,
                    FPR3L, FPR3H, FPR4L, FPR4H, FPR5L, FPR5H,
                    FPR6L, FPR6H, FPR7L, FPR7H );


//----------Architecture Description Register Classes--------------------------
// Several register classes are automatically defined based upon information in
// this architecture description.
// 1) reg_class inline_cache_reg           ( /* as def'd in frame section */ )
// 2) reg_class compiler_method_oop_reg    ( /* as def'd in frame section */ )
// 2) reg_class interpreter_method_oop_reg ( /* as def'd in frame section */ )
// 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ )
//
// Class for all registers
reg_class any_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX, ESP);
// Class for general registers
reg_class int_reg(EAX, EDX, EBP, EDI, ESI, ECX, EBX);
// Class for general registers which may be used for implicit null checks on win95
// Also safe for use by tailjump. We don't want to allocate in rbp,
reg_class int_reg_no_rbp(EAX, EDX, EDI, ESI, ECX, EBX);
// Class of "X" registers
reg_class int_x_reg(EBX, ECX, EDX, EAX);
// Class of registers that can appear in an address with no offset.
// EBP and ESP require an extra instruction byte for zero offset.
// Used in fast-unlock
reg_class p_reg(EDX, EDI, ESI, EBX);
// Class for general registers not including ECX
reg_class ncx_reg(EAX, EDX, EBP, EDI, ESI, EBX);
// Class for general registers not including EAX
reg_class nax_reg(EDX, EDI, ESI, ECX, EBX);
// Class for general registers not including EAX or EBX.
reg_class nabx_reg(EDX, EDI, ESI, ECX, EBP);
// Class of EAX (for multiply and divide operations)
reg_class eax_reg(EAX);
// Class of EBX (for atomic add)
reg_class ebx_reg(EBX);
// Class of ECX (for shift and JCXZ operations and cmpLTMask)
reg_class ecx_reg(ECX);
// Class of EDX (for multiply and divide operations)
reg_class edx_reg(EDX);
// Class of EDI (for synchronization)
reg_class edi_reg(EDI);
// Class of ESI (for synchronization)
reg_class esi_reg(ESI);
// Singleton class for interpreter's stack pointer
reg_class ebp_reg(EBP);
// Singleton class for stack pointer
reg_class sp_reg(ESP);
// Singleton class for instruction pointer
// reg_class ip_reg(EIP);
// Class of integer register pairs
reg_class long_reg( EAX,EDX, ECX,EBX, EBP,EDI );
// Class of integer register pairs that aligns with calling convention
reg_class eadx_reg( EAX,EDX );
reg_class ebcx_reg( ECX,EBX );
// Not AX or DX, used in divides
reg_class nadx_reg( EBX,ECX,ESI,EDI,EBP );

// Floating point registers.  Notice FPR0 is not a choice.
// FPR0 is not ever allocated; we use clever encodings to fake
// a 2-address instructions out of Intels FP stack.
reg_class fp_flt_reg( FPR1L,FPR2L,FPR3L,FPR4L,FPR5L,FPR6L,FPR7L );

reg_class fp_dbl_reg( FPR1L,FPR1H, FPR2L,FPR2H, FPR3L,FPR3H,
                      FPR4L,FPR4H, FPR5L,FPR5H, FPR6L,FPR6H,
                      FPR7L,FPR7H );

reg_class fp_flt_reg0( FPR1L );
reg_class fp_dbl_reg0( FPR1L,FPR1H );
reg_class fp_dbl_reg1( FPR2L,FPR2H );
reg_class fp_dbl_notreg0( FPR2L,FPR2H, FPR3L,FPR3H, FPR4L,FPR4H,
                          FPR5L,FPR5H, FPR6L,FPR6H, FPR7L,FPR7H );

%}


//----------SOURCE BLOCK-------------------------------------------------------
// This is a block of C++ code which provides values, functions, and
// definitions necessary in the rest of the architecture description
source_hpp %{
// Must be visible to the DFA in dfa_x86_32.cpp
extern bool is_operand_hi32_zero(Node* n);
%}

source %{
#define   RELOC_IMM32    Assembler::imm_operand
#define   RELOC_DISP32   Assembler::disp32_operand

#define __ _masm.

// How to find the high register of a Long pair, given the low register
#define   HIGH_FROM_LOW(x) ((x)+2)

// These masks are used to provide 128-bit aligned bitmasks to the XMM
// instructions, to allow sign-masking or sign-bit flipping.  They allow
// fast versions of NegF/NegD and AbsF/AbsD.

// Note: 'double' and 'long long' have 32-bits alignment on x86.
static jlong* double_quadword(jlong *adr, jlong lo, jlong hi) {
  // Use the expression (adr)&(~0xF) to provide 128-bits aligned address
  // of 128-bits operands for SSE instructions.
  jlong *operand = (jlong*)(((uintptr_t)adr)&((uintptr_t)(~0xF)));
  // Store the value to a 128-bits operand.
  operand[0] = lo;
  operand[1] = hi;
  return operand;
}

// Buffer for 128-bits masks used by SSE instructions.
static jlong fp_signmask_pool[(4+1)*2]; // 4*128bits(data) + 128bits(alignment)

// Static initialization during VM startup.
static jlong *float_signmask_pool  = double_quadword(&fp_signmask_pool[1*2], CONST64(0x7FFFFFFF7FFFFFFF), CONST64(0x7FFFFFFF7FFFFFFF));
static jlong *double_signmask_pool = double_quadword(&fp_signmask_pool[2*2], CONST64(0x7FFFFFFFFFFFFFFF), CONST64(0x7FFFFFFFFFFFFFFF));
static jlong *float_signflip_pool  = double_quadword(&fp_signmask_pool[3*2], CONST64(0x8000000080000000), CONST64(0x8000000080000000));
static jlong *double_signflip_pool = double_quadword(&fp_signmask_pool[4*2], CONST64(0x8000000000000000), CONST64(0x8000000000000000));

// Offset hacking within calls.
static int pre_call_resets_size() {
  int size = 0;
  Compile* C = Compile::current();
  if (C->in_24_bit_fp_mode()) {
    size += 6; // fldcw
  }
  if (C->max_vector_size() > 16) {
    size += 3; // vzeroupper
  }
  return size;
}

static int preserve_SP_size() {
  return 2;  // op, rm(reg/reg)
}

// !!!!! Special hack to get all type of calls to specify the byte offset
//       from the start of the call to the point where the return address
//       will point.
int MachCallStaticJavaNode::ret_addr_offset() {
  int offset = 5 + pre_call_resets_size();  // 5 bytes from start of call to where return address points
  if (_method_handle_invoke)
    offset += preserve_SP_size();
  return offset;
}

int MachCallDynamicJavaNode::ret_addr_offset() {
  return 10 + pre_call_resets_size();  // 10 bytes from start of call to where return address points
}

static int sizeof_FFree_Float_Stack_All = -1;

int MachCallRuntimeNode::ret_addr_offset() {
  assert(sizeof_FFree_Float_Stack_All != -1, "must have been emitted already");
  return sizeof_FFree_Float_Stack_All + 5 + pre_call_resets_size();
}

// Indicate if the safepoint node needs the polling page as an input.
// Since x86 does have absolute addressing, it doesn't.
bool SafePointNode::needs_polling_address_input() {
  return false;
}

//
// Compute padding required for nodes which need alignment
//

// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaDirectNode::compute_padding(int current_offset) const {
  current_offset += pre_call_resets_size();  // skip fldcw, if any
  current_offset += 1;      // skip call opcode byte
  return round_to(current_offset, alignment_required()) - current_offset;
}

// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallStaticJavaHandleNode::compute_padding(int current_offset) const {
  current_offset += pre_call_resets_size();  // skip fldcw, if any
  current_offset += preserve_SP_size();   // skip mov rbp, rsp
  current_offset += 1;      // skip call opcode byte
  return round_to(current_offset, alignment_required()) - current_offset;
}

// The address of the call instruction needs to be 4-byte aligned to
// ensure that it does not span a cache line so that it can be patched.
int CallDynamicJavaDirectNode::compute_padding(int current_offset) const {
  current_offset += pre_call_resets_size();  // skip fldcw, if any
  current_offset += 5;      // skip MOV instruction
  current_offset += 1;      // skip call opcode byte
  return round_to(current_offset, alignment_required()) - current_offset;
}

// EMIT_RM()
void emit_rm(CodeBuffer &cbuf, int f1, int f2, int f3) {
  unsigned char c = (unsigned char)((f1 << 6) | (f2 << 3) | f3);
  cbuf.insts()->emit_int8(c);
}

// EMIT_CC()
void emit_cc(CodeBuffer &cbuf, int f1, int f2) {
  unsigned char c = (unsigned char)( f1 | f2 );
  cbuf.insts()->emit_int8(c);
}

// EMIT_OPCODE()
void emit_opcode(CodeBuffer &cbuf, int code) {
  cbuf.insts()->emit_int8((unsigned char) code);
}

// EMIT_OPCODE() w/ relocation information
void emit_opcode(CodeBuffer &cbuf, int code, relocInfo::relocType reloc, int offset = 0) {
  cbuf.relocate(cbuf.insts_mark() + offset, reloc);
  emit_opcode(cbuf, code);
}

// EMIT_D8()
void emit_d8(CodeBuffer &cbuf, int d8) {
  cbuf.insts()->emit_int8((unsigned char) d8);
}

// EMIT_D16()
void emit_d16(CodeBuffer &cbuf, int d16) {
  cbuf.insts()->emit_int16(d16);
}

// EMIT_D32()
void emit_d32(CodeBuffer &cbuf, int d32) {
  cbuf.insts()->emit_int32(d32);
}

// emit 32 bit value and construct relocation entry from relocInfo::relocType
void emit_d32_reloc(CodeBuffer &cbuf, int d32, relocInfo::relocType reloc,
        int format) {
  cbuf.relocate(cbuf.insts_mark(), reloc, format);
  cbuf.insts()->emit_int32(d32);
}

// emit 32 bit value and construct relocation entry from RelocationHolder
void emit_d32_reloc(CodeBuffer &cbuf, int d32, RelocationHolder const& rspec,
        int format) {
#ifdef ASSERT
  if (rspec.reloc()->type() == relocInfo::oop_type && d32 != 0 && d32 != (int)Universe::non_oop_word()) {
    assert(cast_to_oop(d32)->is_oop() && (ScavengeRootsInCode || !cast_to_oop(d32)->is_scavengable()), "cannot embed scavengable oops in code");
  }
#endif
  cbuf.relocate(cbuf.insts_mark(), rspec, format);
  cbuf.insts()->emit_int32(d32);
}

// Access stack slot for load or store
void store_to_stackslot(CodeBuffer &cbuf, int opcode, int rm_field, int disp) {
  emit_opcode( cbuf, opcode );               // (e.g., FILD   [ESP+src])
  if( -128 <= disp && disp <= 127 ) {
    emit_rm( cbuf, 0x01, rm_field, ESP_enc );  // R/M byte
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte
    emit_d8 (cbuf, disp);     // Displacement  // R/M byte
  } else {
    emit_rm( cbuf, 0x02, rm_field, ESP_enc );  // R/M byte
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);    // SIB byte
    emit_d32(cbuf, disp);     // Displacement  // R/M byte
  }
}

   // rRegI ereg, memory mem) %{    // emit_reg_mem
void encode_RegMem( CodeBuffer &cbuf, int reg_encoding, int base, int index, int scale, int displace, relocInfo::relocType disp_reloc ) {
  // There is no index & no scale, use form without SIB byte
  if ((index == 0x4) &&
      (scale == 0) && (base != ESP_enc)) {
    // If no displacement, mode is 0x0; unless base is [EBP]
    if ( (displace == 0) && (base != EBP_enc) ) {
      emit_rm(cbuf, 0x0, reg_encoding, base);
    }
    else {                    // If 8-bit displacement, mode 0x1
      if ((displace >= -128) && (displace <= 127)
          && (disp_reloc == relocInfo::none) ) {
        emit_rm(cbuf, 0x1, reg_encoding, base);
        emit_d8(cbuf, displace);
      }
      else {                  // If 32-bit displacement
        if (base == -1) { // Special flag for absolute address
          emit_rm(cbuf, 0x0, reg_encoding, 0x5);
          // (manual lies; no SIB needed here)
          if ( disp_reloc != relocInfo::none ) {
            emit_d32_reloc(cbuf, displace, disp_reloc, 1);
          } else {
            emit_d32      (cbuf, displace);
          }
        }
        else {                // Normal base + offset
          emit_rm(cbuf, 0x2, reg_encoding, base);
          if ( disp_reloc != relocInfo::none ) {
            emit_d32_reloc(cbuf, displace, disp_reloc, 1);
          } else {
            emit_d32      (cbuf, displace);
          }
        }
      }
    }
  }
  else {                      // Else, encode with the SIB byte
    // If no displacement, mode is 0x0; unless base is [EBP]
    if (displace == 0 && (base != EBP_enc)) {  // If no displacement
      emit_rm(cbuf, 0x0, reg_encoding, 0x4);
      emit_rm(cbuf, scale, index, base);
    }
    else {                    // If 8-bit displacement, mode 0x1
      if ((displace >= -128) && (displace <= 127)
          && (disp_reloc == relocInfo::none) ) {
        emit_rm(cbuf, 0x1, reg_encoding, 0x4);
        emit_rm(cbuf, scale, index, base);
        emit_d8(cbuf, displace);
      }
      else {                  // If 32-bit displacement
        if (base == 0x04 ) {
          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
          emit_rm(cbuf, scale, index, 0x04);
        } else {
          emit_rm(cbuf, 0x2, reg_encoding, 0x4);
          emit_rm(cbuf, scale, index, base);
        }
        if ( disp_reloc != relocInfo::none ) {
          emit_d32_reloc(cbuf, displace, disp_reloc, 1);
        } else {
          emit_d32      (cbuf, displace);
        }
      }
    }
  }
}


void encode_Copy( CodeBuffer &cbuf, int dst_encoding, int src_encoding ) {
  if( dst_encoding == src_encoding ) {
    // reg-reg copy, use an empty encoding
  } else {
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, dst_encoding, src_encoding );
  }
}

void emit_cmpfp_fixup(MacroAssembler& _masm) {
  Label exit;
  __ jccb(Assembler::noParity, exit);
  __ pushf();
  //
  // comiss/ucomiss instructions set ZF,PF,CF flags and
  // zero OF,AF,SF for NaN values.
  // Fixup flags by zeroing ZF,PF so that compare of NaN
  // values returns 'less than' result (CF is set).
  // Leave the rest of flags unchanged.
  //
  //    7 6 5 4 3 2 1 0
  //   |S|Z|r|A|r|P|r|C|  (r - reserved bit)
  //    0 0 1 0 1 0 1 1   (0x2B)
  //
  __ andl(Address(rsp, 0), 0xffffff2b);
  __ popf();
  __ bind(exit);
}

void emit_cmpfp3(MacroAssembler& _masm, Register dst) {
  Label done;
  __ movl(dst, -1);
  __ jcc(Assembler::parity, done);
  __ jcc(Assembler::below, done);
  __ setb(Assembler::notEqual, dst);
  __ movzbl(dst, dst);
  __ bind(done);
}


//=============================================================================
const RegMask& MachConstantBaseNode::_out_RegMask = RegMask::Empty;

int Compile::ConstantTable::calculate_table_base_offset() const {
  return 0;  // absolute addressing, no offset
}

void MachConstantBaseNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const {
  // Empty encoding
}

uint MachConstantBaseNode::size(PhaseRegAlloc* ra_) const {
  return 0;
}

#ifndef PRODUCT
void MachConstantBaseNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
  st->print("# MachConstantBaseNode (empty encoding)");
}
#endif


//=============================================================================
#ifndef PRODUCT
void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
  Compile* C = ra_->C;

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove wordSize for return addr which is already pushed.
  framesize -= wordSize;

  if (C->need_stack_bang(framesize)) {
    framesize -= wordSize;
    st->print("# stack bang");
    st->print("\n\t");
    st->print("PUSH   EBP\t# Save EBP");
    if (framesize) {
      st->print("\n\t");
      st->print("SUB    ESP, #%d\t# Create frame",framesize);
    }
  } else {
    st->print("SUB    ESP, #%d\t# Create frame",framesize);
    st->print("\n\t");
    framesize -= wordSize;
    st->print("MOV    [ESP + #%d], EBP\t# Save EBP",framesize);
  }

  if (VerifyStackAtCalls) {
    st->print("\n\t");
    framesize -= wordSize;
    st->print("MOV    [ESP + #%d], 0xBADB100D\t# Majik cookie for stack depth check",framesize);
  }

  if( C->in_24_bit_fp_mode() ) {
    st->print("\n\t");
    st->print("FLDCW  \t# load 24 bit fpu control word");
  }
  if (UseSSE >= 2 && VerifyFPU) {
    st->print("\n\t");
    st->print("# verify FPU stack (must be clean on entry)");
  }

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    st->print("\n\t");
    st->print("# stack alignment check");
  }
#endif
  st->cr();
}
#endif


void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile* C = ra_->C;
  MacroAssembler _masm(&cbuf);

  int framesize = C->frame_slots() << LogBytesPerInt;

  __ verified_entry(framesize, C->need_stack_bang(framesize), C->in_24_bit_fp_mode());

  C->set_frame_complete(cbuf.insts_size());

  if (C->has_mach_constant_base_node()) {
    // NOTE: We set the table base offset here because users might be
    // emitted before MachConstantBaseNode.
    Compile::ConstantTable& constant_table = C->constant_table();
    constant_table.set_table_base_offset(constant_table.calculate_table_base_offset());
  }
}

uint MachPrologNode::size(PhaseRegAlloc *ra_) const {
  return MachNode::size(ra_); // too many variables; just compute it the hard way
}

int MachPrologNode::reloc() const {
  return 0; // a large enough number
}

//=============================================================================
#ifndef PRODUCT
void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  Compile *C = ra_->C;
  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  if (C->max_vector_size() > 16) {
    st->print("VZEROUPPER");
    st->cr(); st->print("\t");
  }
  if (C->in_24_bit_fp_mode()) {
    st->print("FLDCW  standard control word");
    st->cr(); st->print("\t");
  }
  if (framesize) {
    st->print("ADD    ESP,%d\t# Destroy frame",framesize);
    st->cr(); st->print("\t");
  }
  st->print_cr("POPL   EBP"); st->print("\t");
  if (do_polling() && C->is_method_compilation()) {
    st->print("TEST   PollPage,EAX\t! Poll Safepoint");
    st->cr(); st->print("\t");
  }
}
#endif

void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  Compile *C = ra_->C;

  if (C->max_vector_size() > 16) {
    // Clear upper bits of YMM registers when current compiled code uses
    // wide vectors to avoid AVX <-> SSE transition penalty during call.
    MacroAssembler masm(&cbuf);
    masm.vzeroupper();
  }
  // If method set FPU control word, restore to standard control word
  if (C->in_24_bit_fp_mode()) {
    MacroAssembler masm(&cbuf);
    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
  }

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  // Note that VerifyStackAtCalls' Majik cookie does not change the frame size popped here

  if (framesize >= 128) {
    emit_opcode(cbuf, 0x81); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d32(cbuf, framesize);
  } else if (framesize) {
    emit_opcode(cbuf, 0x83); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, framesize);
  }

  emit_opcode(cbuf, 0x58 | EBP_enc);

  if (do_polling() && C->is_method_compilation()) {
    cbuf.relocate(cbuf.insts_end(), relocInfo::poll_return_type, 0);
    emit_opcode(cbuf,0x85);
    emit_rm(cbuf, 0x0, EAX_enc, 0x5); // EAX
    emit_d32(cbuf, (intptr_t)os::get_polling_page());
  }
}

uint MachEpilogNode::size(PhaseRegAlloc *ra_) const {
  Compile *C = ra_->C;
  // If method set FPU control word, restore to standard control word
  int size = C->in_24_bit_fp_mode() ? 6 : 0;
  if (C->max_vector_size() > 16) size += 3; // vzeroupper
  if (do_polling() && C->is_method_compilation()) size += 6;

  int framesize = C->frame_slots() << LogBytesPerInt;
  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove two words for return addr and rbp,
  framesize -= 2*wordSize;

  size++; // popl rbp,

  if (framesize >= 128) {
    size += 6;
  } else {
    size += framesize ? 3 : 0;
  }
  return size;
}

int MachEpilogNode::reloc() const {
  return 0; // a large enough number
}

const Pipeline * MachEpilogNode::pipeline() const {
  return MachNode::pipeline_class();
}

int MachEpilogNode::safepoint_offset() const { return 0; }

//=============================================================================

enum RC { rc_bad, rc_int, rc_float, rc_xmm, rc_stack };
static enum RC rc_class( OptoReg::Name reg ) {

  if( !OptoReg::is_valid(reg)  ) return rc_bad;
  if (OptoReg::is_stack(reg)) return rc_stack;

  VMReg r = OptoReg::as_VMReg(reg);
  if (r->is_Register()) return rc_int;
  if (r->is_FloatRegister()) {
    assert(UseSSE < 2, "shouldn't be used in SSE2+ mode");
    return rc_float;
  }
  assert(r->is_XMMRegister(), "must be");
  return rc_xmm;
}

static int impl_helper( CodeBuffer *cbuf, bool do_size, bool is_load, int offset, int reg,
                        int opcode, const char *op_str, int size, outputStream* st ) {
  if( cbuf ) {
    emit_opcode  (*cbuf, opcode );
    encode_RegMem(*cbuf, Matcher::_regEncode[reg], ESP_enc, 0x4, 0, offset, relocInfo::none);
#ifndef PRODUCT
  } else if( !do_size ) {
    if( size != 0 ) st->print("\n\t");
    if( opcode == 0x8B || opcode == 0x89 ) { // MOV
      if( is_load ) st->print("%s   %s,[ESP + #%d]",op_str,Matcher::regName[reg],offset);
      else          st->print("%s   [ESP + #%d],%s",op_str,offset,Matcher::regName[reg]);
    } else { // FLD, FST, PUSH, POP
      st->print("%s [ESP + #%d]",op_str,offset);
    }
#endif
  }
  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
  return size+3+offset_size;
}

// Helper for XMM registers.  Extra opcode bits, limited syntax.
static int impl_x_helper( CodeBuffer *cbuf, bool do_size, bool is_load,
                         int offset, int reg_lo, int reg_hi, int size, outputStream* st ) {
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    if (reg_lo+1 == reg_hi) { // double move?
      if (is_load) {
        __ movdbl(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
      } else {
        __ movdbl(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
      }
    } else {
      if (is_load) {
        __ movflt(as_XMMRegister(Matcher::_regEncode[reg_lo]), Address(rsp, offset));
      } else {
        __ movflt(Address(rsp, offset), as_XMMRegister(Matcher::_regEncode[reg_lo]));
      }
    }
#ifndef PRODUCT
  } else if (!do_size) {
    if (size != 0) st->print("\n\t");
    if (reg_lo+1 == reg_hi) { // double move?
      if (is_load) st->print("%s %s,[ESP + #%d]",
                              UseXmmLoadAndClearUpper ? "MOVSD " : "MOVLPD",
                              Matcher::regName[reg_lo], offset);
      else         st->print("MOVSD  [ESP + #%d],%s",
                              offset, Matcher::regName[reg_lo]);
    } else {
      if (is_load) st->print("MOVSS  %s,[ESP + #%d]",
                              Matcher::regName[reg_lo], offset);
      else         st->print("MOVSS  [ESP + #%d],%s",
                              offset, Matcher::regName[reg_lo]);
    }
#endif
  }
  int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
  // VEX_2bytes prefix is used if UseAVX > 0, so it takes the same 2 bytes as SIMD prefix.
  return size+5+offset_size;
}


static int impl_movx_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                            int src_hi, int dst_hi, int size, outputStream* st ) {
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
      __ movdbl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
                as_XMMRegister(Matcher::_regEncode[src_lo]));
    } else {
      __ movflt(as_XMMRegister(Matcher::_regEncode[dst_lo]),
                as_XMMRegister(Matcher::_regEncode[src_lo]));
    }
#ifndef PRODUCT
  } else if (!do_size) {
    if (size != 0) st->print("\n\t");
    if (UseXmmRegToRegMoveAll) {//Use movaps,movapd to move between xmm registers
      if (src_lo+1 == src_hi && dst_lo+1 == dst_hi) { // double move?
        st->print("MOVAPD %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      } else {
        st->print("MOVAPS %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      }
    } else {
      if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double move?
        st->print("MOVSD  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      } else {
        st->print("MOVSS  %s,%s",Matcher::regName[dst_lo],Matcher::regName[src_lo]);
      }
    }
#endif
  }
  // VEX_2bytes prefix is used if UseAVX > 0, and it takes the same 2 bytes as SIMD prefix.
  // Only MOVAPS SSE prefix uses 1 byte.
  int sz = 4;
  if (!(src_lo+1 == src_hi && dst_lo+1 == dst_hi) &&
      UseXmmRegToRegMoveAll && (UseAVX == 0)) sz = 3;
  return size + sz;
}

static int impl_movgpr2x_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                            int src_hi, int dst_hi, int size, outputStream* st ) {
  // 32-bit
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    __ movdl(as_XMMRegister(Matcher::_regEncode[dst_lo]),
             as_Register(Matcher::_regEncode[src_lo]));
#ifndef PRODUCT
  } else if (!do_size) {
    st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
#endif
  }
  return 4;
}


static int impl_movx2gpr_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                                 int src_hi, int dst_hi, int size, outputStream* st ) {
  // 32-bit
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    __ movdl(as_Register(Matcher::_regEncode[dst_lo]),
             as_XMMRegister(Matcher::_regEncode[src_lo]));
#ifndef PRODUCT
  } else if (!do_size) {
    st->print("movdl   %s, %s\t# spill", Matcher::regName[dst_lo], Matcher::regName[src_lo]);
#endif
  }
  return 4;
}

static int impl_mov_helper( CodeBuffer *cbuf, bool do_size, int src, int dst, int size, outputStream* st ) {
  if( cbuf ) {
    emit_opcode(*cbuf, 0x8B );
    emit_rm    (*cbuf, 0x3, Matcher::_regEncode[dst], Matcher::_regEncode[src] );
#ifndef PRODUCT
  } else if( !do_size ) {
    if( size != 0 ) st->print("\n\t");
    st->print("MOV    %s,%s",Matcher::regName[dst],Matcher::regName[src]);
#endif
  }
  return size+2;
}

static int impl_fp_store_helper( CodeBuffer *cbuf, bool do_size, int src_lo, int src_hi, int dst_lo, int dst_hi,
                                 int offset, int size, outputStream* st ) {
  if( src_lo != FPR1L_num ) {      // Move value to top of FP stack, if not already there
    if( cbuf ) {
      emit_opcode( *cbuf, 0xD9 );  // FLD (i.e., push it)
      emit_d8( *cbuf, 0xC0-1+Matcher::_regEncode[src_lo] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("FLD    %s",Matcher::regName[src_lo]);
#endif
    }
    size += 2;
  }

  int st_op = (src_lo != FPR1L_num) ? EBX_num /*store & pop*/ : EDX_num /*store no pop*/;
  const char *op_str;
  int op;
  if( src_lo+1 == src_hi && dst_lo+1 == dst_hi ) { // double store?
    op_str = (src_lo != FPR1L_num) ? "FSTP_D" : "FST_D ";
    op = 0xDD;
  } else {                   // 32-bit store
    op_str = (src_lo != FPR1L_num) ? "FSTP_S" : "FST_S ";
    op = 0xD9;
    assert( !OptoReg::is_valid(src_hi) && !OptoReg::is_valid(dst_hi), "no non-adjacent float-stores" );
  }

  return impl_helper(cbuf,do_size,false,offset,st_op,op,op_str,size, st);
}

// Next two methods are shared by 32- and 64-bit VM. They are defined in x86.ad.
static int vec_mov_helper(CodeBuffer *cbuf, bool do_size, int src_lo, int dst_lo,
                          int src_hi, int dst_hi, uint ireg, outputStream* st);

static int vec_spill_helper(CodeBuffer *cbuf, bool do_size, bool is_load,
                            int stack_offset, int reg, uint ireg, outputStream* st);

static int vec_stack_to_stack_helper(CodeBuffer *cbuf, bool do_size, int src_offset,
                                     int dst_offset, uint ireg, outputStream* st) {
  int calc_size = 0;
  int src_offset_size = (src_offset == 0) ? 0 : ((src_offset < 0x80) ? 1 : 4);
  int dst_offset_size = (dst_offset == 0) ? 0 : ((dst_offset < 0x80) ? 1 : 4);
  switch (ireg) {
  case Op_VecS:
    calc_size = 3+src_offset_size + 3+dst_offset_size;
    break;
  case Op_VecD:
    calc_size = 3+src_offset_size + 3+dst_offset_size;
    src_offset += 4;
    dst_offset += 4;
    src_offset_size = (src_offset == 0) ? 0 : ((src_offset < 0x80) ? 1 : 4);
    dst_offset_size = (dst_offset == 0) ? 0 : ((dst_offset < 0x80) ? 1 : 4);
    calc_size += 3+src_offset_size + 3+dst_offset_size;
    break;
  case Op_VecX:
    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
    break;
  case Op_VecY:
    calc_size = 6 + 6 + 5+src_offset_size + 5+dst_offset_size;
    break;
  default:
    ShouldNotReachHere();
  }
  if (cbuf) {
    MacroAssembler _masm(cbuf);
    int offset = __ offset();
    switch (ireg) {
    case Op_VecS:
      __ pushl(Address(rsp, src_offset));
      __ popl (Address(rsp, dst_offset));
      break;
    case Op_VecD:
      __ pushl(Address(rsp, src_offset));
      __ popl (Address(rsp, dst_offset));
      __ pushl(Address(rsp, src_offset+4));
      __ popl (Address(rsp, dst_offset+4));
      break;
    case Op_VecX:
      __ movdqu(Address(rsp, -16), xmm0);
      __ movdqu(xmm0, Address(rsp, src_offset));
      __ movdqu(Address(rsp, dst_offset), xmm0);
      __ movdqu(xmm0, Address(rsp, -16));
      break;
    case Op_VecY:
      __ vmovdqu(Address(rsp, -32), xmm0);
      __ vmovdqu(xmm0, Address(rsp, src_offset));
      __ vmovdqu(Address(rsp, dst_offset), xmm0);
      __ vmovdqu(xmm0, Address(rsp, -32));
      break;
    default:
      ShouldNotReachHere();
    }
    int size = __ offset() - offset;
    assert(size == calc_size, "incorrect size calculattion");
    return size;
#ifndef PRODUCT
  } else if (!do_size) {
    switch (ireg) {
    case Op_VecS:
      st->print("pushl   [rsp + #%d]\t# 32-bit mem-mem spill\n\t"
                "popl    [rsp + #%d]",
                src_offset, dst_offset);
      break;
    case Op_VecD:
      st->print("pushl   [rsp + #%d]\t# 64-bit mem-mem spill\n\t"
                "popq    [rsp + #%d]\n\t"
                "pushl   [rsp + #%d]\n\t"
                "popq    [rsp + #%d]",
                src_offset, dst_offset, src_offset+4, dst_offset+4);
      break;
     case Op_VecX:
      st->print("movdqu  [rsp - #16], xmm0\t# 128-bit mem-mem spill\n\t"
                "movdqu  xmm0, [rsp + #%d]\n\t"
                "movdqu  [rsp + #%d], xmm0\n\t"
                "movdqu  xmm0, [rsp - #16]",
                src_offset, dst_offset);
      break;
    case Op_VecY:
      st->print("vmovdqu [rsp - #32], xmm0\t# 256-bit mem-mem spill\n\t"
                "vmovdqu xmm0, [rsp + #%d]\n\t"
                "vmovdqu [rsp + #%d], xmm0\n\t"
                "vmovdqu xmm0, [rsp - #32]",
                src_offset, dst_offset);
      break;
    default:
      ShouldNotReachHere();
    }
#endif
  }
  return calc_size;
}

uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, PhaseRegAlloc *ra_, bool do_size, outputStream* st ) const {
  // Get registers to move
  OptoReg::Name src_second = ra_->get_reg_second(in(1));
  OptoReg::Name src_first = ra_->get_reg_first(in(1));
  OptoReg::Name dst_second = ra_->get_reg_second(this );
  OptoReg::Name dst_first = ra_->get_reg_first(this );

  enum RC src_second_rc = rc_class(src_second);
  enum RC src_first_rc = rc_class(src_first);
  enum RC dst_second_rc = rc_class(dst_second);
  enum RC dst_first_rc = rc_class(dst_first);

  assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" );

  // Generate spill code!
  int size = 0;

  if( src_first == dst_first && src_second == dst_second )
    return size;            // Self copy, no move

  if (bottom_type()->isa_vect() != NULL) {
    uint ireg = ideal_reg();
    assert((src_first_rc != rc_int && dst_first_rc != rc_int), "sanity");
    assert((src_first_rc != rc_float && dst_first_rc != rc_float), "sanity");
    assert((ireg == Op_VecS || ireg == Op_VecD || ireg == Op_VecX || ireg == Op_VecY), "sanity");
    if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
      // mem -> mem
      int src_offset = ra_->reg2offset(src_first);
      int dst_offset = ra_->reg2offset(dst_first);
      return vec_stack_to_stack_helper(cbuf, do_size, src_offset, dst_offset, ireg, st);
    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
      return vec_mov_helper(cbuf, do_size, src_first, dst_first, src_second, dst_second, ireg, st);
    } else if (src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
      int stack_offset = ra_->reg2offset(dst_first);
      return vec_spill_helper(cbuf, do_size, false, stack_offset, src_first, ireg, st);
    } else if (src_first_rc == rc_stack && dst_first_rc == rc_xmm ) {
      int stack_offset = ra_->reg2offset(src_first);
      return vec_spill_helper(cbuf, do_size, true,  stack_offset, dst_first, ireg, st);
    } else {
      ShouldNotReachHere();
    }
  }

  // --------------------------------------
  // Check for mem-mem move.  push/pop to move.
  if( src_first_rc == rc_stack && dst_first_rc == rc_stack ) {
    if( src_second == dst_first ) { // overlapping stack copy ranges
      assert( src_second_rc == rc_stack && dst_second_rc == rc_stack, "we only expect a stk-stk copy here" );
      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
      src_second_rc = dst_second_rc = rc_bad;  // flag as already moved the second bits
    }
    // move low bits
    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),ESI_num,0xFF,"PUSH  ",size, st);
    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),EAX_num,0x8F,"POP   ",size, st);
    if( src_second_rc == rc_stack && dst_second_rc == rc_stack ) { // mov second bits
      size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),ESI_num,0xFF,"PUSH  ",size, st);
      size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),EAX_num,0x8F,"POP   ",size, st);
    }
    return size;
  }

  // --------------------------------------
  // Check for integer reg-reg copy
  if( src_first_rc == rc_int && dst_first_rc == rc_int )
    size = impl_mov_helper(cbuf,do_size,src_first,dst_first,size, st);

  // Check for integer store
  if( src_first_rc == rc_int && dst_first_rc == rc_stack )
    size = impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first,0x89,"MOV ",size, st);

  // Check for integer load
  if( dst_first_rc == rc_int && src_first_rc == rc_stack )
    size = impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first,0x8B,"MOV ",size, st);

  // Check for integer reg-xmm reg copy
  if( src_first_rc == rc_int && dst_first_rc == rc_xmm ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad),
            "no 64 bit integer-float reg moves" );
    return impl_movgpr2x_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size, st);
  }
  // --------------------------------------
  // Check for float reg-reg copy
  if( src_first_rc == rc_float && dst_first_rc == rc_float ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
            (src_first+1 == src_second && dst_first+1 == dst_second), "no non-adjacent float-moves" );
    if( cbuf ) {

      // Note the mucking with the register encode to compensate for the 0/1
      // indexing issue mentioned in a comment in the reg_def sections
      // for FPR registers many lines above here.

      if( src_first != FPR1L_num ) {
        emit_opcode  (*cbuf, 0xD9 );           // FLD    ST(i)
        emit_d8      (*cbuf, 0xC0+Matcher::_regEncode[src_first]-1 );
        emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
        emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
     } else {
        emit_opcode  (*cbuf, 0xDD );           // FST    ST(i)
        emit_d8      (*cbuf, 0xD0+Matcher::_regEncode[dst_first]-1 );
     }
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      if( src_first != FPR1L_num ) st->print("FLD    %s\n\tFSTP   %s",Matcher::regName[src_first],Matcher::regName[dst_first]);
      else                      st->print(             "FST    %s",                            Matcher::regName[dst_first]);
#endif
    }
    return size + ((src_first != FPR1L_num) ? 2+2 : 2);
  }

  // Check for float store
  if( src_first_rc == rc_float && dst_first_rc == rc_stack ) {
    return impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,ra_->reg2offset(dst_first),size, st);
  }

  // Check for float load
  if( dst_first_rc == rc_float && src_first_rc == rc_stack ) {
    int offset = ra_->reg2offset(src_first);
    const char *op_str;
    int op;
    if( src_first+1 == src_second && dst_first+1 == dst_second ) { // double load?
      op_str = "FLD_D";
      op = 0xDD;
    } else {                   // 32-bit load
      op_str = "FLD_S";
      op = 0xD9;
      assert( src_second_rc == rc_bad && dst_second_rc == rc_bad, "no non-adjacent float-loads" );
    }
    if( cbuf ) {
      emit_opcode  (*cbuf, op );
      encode_RegMem(*cbuf, 0x0, ESP_enc, 0x4, 0, offset, relocInfo::none);
      emit_opcode  (*cbuf, 0xDD );           // FSTP   ST(i)
      emit_d8      (*cbuf, 0xD8+Matcher::_regEncode[dst_first] );
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("%s  ST,[ESP + #%d]\n\tFSTP   %s",op_str, offset,Matcher::regName[dst_first]);
#endif
    }
    int offset_size = (offset == 0) ? 0 : ((offset <= 127) ? 1 : 4);
    return size + 3+offset_size+2;
  }

  // Check for xmm reg-reg copy
  if( src_first_rc == rc_xmm && dst_first_rc == rc_xmm ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad) ||
            (src_first+1 == src_second && dst_first+1 == dst_second),
            "no non-adjacent float-moves" );
    return impl_movx_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size, st);
  }

  // Check for xmm reg-integer reg copy
  if( src_first_rc == rc_xmm && dst_first_rc == rc_int ) {
    assert( (src_second_rc == rc_bad && dst_second_rc == rc_bad),
            "no 64 bit float-integer reg moves" );
    return impl_movx2gpr_helper(cbuf,do_size,src_first,dst_first,src_second, dst_second, size, st);
  }

  // Check for xmm store
  if( src_first_rc == rc_xmm && dst_first_rc == rc_stack ) {
    return impl_x_helper(cbuf,do_size,false,ra_->reg2offset(dst_first),src_first, src_second, size, st);
  }

  // Check for float xmm load
  if( dst_first_rc == rc_xmm && src_first_rc == rc_stack ) {
    return impl_x_helper(cbuf,do_size,true ,ra_->reg2offset(src_first),dst_first, dst_second, size, st);
  }

  // Copy from float reg to xmm reg
  if( dst_first_rc == rc_xmm && src_first_rc == rc_float ) {
    // copy to the top of stack from floating point reg
    // and use LEA to preserve flags
    if( cbuf ) {
      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP-8]
      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
      emit_d8(*cbuf,0xF8);
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("LEA    ESP,[ESP-8]");
#endif
    }
    size += 4;

    size = impl_fp_store_helper(cbuf,do_size,src_first,src_second,dst_first,dst_second,0,size, st);

    // Copy from the temp memory to the xmm reg.
    size = impl_x_helper(cbuf,do_size,true ,0,dst_first, dst_second, size, st);

    if( cbuf ) {
      emit_opcode(*cbuf,0x8D);  // LEA  ESP,[ESP+8]
      emit_rm(*cbuf, 0x1, ESP_enc, 0x04);
      emit_rm(*cbuf, 0x0, 0x04, ESP_enc);
      emit_d8(*cbuf,0x08);
#ifndef PRODUCT
    } else if( !do_size ) {
      if( size != 0 ) st->print("\n\t");
      st->print("LEA    ESP,[ESP+8]");
#endif
    }
    size += 4;
    return size;
  }

  assert( size > 0, "missed a case" );

  // --------------------------------------------------------------------
  // Check for second bits still needing moving.
  if( src_second == dst_second )
    return size;               // Self copy; no move
  assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" );

  // Check for second word int-int move
  if( src_second_rc == rc_int && dst_second_rc == rc_int )
    return impl_mov_helper(cbuf,do_size,src_second,dst_second,size, st);

  // Check for second word integer store
  if( src_second_rc == rc_int && dst_second_rc == rc_stack )
    return impl_helper(cbuf,do_size,false,ra_->reg2offset(dst_second),src_second,0x89,"MOV ",size, st);

  // Check for second word integer load
  if( dst_second_rc == rc_int && src_second_rc == rc_stack )
    return impl_helper(cbuf,do_size,true ,ra_->reg2offset(src_second),dst_second,0x8B,"MOV ",size, st);


  Unimplemented();
}

#ifndef PRODUCT
void MachSpillCopyNode::format(PhaseRegAlloc *ra_, outputStream* st) const {
  implementation( NULL, ra_, false, st );
}
#endif

void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  implementation( &cbuf, ra_, false, NULL );
}

uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
  return implementation( NULL, ra_, true, NULL );
}


//=============================================================================
#ifndef PRODUCT
void BoxLockNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  int reg = ra_->get_reg_first(this);
  st->print("LEA    %s,[ESP + #%d]",Matcher::regName[reg],offset);
}
#endif

void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  int reg = ra_->get_encode(this);
  if( offset >= 128 ) {
    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
    emit_rm(cbuf, 0x2, reg, 0x04);
    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
    emit_d32(cbuf, offset);
  }
  else {
    emit_opcode(cbuf, 0x8D);      // LEA  reg,[SP+offset]
    emit_rm(cbuf, 0x1, reg, 0x04);
    emit_rm(cbuf, 0x0, 0x04, ESP_enc);
    emit_d8(cbuf, offset);
  }
}

uint BoxLockNode::size(PhaseRegAlloc *ra_) const {
  int offset = ra_->reg2offset(in_RegMask(0).find_first_elem());
  if( offset >= 128 ) {
    return 7;
  }
  else {
    return 4;
  }
}

//=============================================================================
#ifndef PRODUCT
void MachUEPNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
  st->print_cr(  "CMP    EAX,[ECX+4]\t# Inline cache check");
  st->print_cr("\tJNE    SharedRuntime::handle_ic_miss_stub");
  st->print_cr("\tNOP");
  st->print_cr("\tNOP");
  if( !OptoBreakpoint )
    st->print_cr("\tNOP");
}
#endif

void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
  MacroAssembler masm(&cbuf);
#ifdef ASSERT
  uint insts_size = cbuf.insts_size();
#endif
  masm.cmpptr(rax, Address(rcx, oopDesc::klass_offset_in_bytes()));
  masm.jump_cc(Assembler::notEqual,
               RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
  /* WARNING these NOPs are critical so that verified entry point is properly
     aligned for patching by NativeJump::patch_verified_entry() */
  int nops_cnt = 2;
  if( !OptoBreakpoint ) // Leave space for int3
     nops_cnt += 1;
  masm.nop(nops_cnt);

  assert(cbuf.insts_size() - insts_size == size(ra_), "checking code size of inline cache node");
}

uint MachUEPNode::size(PhaseRegAlloc *ra_) const {
  return OptoBreakpoint ? 11 : 12;
}


//=============================================================================
uint size_exception_handler() {
  // NativeCall instruction size is the same as NativeJump.
  // exception handler starts out as jump and can be patched to
  // a call be deoptimization.  (4932387)
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
  return NativeJump::instruction_size;
}

// Emit exception handler code.  Stuff framesize into a register
// and call a VM stub routine.
int emit_exception_handler(CodeBuffer& cbuf) {

  // Note that the code buffer's insts_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  MacroAssembler _masm(&cbuf);
  address base =
  __ start_a_stub(size_exception_handler());
  if (base == NULL)  return 0;  // CodeBuffer::expand failed
  int offset = __ offset();
  __ jump(RuntimeAddress(OptoRuntime::exception_blob()->entry_point()));
  assert(__ offset() - offset <= (int) size_exception_handler(), "overflow");
  __ end_a_stub();
  return offset;
}

uint size_deopt_handler() {
  // NativeCall instruction size is the same as NativeJump.
  // exception handler starts out as jump and can be patched to
  // a call be deoptimization.  (4932387)
  // Note that this value is also credited (in output.cpp) to
  // the size of the code section.
  return 5 + NativeJump::instruction_size; // pushl(); jmp;
}

// Emit deopt handler code.
int emit_deopt_handler(CodeBuffer& cbuf) {

  // Note that the code buffer's insts_mark is always relative to insts.
  // That's why we must use the macroassembler to generate a handler.
  MacroAssembler _masm(&cbuf);
  address base =
  __ start_a_stub(size_exception_handler());
  if (base == NULL)  return 0;  // CodeBuffer::expand failed
  int offset = __ offset();
  InternalAddress here(__ pc());
  __ pushptr(here.addr());

  __ jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow");
  __ end_a_stub();
  return offset;
}

int Matcher::regnum_to_fpu_offset(int regnum) {
  return regnum - 32; // The FP registers are in the second chunk
}

// This is UltraSparc specific, true just means we have fast l2f conversion
const bool Matcher::convL2FSupported(void) {
  return true;
}

// Is this branch offset short enough that a short branch can be used?
//
// NOTE: If the platform does not provide any short branch variants, then
//       this method should return false for offset 0.
bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
  // The passed offset is relative to address of the branch.
  // On 86 a branch displacement is calculated relative to address
  // of a next instruction.
  offset -= br_size;

  // the short version of jmpConUCF2 contains multiple branches,
  // making the reach slightly less
  if (rule == jmpConUCF2_rule)
    return (-126 <= offset && offset <= 125);
  return (-128 <= offset && offset <= 127);
}

const bool Matcher::isSimpleConstant64(jlong value) {
  // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?.
  return false;
}

// The ecx parameter to rep stos for the ClearArray node is in dwords.
const bool Matcher::init_array_count_is_in_bytes = false;

// Threshold size for cleararray.
const int Matcher::init_array_short_size = 8 * BytesPerLong;

// Needs 2 CMOV's for longs.
const int Matcher::long_cmove_cost() { return 1; }

// No CMOVF/CMOVD with SSE/SSE2
const int Matcher::float_cmove_cost() { return (UseSSE>=1) ? ConditionalMoveLimit : 0; }

// Should the Matcher clone shifts on addressing modes, expecting them to
// be subsumed into complex addressing expressions or compute them into
// registers?  True for Intel but false for most RISCs
const bool Matcher::clone_shift_expressions = true;

// Do we need to mask the count passed to shift instructions or does
// the cpu only look at the lower 5/6 bits anyway?
const bool Matcher::need_masked_shift_count = false;

bool Matcher::narrow_oop_use_complex_address() {
  ShouldNotCallThis();
  return true;
}

bool Matcher::narrow_klass_use_complex_address() {
  ShouldNotCallThis();
  return true;
}


// Is it better to copy float constants, or load them directly from memory?
// Intel can load a float constant from a direct address, requiring no
// extra registers.  Most RISCs will have to materialize an address into a
// register first, so they would do better to copy the constant from stack.
const bool Matcher::rematerialize_float_constants = true;

// If CPU can load and store mis-aligned doubles directly then no fixup is
// needed.  Else we split the double into 2 integer pieces and move it
// piece-by-piece.  Only happens when passing doubles into C code as the
// Java calling convention forces doubles to be aligned.
const bool Matcher::misaligned_doubles_ok = true;


void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) {
  // Get the memory operand from the node
  uint numopnds = node->num_opnds();        // Virtual call for number of operands
  uint skipped  = node->oper_input_base();  // Sum of leaves skipped so far
  assert( idx >= skipped, "idx too low in pd_implicit_null_fixup" );
  uint opcnt     = 1;                 // First operand
  uint num_edges = node->_opnds[1]->num_edges(); // leaves for first operand
  while( idx >= skipped+num_edges ) {
    skipped += num_edges;
    opcnt++;                          // Bump operand count
    assert( opcnt < numopnds, "Accessing non-existent operand" );
    num_edges = node->_opnds[opcnt]->num_edges(); // leaves for next operand
  }

  MachOper *memory = node->_opnds[opcnt];
  MachOper *new_memory = NULL;
  switch (memory->opcode()) {
  case DIRECT:
  case INDOFFSET32X:
    // No transformation necessary.
    return;
  case INDIRECT:
    new_memory = new (C) indirect_win95_safeOper( );
    break;
  case INDOFFSET8:
    new_memory = new (C) indOffset8_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDOFFSET32:
    new_memory = new (C) indOffset32_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDINDEXOFFSET:
    new_memory = new (C) indIndexOffset_win95_safeOper(memory->disp(NULL, NULL, 0));
    break;
  case INDINDEXSCALE:
    new_memory = new (C) indIndexScale_win95_safeOper(memory->scale());
    break;
  case INDINDEXSCALEOFFSET:
    new_memory = new (C) indIndexScaleOffset_win95_safeOper(memory->scale(), memory->disp(NULL, NULL, 0));
    break;
  case LOAD_LONG_INDIRECT:
  case LOAD_LONG_INDOFFSET32:
    // Does not use EBP as address register, use { EDX, EBX, EDI, ESI}
    return;
  default:
    assert(false, "unexpected memory operand in pd_implicit_null_fixup()");
    return;
  }
  node->_opnds[opcnt] = new_memory;
}

// Advertise here if the CPU requires explicit rounding operations
// to implement the UseStrictFP mode.
const bool Matcher::strict_fp_requires_explicit_rounding = true;

// Are floats conerted to double when stored to stack during deoptimization?
// On x32 it is stored with convertion only when FPU is used for floats.
bool Matcher::float_in_double() { return (UseSSE == 0); }

// Do ints take an entire long register or just half?
const bool Matcher::int_in_long = false;

// Return whether or not this register is ever used as an argument.  This
// function is used on startup to build the trampoline stubs in generateOptoStub.
// Registers not mentioned will be killed by the VM call in the trampoline, and
// arguments in those registers not be available to the callee.
bool Matcher::can_be_java_arg( int reg ) {
  if(  reg == ECX_num   || reg == EDX_num   ) return true;
  if( (reg == XMM0_num  || reg == XMM1_num ) && UseSSE>=1 ) return true;
  if( (reg == XMM0b_num || reg == XMM1b_num) && UseSSE>=2 ) return true;
  return false;
}

bool Matcher::is_spillable_arg( int reg ) {
  return can_be_java_arg(reg);
}

bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
  // Use hardware integer DIV instruction when
  // it is faster than a code which use multiply.
  // Only when constant divisor fits into 32 bit
  // (min_jint is excluded to get only correct
  // positive 32 bit values from negative).
  return VM_Version::has_fast_idiv() &&
         (divisor == (int)divisor && divisor != min_jint);
}

// Register for DIVI projection of divmodI
RegMask Matcher::divI_proj_mask() {
  return EAX_REG_mask();
}

// Register for MODI projection of divmodI
RegMask Matcher::modI_proj_mask() {
  return EDX_REG_mask();
}

// Register for DIVL projection of divmodL
RegMask Matcher::divL_proj_mask() {
  ShouldNotReachHere();
  return RegMask();
}

// Register for MODL projection of divmodL
RegMask Matcher::modL_proj_mask() {
  ShouldNotReachHere();
  return RegMask();
}

const RegMask Matcher::method_handle_invoke_SP_save_mask() {
  return EBP_REG_mask();
}

const RegMask Matcher::mathExactI_result_proj_mask() {
  return EAX_REG_mask();
}

const RegMask Matcher::mathExactL_result_proj_mask() {
  ShouldNotReachHere();
  return RegMask();
}

const RegMask Matcher::mathExactI_flags_proj_mask() {
  return INT_FLAGS_mask();
}

// Returns true if the high 32 bits of the value is known to be zero.
bool is_operand_hi32_zero(Node* n) {
  int opc = n->Opcode();
  if (opc == Op_AndL) {
    Node* o2 = n->in(2);
    if (o2->is_Con() && (o2->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
      return true;
    }
  }
  if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
    return true;
  }
  return false;
}

%}

//----------ENCODING BLOCK-----------------------------------------------------
// This block specifies the encoding classes used by the compiler to output
// byte streams.  Encoding classes generate functions which are called by
// Machine Instruction Nodes in order to generate the bit encoding of the
// instruction.  Operands specify their base encoding interface with the
// interface keyword.  There are currently supported four interfaces,
// REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER.  REG_INTER causes an
// operand to generate a function which returns its register number when
// queried.   CONST_INTER causes an operand to generate a function which
// returns the value of the constant when queried.  MEMORY_INTER causes an
// operand to generate four functions which return the Base Register, the
// Index Register, the Scale Value, and the Offset Value of the operand when
// queried.  COND_INTER causes an operand to generate six functions which
// return the encoding code (ie - encoding bits for the instruction)
// associated with each basic boolean condition for a conditional instruction.
// Instructions specify two basic values for encoding.  They use the
// ins_encode keyword to specify their encoding class (which must be one of
// the class names specified in the encoding block), and they use the
// opcode keyword to specify, in order, their primary, secondary, and
// tertiary opcode.  Only the opcode sections which a particular instruction
// needs for encoding need to be specified.
encode %{
  // Build emit functions for each basic byte or larger field in the intel
  // encoding scheme (opcode, rm, sib, immediate), and call them from C++
  // code in the enc_class source block.  Emit functions will live in the
  // main source block for now.  In future, we can generalize this by
  // adding a syntax that specifies the sizes of fields in an order,
  // so that the adlc can build the emit functions automagically

  // Emit primary opcode
  enc_class OpcP %{
    emit_opcode(cbuf, $primary);
  %}

  // Emit secondary opcode
  enc_class OpcS %{
    emit_opcode(cbuf, $secondary);
  %}

  // Emit opcode directly
  enc_class Opcode(immI d8) %{
    emit_opcode(cbuf, $d8$$constant);
  %}

  enc_class SizePrefix %{
    emit_opcode(cbuf,0x66);
  %}

  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class OpcRegReg (immI opcode, rRegI dst, rRegI src) %{    // OpcRegReg(Many)
    emit_opcode(cbuf,$opcode$$constant);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class mov_r32_imm0( rRegI dst ) %{
    emit_opcode( cbuf, 0xB8 + $dst$$reg ); // 0xB8+ rd   -- MOV r32  ,imm32
    emit_d32   ( cbuf, 0x0  );             //                         imm32==0x0
  %}

  enc_class cdq_enc %{
    // Full implementation of Java idiv and irem; checks for
    // special case as described in JVM spec., p.243 & p.271.
    //
    //         normal case                           special case
    //
    // input : rax,: dividend                         min_int
    //         reg: divisor                          -1
    //
    // output: rax,: quotient  (= rax, idiv reg)       min_int
    //         rdx: remainder (= rax, irem reg)       0
    //
    //  Code sequnce:
    //
    //  81 F8 00 00 00 80    cmp         rax,80000000h
    //  0F 85 0B 00 00 00    jne         normal_case
    //  33 D2                xor         rdx,edx
    //  83 F9 FF             cmp         rcx,0FFh
    //  0F 84 03 00 00 00    je          done
    //                  normal_case:
    //  99                   cdq
    //  F7 F9                idiv        rax,ecx
    //                  done:
    //
    emit_opcode(cbuf,0x81); emit_d8(cbuf,0xF8);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x80);                     // cmp rax,80000000h
    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x85);
    emit_opcode(cbuf,0x0B); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // jne normal_case
    emit_opcode(cbuf,0x33); emit_d8(cbuf,0xD2);                     // xor rdx,edx
    emit_opcode(cbuf,0x83); emit_d8(cbuf,0xF9); emit_d8(cbuf,0xFF); // cmp rcx,0FFh
    emit_opcode(cbuf,0x0F); emit_d8(cbuf,0x84);
    emit_opcode(cbuf,0x03); emit_d8(cbuf,0x00);
    emit_opcode(cbuf,0x00); emit_d8(cbuf,0x00);                     // je done
    // normal_case:
    emit_opcode(cbuf,0x99);                                         // cdq
    // idiv (note: must be emitted by the user of this rule)
    // normal:
  %}

  // Dense encoding for older common ops
  enc_class Opc_plus(immI opcode, rRegI reg) %{
    emit_opcode(cbuf, $opcode$$constant + $reg$$reg);
  %}


  // Opcde enc_class for 8/32 bit immediate instructions with sign-extension
  enc_class OpcSE (immI imm) %{ // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      emit_opcode(cbuf, $primary | 0x02);
    }
    else {                          // If 32-bit immediate
      emit_opcode(cbuf, $primary);
    }
  %}

  enc_class OpcSErm (rRegI dst, immI imm) %{    // OpcSEr/m
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      emit_opcode(cbuf, $primary | 0x02);    }
    else {                          // If 32-bit immediate
      emit_opcode(cbuf, $primary);
    }
    // Emit r/m byte with secondary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
  %}

  enc_class Con8or32 (immI imm) %{    // Con8or32(storeImmI), 8 or 32 bits
    // Check for 8-bit immediate, and set sign extend bit in opcode
    if (($imm$$constant >= -128) && ($imm$$constant <= 127)) {
      $$$emit8$imm$$constant;
    }
    else {                          // If 32-bit immediate
      // Output immediate
      $$$emit32$imm$$constant;
    }
  %}

  enc_class Long_OpcSErm_Lo(eRegL dst, immL imm) %{
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    int con = (int)$imm$$constant; // Throw away top bits
    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
    // Emit r/m byte with secondary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
    else                               emit_d32(cbuf,con);
  %}

  enc_class Long_OpcSErm_Hi(eRegL dst, immL imm) %{
    // Emit primary opcode and set sign-extend bit
    // Check for 8-bit immediate, and set sign extend bit in opcode
    int con = (int)($imm$$constant >> 32); // Throw away bottom bits
    emit_opcode(cbuf, ((con >= -128) && (con <= 127)) ? ($primary | 0x02) : $primary);
    // Emit r/m byte with tertiary opcode, after primary opcode.
    emit_rm(cbuf, 0x3, $tertiary, HIGH_FROM_LOW($dst$$reg));
    if ((con >= -128) && (con <= 127)) emit_d8 (cbuf,con);
    else                               emit_d32(cbuf,con);
  %}

  enc_class OpcSReg (rRegI dst) %{    // BSWAP
    emit_cc(cbuf, $secondary, $dst$$reg );
  %}

  enc_class bswap_long_bytes(eRegL dst) %{ // BSWAP
    int destlo = $dst$$reg;
    int desthi = HIGH_FROM_LOW(destlo);
    // bswap lo
    emit_opcode(cbuf, 0x0F);
    emit_cc(cbuf, 0xC8, destlo);
    // bswap hi
    emit_opcode(cbuf, 0x0F);
    emit_cc(cbuf, 0xC8, desthi);
    // xchg lo and hi
    emit_opcode(cbuf, 0x87);
    emit_rm(cbuf, 0x3, destlo, desthi);
  %}

  enc_class RegOpc (rRegI div) %{    // IDIV, IMOD, JMP indirect, ...
    emit_rm(cbuf, 0x3, $secondary, $div$$reg );
  %}

  enc_class enc_cmov(cmpOp cop ) %{ // CMOV
    $$$emit8$primary;
    emit_cc(cbuf, $secondary, $cop$$cmpcode);
  %}

  enc_class enc_cmov_dpr(cmpOp cop, regDPR src ) %{ // CMOV
    int op = 0xDA00 + $cop$$cmpcode + ($src$$reg-1);
    emit_d8(cbuf, op >> 8 );
    emit_d8(cbuf, op & 255);
  %}

  // emulate a CMOV with a conditional branch around a MOV
  enc_class enc_cmov_branch( cmpOp cop, immI brOffs ) %{ // CMOV
    // Invert sense of branch from sense of CMOV
    emit_cc( cbuf, 0x70, ($cop$$cmpcode^1) );
    emit_d8( cbuf, $brOffs$$constant );
  %}

  enc_class enc_PartialSubtypeCheck( ) %{
    Register Redi = as_Register(EDI_enc); // result register
    Register Reax = as_Register(EAX_enc); // super class
    Register Recx = as_Register(ECX_enc); // killed
    Register Resi = as_Register(ESI_enc); // sub class
    Label miss;

    MacroAssembler _masm(&cbuf);
    __ check_klass_subtype_slow_path(Resi, Reax, Recx, Redi,
                                     NULL, &miss,
                                     /*set_cond_codes:*/ true);
    if ($primary) {
      __ xorptr(Redi, Redi);
    }
    __ bind(miss);
  %}

  enc_class FFree_Float_Stack_All %{    // Free_Float_Stack_All
    MacroAssembler masm(&cbuf);
    int start = masm.offset();
    if (UseSSE >= 2) {
      if (VerifyFPU) {
        masm.verify_FPU(0, "must be empty in SSE2+ mode");
      }
    } else {
      // External c_calling_convention expects the FPU stack to be 'clean'.
      // Compiled code leaves it dirty.  Do cleanup now.
      masm.empty_FPU_stack();
    }
    if (sizeof_FFree_Float_Stack_All == -1) {
      sizeof_FFree_Float_Stack_All = masm.offset() - start;
    } else {
      assert(masm.offset() - start == sizeof_FFree_Float_Stack_All, "wrong size");
    }
  %}

  enc_class Verify_FPU_For_Leaf %{
    if( VerifyFPU ) {
      MacroAssembler masm(&cbuf);
      masm.verify_FPU( -3, "Returning from Runtime Leaf call");
    }
  %}

  enc_class Java_To_Runtime (method meth) %{    // CALL Java_To_Runtime, Java_To_Runtime_Leaf
    // This is the instruction starting address for relocation info.
    cbuf.set_insts_mark();
    $$$emit8$primary;
    // CALL directly to the runtime
    emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                runtime_call_Relocation::spec(), RELOC_IMM32 );

    if (UseSSE >= 2) {
      MacroAssembler _masm(&cbuf);
      BasicType rt = tf()->return_type();

      if ((rt == T_FLOAT || rt == T_DOUBLE) && !return_value_is_used()) {
        // A C runtime call where the return value is unused.  In SSE2+
        // mode the result needs to be removed from the FPU stack.  It's
        // likely that this function call could be removed by the
        // optimizer if the C function is a pure function.
        __ ffree(0);
      } else if (rt == T_FLOAT) {
        __ lea(rsp, Address(rsp, -4));
        __ fstp_s(Address(rsp, 0));
        __ movflt(xmm0, Address(rsp, 0));
        __ lea(rsp, Address(rsp,  4));
      } else if (rt == T_DOUBLE) {
        __ lea(rsp, Address(rsp, -8));
        __ fstp_d(Address(rsp, 0));
        __ movdbl(xmm0, Address(rsp, 0));
        __ lea(rsp, Address(rsp,  8));
      }
    }
  %}


  enc_class pre_call_resets %{
    // If method sets FPU control word restore it here
    debug_only(int off0 = cbuf.insts_size());
    if (ra_->C->in_24_bit_fp_mode()) {
      MacroAssembler _masm(&cbuf);
      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
    }
    if (ra_->C->max_vector_size() > 16) {
      // Clear upper bits of YMM registers when current compiled code uses
      // wide vectors to avoid AVX <-> SSE transition penalty during call.
      MacroAssembler _masm(&cbuf);
      __ vzeroupper();
    }
    debug_only(int off1 = cbuf.insts_size());
    assert(off1 - off0 == pre_call_resets_size(), "correct size prediction");
  %}

  enc_class post_call_FPU %{
    // If method sets FPU control word do it here also
    if (Compile::current()->in_24_bit_fp_mode()) {
      MacroAssembler masm(&cbuf);
      masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
    }
  %}

  enc_class Java_Static_Call (method meth) %{    // JAVA STATIC CALL
    // CALL to fixup routine.  Fixup routine uses ScopeDesc info to determine
    // who we intended to call.
    cbuf.set_insts_mark();
    $$$emit8$primary;
    if (!_method) {
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                     runtime_call_Relocation::spec(), RELOC_IMM32 );
    } else if (_optimized_virtual) {
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                     opt_virtual_call_Relocation::spec(), RELOC_IMM32 );
    } else {
      emit_d32_reloc(cbuf, ($meth$$method - (int)(cbuf.insts_end()) - 4),
                     static_call_Relocation::spec(), RELOC_IMM32 );
    }
    if (_method) {  // Emit stub for static call.
      CompiledStaticCall::emit_to_interp_stub(cbuf);
    }
  %}

  enc_class Java_Dynamic_Call (method meth) %{    // JAVA DYNAMIC CALL
    MacroAssembler _masm(&cbuf);
    __ ic_call((address)$meth$$method);
  %}

  enc_class Java_Compiled_Call (method meth) %{    // JAVA COMPILED CALL
    int disp = in_bytes(Method::from_compiled_offset());
    assert( -128 <= disp && disp <= 127, "compiled_code_offset isn't small");

    // CALL *[EAX+in_bytes(Method::from_compiled_code_entry_point_offset())]
    cbuf.set_insts_mark();
    $$$emit8$primary;
    emit_rm(cbuf, 0x01, $secondary, EAX_enc );  // R/M byte
    emit_d8(cbuf, disp);             // Displacement

  %}

//   Following encoding is no longer used, but may be restored if calling
//   convention changes significantly.
//   Became: Xor_Reg(EBP), Java_To_Runtime( labl )
//
//   enc_class Java_Interpreter_Call (label labl) %{    // JAVA INTERPRETER CALL
//     // int ic_reg     = Matcher::inline_cache_reg();
//     // int ic_encode  = Matcher::_regEncode[ic_reg];
//     // int imo_reg    = Matcher::interpreter_method_oop_reg();
//     // int imo_encode = Matcher::_regEncode[imo_reg];
//
//     // // Interpreter expects method_oop in EBX, currently a callee-saved register,
//     // // so we load it immediately before the call
//     // emit_opcode(cbuf, 0x8B);                     // MOV    imo_reg,ic_reg  # method_oop
//     // emit_rm(cbuf, 0x03, imo_encode, ic_encode ); // R/M byte
//
//     // xor rbp,ebp
//     emit_opcode(cbuf, 0x33);
//     emit_rm(cbuf, 0x3, EBP_enc, EBP_enc);
//
//     // CALL to interpreter.
//     cbuf.set_insts_mark();
//     $$$emit8$primary;
//     emit_d32_reloc(cbuf, ($labl$$label - (int)(cbuf.insts_end()) - 4),
//                 runtime_call_Relocation::spec(), RELOC_IMM32 );
//   %}

  enc_class RegOpcImm (rRegI dst, immI8 shift) %{    // SHL, SAR, SHR
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
    $$$emit8$shift$$constant;
  %}

  enc_class LdImmI (rRegI dst, immI src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    emit_opcode(cbuf, 0xB8 + $dst$$reg);
    $$$emit32$src$$constant;
  %}

  enc_class LdImmP (rRegI dst, immI src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    emit_opcode(cbuf, $primary + $dst$$reg);
    $$$emit32$src$$constant;
  %}

  enc_class LdImmL_Lo( eRegL dst, immL src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    int dst_enc = $dst$$reg;
    int src_con = $src$$constant & 0x0FFFFFFFFL;
    if (src_con == 0) {
      // xor dst, dst
      emit_opcode(cbuf, 0x33);
      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
    } else {
      emit_opcode(cbuf, $primary + dst_enc);
      emit_d32(cbuf, src_con);
    }
  %}

  enc_class LdImmL_Hi( eRegL dst, immL src) %{    // Load Immediate
    // Load immediate does not have a zero or sign extended version
    // for 8-bit immediates
    int dst_enc = $dst$$reg + 2;
    int src_con = ((julong)($src$$constant)) >> 32;
    if (src_con == 0) {
      // xor dst, dst
      emit_opcode(cbuf, 0x33);
      emit_rm(cbuf, 0x3, dst_enc, dst_enc);
    } else {
      emit_opcode(cbuf, $primary + dst_enc);
      emit_d32(cbuf, src_con);
    }
  %}


  // Encode a reg-reg copy.  If it is useless, then empty encoding.
  enc_class enc_Copy( rRegI dst, rRegI src ) %{
    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  %}

  enc_class enc_CopyL_Lo( rRegI dst, eRegL src ) %{
    encode_Copy( cbuf, $dst$$reg, $src$$reg );
  %}

  enc_class RegReg (rRegI dst, rRegI src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class RegReg_Lo(eRegL dst, eRegL src) %{    // RegReg(Many)
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class RegReg_Hi(eRegL dst, eRegL src) %{    // RegReg(Many)
    $$$emit8$secondary;
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  %}

  enc_class RegReg_Lo2(eRegL dst, eRegL src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class RegReg_Hi2(eRegL dst, eRegL src) %{    // RegReg(Many)
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg));
  %}

  enc_class RegReg_HiLo( eRegL src, rRegI dst ) %{
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($src$$reg));
  %}

  enc_class Con32 (immI src) %{    // Con32(storeImmI)
    // Output immediate
    $$$emit32$src$$constant;
  %}

  enc_class Con32FPR_as_bits(immFPR src) %{        // storeF_imm
    // Output Float immediate bits
    jfloat jf = $src$$constant;
    int    jf_as_bits = jint_cast( jf );
    emit_d32(cbuf, jf_as_bits);
  %}

  enc_class Con32F_as_bits(immF src) %{      // storeX_imm
    // Output Float immediate bits
    jfloat jf = $src$$constant;
    int    jf_as_bits = jint_cast( jf );
    emit_d32(cbuf, jf_as_bits);
  %}

  enc_class Con16 (immI src) %{    // Con16(storeImmI)
    // Output immediate
    $$$emit16$src$$constant;
  %}

  enc_class Con_d32(immI src) %{
    emit_d32(cbuf,$src$$constant);
  %}

  enc_class conmemref (eRegP t1) %{    // Con32(storeImmI)
    // Output immediate memory reference
    emit_rm(cbuf, 0x00, $t1$$reg, 0x05 );
    emit_d32(cbuf, 0x00);
  %}

  enc_class lock_prefix( ) %{
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);         // [Lock]
  %}

  // Cmp-xchg long value.
  // Note: we need to swap rbx, and rcx before and after the
  //       cmpxchg8 instruction because the instruction uses
  //       rcx as the high order word of the new value to store but
  //       our register encoding uses rbx,.
  enc_class enc_cmpxchg8(eSIRegP mem_ptr) %{

    // XCHG  rbx,ecx
    emit_opcode(cbuf,0x87);
    emit_opcode(cbuf,0xD9);
    // [Lock]
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);
    // CMPXCHG8 [Eptr]
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xC7);
    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
    // XCHG  rbx,ecx
    emit_opcode(cbuf,0x87);
    emit_opcode(cbuf,0xD9);
  %}

  enc_class enc_cmpxchg(eSIRegP mem_ptr) %{
    // [Lock]
    if( os::is_MP() )
      emit_opcode(cbuf,0xF0);

    // CMPXCHG [Eptr]
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xB1);
    emit_rm( cbuf, 0x0, 1, $mem_ptr$$reg );
  %}

  enc_class enc_flags_ne_to_boolean( iRegI res ) %{
    int res_encoding = $res$$reg;

    // MOV  res,0
    emit_opcode( cbuf, 0xB8 + res_encoding);
    emit_d32( cbuf, 0 );
    // JNE,s  fail
    emit_opcode(cbuf,0x75);
    emit_d8(cbuf, 5 );
    // MOV  res,1
    emit_opcode( cbuf, 0xB8 + res_encoding);
    emit_d32( cbuf, 1 );
    // fail:
  %}

  enc_class set_instruction_start( ) %{
    cbuf.set_insts_mark();            // Mark start of opcode for reloc info in mem operand
  %}

  enc_class RegMem (rRegI ereg, memory mem) %{    // emit_reg_mem
    int reg_encoding = $ereg$$reg;
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp;
    relocInfo::relocType disp_reloc = $mem->disp_reloc();
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_reloc);
  %}

  enc_class RegMem_Hi(eRegL ereg, memory mem) %{    // emit_reg_mem
    int reg_encoding = HIGH_FROM_LOW($ereg$$reg);  // Hi register of pair, computed from lo
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp + 4;      // Offset is 4 further in memory
    assert( $mem->disp_reloc() == relocInfo::none, "Cannot add 4 to oop" );
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, relocInfo::none);
  %}

  enc_class move_long_small_shift( eRegL dst, immI_1_31 cnt ) %{
    int r1, r2;
    if( $tertiary == 0xA4 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,$tertiary);
    emit_rm(cbuf, 0x3, r1, r2);
    emit_d8(cbuf,$cnt$$constant);
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, r1);
    emit_d8(cbuf,$cnt$$constant);
  %}

  enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
    emit_opcode( cbuf, 0x8B ); // Move
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
    if( $cnt$$constant > 32 ) { // Shift, if not by zero
      emit_d8(cbuf,$primary);
      emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
      emit_d8(cbuf,$cnt$$constant-32);
    }
    emit_d8(cbuf,$primary);
    emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
    emit_d8(cbuf,31);
  %}

  enc_class move_long_big_shift_clr( eRegL dst, immI_32_63 cnt ) %{
    int r1, r2;
    if( $secondary == 0x5 ) { r1 = $dst$$reg;  r2 = HIGH_FROM_LOW($dst$$reg); }
    else                    { r2 = $dst$$reg;  r1 = HIGH_FROM_LOW($dst$$reg); }

    emit_opcode( cbuf, 0x8B ); // Move r1,r2
    emit_rm(cbuf, 0x3, r1, r2);
    if( $cnt$$constant > 32 ) { // Shift, if not by zero
      emit_opcode(cbuf,$primary);
      emit_rm(cbuf, 0x3, $secondary, r1);
      emit_d8(cbuf,$cnt$$constant-32);
    }
    emit_opcode(cbuf,0x33);  // XOR r2,r2
    emit_rm(cbuf, 0x3, r2, r2);
  %}

  // Clone of RegMem but accepts an extra parameter to access each
  // half of a double in memory; it never needs relocation info.
  enc_class Mov_MemD_half_to_Reg (immI opcode, memory mem, immI disp_for_half, rRegI rm_reg) %{
    emit_opcode(cbuf,$opcode$$constant);
    int reg_encoding = $rm_reg$$reg;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp + $disp_for_half$$constant;
    relocInfo::relocType disp_reloc = relocInfo::none;
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_reloc);
  %}

  // !!!!! Special Custom Code used by MemMove, and stack access instructions !!!!!
  //
  // Clone of RegMem except the RM-byte's reg/opcode field is an ADLC-time constant
  // and it never needs relocation information.
  // Frequently used to move data between FPU's Stack Top and memory.
  enc_class RMopc_Mem_no_oop (immI rm_opcode, memory mem) %{
    int rm_byte_opcode = $rm_opcode$$constant;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    assert( $mem->disp_reloc() == relocInfo::none, "No oops here because no reloc info allowed" );
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, relocInfo::none);
  %}

  enc_class RMopc_Mem (immI rm_opcode, memory mem) %{
    int rm_byte_opcode = $rm_opcode$$constant;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    relocInfo::relocType disp_reloc = $mem->disp_reloc(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_reloc);
  %}

  enc_class RegLea (rRegI dst, rRegI src0, immI src1 ) %{    // emit_reg_lea
    int reg_encoding = $dst$$reg;
    int base         = $src0$$reg;      // 0xFFFFFFFF indicates no base
    int index        = 0x04;            // 0x04 indicates no index
    int scale        = 0x00;            // 0x00 indicates no scale
    int displace     = $src1$$constant; // 0x00 indicates no displacement
    relocInfo::relocType disp_reloc = relocInfo::none;
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_reloc);
  %}

  enc_class min_enc (rRegI dst, rRegI src) %{    // MIN
    // Compare dst,src
    emit_opcode(cbuf,0x3B);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
    // jmp dst < src around move
    emit_opcode(cbuf,0x7C);
    emit_d8(cbuf,2);
    // move dst,src
    emit_opcode(cbuf,0x8B);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class max_enc (rRegI dst, rRegI src) %{    // MAX
    // Compare dst,src
    emit_opcode(cbuf,0x3B);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
    // jmp dst > src around move
    emit_opcode(cbuf,0x7F);
    emit_d8(cbuf,2);
    // move dst,src
    emit_opcode(cbuf,0x8B);
    emit_rm(cbuf, 0x3, $dst$$reg, $src$$reg);
  %}

  enc_class enc_FPR_store(memory mem, regDPR src) %{
    // If src is FPR1, we can just FST to store it.
    // Else we need to FLD it to FPR1, then FSTP to store/pop it.
    int reg_encoding = 0x2; // Just store
    int base  = $mem$$base;
    int index = $mem$$index;
    int scale = $mem$$scale;
    int displace = $mem$$disp;
    relocInfo::relocType disp_reloc = $mem->disp_reloc(); // disp-as-oop when working with static globals
    if( $src$$reg != FPR1L_enc ) {
      reg_encoding = 0x3;  // Store & pop
      emit_opcode( cbuf, 0xD9 ); // FLD (i.e., push it)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
    }
    cbuf.set_insts_mark();       // Mark start of opcode for reloc info in mem operand
    emit_opcode(cbuf,$primary);
    encode_RegMem(cbuf, reg_encoding, base, index, scale, displace, disp_reloc);
  %}

  enc_class neg_reg(rRegI dst) %{
    // NEG $dst
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0x03, $dst$$reg );
  %}

  enc_class setLT_reg(eCXRegI dst) %{
    // SETLT $dst
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0x9C);
    emit_rm( cbuf, 0x3, 0x4, $dst$$reg );
  %}

  enc_class enc_cmpLTP(ncxRegI p, ncxRegI q, ncxRegI y, eCXRegI tmp) %{    // cadd_cmpLT
    int tmpReg = $tmp$$reg;

    // SUB $p,$q
    emit_opcode(cbuf,0x2B);
    emit_rm(cbuf, 0x3, $p$$reg, $q$$reg);
    // SBB $tmp,$tmp
    emit_opcode(cbuf,0x1B);
    emit_rm(cbuf, 0x3, tmpReg, tmpReg);
    // AND $tmp,$y
    emit_opcode(cbuf,0x23);
    emit_rm(cbuf, 0x3, tmpReg, $y$$reg);
    // ADD $p,$tmp
    emit_opcode(cbuf,0x03);
    emit_rm(cbuf, 0x3, $p$$reg, tmpReg);
  %}

  enc_class shift_left_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x04);
    // MOV    $dst.hi,$dst.lo
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
    // CLR    $dst.lo
    emit_opcode(cbuf, 0x33);
    emit_rm(cbuf, 0x3, $dst$$reg, $dst$$reg);
// small:
    // SHLD   $dst.hi,$dst.lo,$shift
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xA5);
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
    // SHL    $dst.lo,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x4, $dst$$reg );
  %}

  enc_class shift_right_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x04);
    // MOV    $dst.lo,$dst.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
    // CLR    $dst.hi
    emit_opcode(cbuf, 0x33);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($dst$$reg));
// small:
    // SHRD   $dst.lo,$dst.hi,$shift
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xAD);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
    // SHR    $dst.hi,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x5, HIGH_FROM_LOW($dst$$reg) );
  %}

  enc_class shift_right_arith_long( eRegL dst, eCXRegI shift ) %{
    // TEST shift,32
    emit_opcode(cbuf,0xF7);
    emit_rm(cbuf, 0x3, 0, ECX_enc);
    emit_d32(cbuf,0x20);
    // JEQ,s small
    emit_opcode(cbuf, 0x74);
    emit_d8(cbuf, 0x05);
    // MOV    $dst.lo,$dst.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg) );
    // SAR    $dst.hi,31
    emit_opcode(cbuf, 0xC1);
    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW($dst$$reg) );
    emit_d8(cbuf, 0x1F );
// small:
    // SHRD   $dst.lo,$dst.hi,$shift
    emit_opcode(cbuf,0x0F);
    emit_opcode(cbuf,0xAD);
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg);
    // SAR    $dst.hi,$shift"
    emit_opcode(cbuf,0xD3);
    emit_rm(cbuf, 0x3, 0x7, HIGH_FROM_LOW($dst$$reg) );
  %}


  // ----------------- Encodings for floating point unit -----------------
  // May leave result in FPU-TOS or FPU reg depending on opcodes
  enc_class OpcReg_FPR(regFPR src) %{    // FMUL, FDIV
    $$$emit8$primary;
    emit_rm(cbuf, 0x3, $secondary, $src$$reg );
  %}

  // Pop argument in FPR0 with FSTP ST(0)
  enc_class PopFPU() %{
    emit_opcode( cbuf, 0xDD );
    emit_d8( cbuf, 0xD8 );
  %}

  // !!!!! equivalent to Pop_Reg_F
  enc_class Pop_Reg_DPR( regDPR dst ) %{
    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
    emit_d8( cbuf, 0xD8+$dst$$reg );
  %}

  enc_class Push_Reg_DPR( regDPR dst ) %{
    emit_opcode( cbuf, 0xD9 );
    emit_d8( cbuf, 0xC0-1+$dst$$reg );   // FLD ST(i-1)
  %}

  enc_class strictfp_bias1( regDPR dst ) %{
    emit_opcode( cbuf, 0xDB );           // FLD m80real
    emit_opcode( cbuf, 0x2D );
    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias1() );
    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
    emit_opcode( cbuf, 0xC8+$dst$$reg );
  %}

  enc_class strictfp_bias2( regDPR dst ) %{
    emit_opcode( cbuf, 0xDB );           // FLD m80real
    emit_opcode( cbuf, 0x2D );
    emit_d32( cbuf, (int)StubRoutines::addr_fpu_subnormal_bias2() );
    emit_opcode( cbuf, 0xDE );           // FMULP ST(dst), ST0
    emit_opcode( cbuf, 0xC8+$dst$$reg );
  %}

  // Special case for moving an integer register to a stack slot.
  enc_class OpcPRegSS( stackSlotI dst, rRegI src ) %{ // RegSS
    store_to_stackslot( cbuf, $primary, $src$$reg, $dst$$disp );
  %}

  // Special case for moving a register to a stack slot.
  enc_class RegSS( stackSlotI dst, rRegI src ) %{ // RegSS
    // Opcode already emitted
    emit_rm( cbuf, 0x02, $src$$reg, ESP_enc );   // R/M byte
    emit_rm( cbuf, 0x00, ESP_enc, ESP_enc);          // SIB byte
    emit_d32(cbuf, $dst$$disp);   // Displacement
  %}

  // Push the integer in stackSlot 'src' onto FP-stack
  enc_class Push_Mem_I( memory src ) %{    // FILD   [ESP+src]
    store_to_stackslot( cbuf, $primary, $secondary, $src$$disp );
  %}

  // Push FPU's TOS float to a stack-slot, and pop FPU-stack
  enc_class Pop_Mem_FPR( stackSlotF dst ) %{ // FSTP_S [ESP+dst]
    store_to_stackslot( cbuf, 0xD9, 0x03, $dst$$disp );
  %}

  // Same as Pop_Mem_F except for opcode
  // Push FPU's TOS double to a stack-slot, and pop FPU-stack
  enc_class Pop_Mem_DPR( stackSlotD dst ) %{ // FSTP_D [ESP+dst]
    store_to_stackslot( cbuf, 0xDD, 0x03, $dst$$disp );
  %}

  enc_class Pop_Reg_FPR( regFPR dst ) %{
    emit_opcode( cbuf, 0xDD );           // FSTP   ST(i)
    emit_d8( cbuf, 0xD8+$dst$$reg );
  %}

  enc_class Push_Reg_FPR( regFPR dst ) %{
    emit_opcode( cbuf, 0xD9 );           // FLD    ST(i-1)
    emit_d8( cbuf, 0xC0-1+$dst$$reg );
  %}

  // Push FPU's float to a stack-slot, and pop FPU-stack
  enc_class Pop_Mem_Reg_FPR( stackSlotF dst, regFPR src ) %{
    int pop = 0x02;
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0x03;
    }
    store_to_stackslot( cbuf, 0xD9, pop, $dst$$disp ); // FST<P>_S  [ESP+dst]
  %}

  // Push FPU's double to a stack-slot, and pop FPU-stack
  enc_class Pop_Mem_Reg_DPR( stackSlotD dst, regDPR src ) %{
    int pop = 0x02;
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(i-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0x03;
    }
    store_to_stackslot( cbuf, 0xDD, pop, $dst$$disp ); // FST<P>_D  [ESP+dst]
  %}

  // Push FPU's double to a FPU-stack-slot, and pop FPU-stack
  enc_class Pop_Reg_Reg_DPR( regDPR dst, regFPR src ) %{
    int pop = 0xD0 - 1; // -1 since we skip FLD
    if ($src$$reg != FPR1L_enc) {
      emit_opcode( cbuf, 0xD9 );         // FLD    ST(src-1)
      emit_d8( cbuf, 0xC0-1+$src$$reg );
      pop = 0xD8;
    }
    emit_opcode( cbuf, 0xDD );
    emit_d8( cbuf, pop+$dst$$reg );      // FST<P> ST(i)
  %}


  enc_class Push_Reg_Mod_DPR( regDPR dst, regDPR src) %{
    // load dst in FPR0
    emit_opcode( cbuf, 0xD9 );
    emit_d8( cbuf, 0xC0-1+$dst$$reg );
    if ($src$$reg != FPR1L_enc) {
      // fincstp
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF7);
      // swap src with FPR1:
      // FXCH FPR1 with src
      emit_opcode(cbuf, 0xD9);
      emit_d8(cbuf, 0xC8-1+$src$$reg );
      // fdecstp
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF6);
    }
  %}

  enc_class Push_ModD_encoding(regD src0, regD src1) %{
    MacroAssembler _masm(&cbuf);
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
    __ fld_d(Address(rsp, 0));
  %}

  enc_class Push_ModF_encoding(regF src0, regF src1) %{
    MacroAssembler _masm(&cbuf);
    __ subptr(rsp, 4);
    __ movflt(Address(rsp, 0), $src1$$XMMRegister);
    __ fld_s(Address(rsp, 0));
    __ movflt(Address(rsp, 0), $src0$$XMMRegister);
    __ fld_s(Address(rsp, 0));
  %}

  enc_class Push_ResultD(regD dst) %{
    MacroAssembler _masm(&cbuf);
    __ fstp_d(Address(rsp, 0));
    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
    __ addptr(rsp, 8);
  %}

  enc_class Push_ResultF(regF dst, immI d8) %{
    MacroAssembler _masm(&cbuf);
    __ fstp_s(Address(rsp, 0));
    __ movflt($dst$$XMMRegister, Address(rsp, 0));
    __ addptr(rsp, $d8$$constant);
  %}

  enc_class Push_SrcD(regD src) %{
    MacroAssembler _masm(&cbuf);
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
  %}

  enc_class push_stack_temp_qword() %{
    MacroAssembler _masm(&cbuf);
    __ subptr(rsp, 8);
  %}

  enc_class pop_stack_temp_qword() %{
    MacroAssembler _masm(&cbuf);
    __ addptr(rsp, 8);
  %}

  enc_class push_xmm_to_fpr1(regD src) %{
    MacroAssembler _masm(&cbuf);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
  %}

  enc_class Push_Result_Mod_DPR( regDPR src) %{
    if ($src$$reg != FPR1L_enc) {
      // fincstp
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF7);
      // FXCH FPR1 with src
      emit_opcode(cbuf, 0xD9);
      emit_d8(cbuf, 0xC8-1+$src$$reg );
      // fdecstp
      emit_opcode (cbuf, 0xD9);
      emit_opcode (cbuf, 0xF6);
    }
    // // following asm replaced with Pop_Reg_F or Pop_Mem_F
    // // FSTP   FPR$dst$$reg
    // emit_opcode( cbuf, 0xDD );
    // emit_d8( cbuf, 0xD8+$dst$$reg );
  %}

  enc_class fnstsw_sahf_skip_parity() %{
    // fnstsw ax
    emit_opcode( cbuf, 0xDF );
    emit_opcode( cbuf, 0xE0 );
    // sahf
    emit_opcode( cbuf, 0x9E );
    // jnp  ::skip
    emit_opcode( cbuf, 0x7B );
    emit_opcode( cbuf, 0x05 );
  %}

  enc_class emitModDPR() %{
    // fprem must be iterative
    // :: loop
    // fprem
    emit_opcode( cbuf, 0xD9 );
    emit_opcode( cbuf, 0xF8 );
    // wait
    emit_opcode( cbuf, 0x9b );
    // fnstsw ax
    emit_opcode( cbuf, 0xDF );
    emit_opcode( cbuf, 0xE0 );
    // sahf
    emit_opcode( cbuf, 0x9E );
    // jp  ::loop
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0x8A );
    emit_opcode( cbuf, 0xF4 );
    emit_opcode( cbuf, 0xFF );
    emit_opcode( cbuf, 0xFF );
    emit_opcode( cbuf, 0xFF );
  %}

  enc_class fpu_flags() %{
    // fnstsw_ax
    emit_opcode( cbuf, 0xDF);
    emit_opcode( cbuf, 0xE0);
    // test ax,0x0400
    emit_opcode( cbuf, 0x66 );   // operand-size prefix for 16-bit immediate
    emit_opcode( cbuf, 0xA9 );
    emit_d16   ( cbuf, 0x0400 );
    // // // This sequence works, but stalls for 12-16 cycles on PPro
    // // test rax,0x0400
    // emit_opcode( cbuf, 0xA9 );
    // emit_d32   ( cbuf, 0x00000400 );
    //
    // jz exit (no unordered comparison)
    emit_opcode( cbuf, 0x74 );
    emit_d8    ( cbuf, 0x02 );
    // mov ah,1 - treat as LT case (set carry flag)
    emit_opcode( cbuf, 0xB4 );
    emit_d8    ( cbuf, 0x01 );
    // sahf
    emit_opcode( cbuf, 0x9E);
  %}

  enc_class cmpF_P6_fixup() %{
    // Fixup the integer flags in case comparison involved a NaN
    //
    // JNP exit (no unordered comparison, P-flag is set by NaN)
    emit_opcode( cbuf, 0x7B );
    emit_d8    ( cbuf, 0x03 );
    // MOV AH,1 - treat as LT case (set carry flag)
    emit_opcode( cbuf, 0xB4 );
    emit_d8    ( cbuf, 0x01 );
    // SAHF
    emit_opcode( cbuf, 0x9E);
    // NOP     // target for branch to avoid branch to branch
    emit_opcode( cbuf, 0x90);
  %}

//     fnstsw_ax();
//     sahf();
//     movl(dst, nan_result);
//     jcc(Assembler::parity, exit);
//     movl(dst, less_result);
//     jcc(Assembler::below, exit);
//     movl(dst, equal_result);
//     jcc(Assembler::equal, exit);
//     movl(dst, greater_result);

// less_result     =  1;
// greater_result  = -1;
// equal_result    = 0;
// nan_result      = -1;

  enc_class CmpF_Result(rRegI dst) %{
    // fnstsw_ax();
    emit_opcode( cbuf, 0xDF);
    emit_opcode( cbuf, 0xE0);
    // sahf
    emit_opcode( cbuf, 0x9E);
    // movl(dst, nan_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, -1 );
    // jcc(Assembler::parity, exit);
    emit_opcode( cbuf, 0x7A );
    emit_d8    ( cbuf, 0x13 );
    // movl(dst, less_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, -1 );
    // jcc(Assembler::below, exit);
    emit_opcode( cbuf, 0x72 );
    emit_d8    ( cbuf, 0x0C );
    // movl(dst, equal_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, 0 );
    // jcc(Assembler::equal, exit);
    emit_opcode( cbuf, 0x74 );
    emit_d8    ( cbuf, 0x05 );
    // movl(dst, greater_result);
    emit_opcode( cbuf, 0xB8 + $dst$$reg);
    emit_d32( cbuf, 1 );
  %}


  // Compare the longs and set flags
  // BROKEN!  Do Not use as-is
  enc_class cmpl_test( eRegL src1, eRegL src2 ) %{
    // CMP    $src1.hi,$src2.hi
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
    // JNE,s  done
    emit_opcode(cbuf,0x75);
    emit_d8(cbuf, 2 );
    // CMP    $src1.lo,$src2.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
// done:
  %}

  enc_class convert_int_long( regL dst, rRegI src ) %{
    // mov $dst.lo,$src
    int dst_encoding = $dst$$reg;
    int src_encoding = $src$$reg;
    encode_Copy( cbuf, dst_encoding  , src_encoding );
    // mov $dst.hi,$src
    encode_Copy( cbuf, HIGH_FROM_LOW(dst_encoding), src_encoding );
    // sar $dst.hi,31
    emit_opcode( cbuf, 0xC1 );
    emit_rm(cbuf, 0x3, 7, HIGH_FROM_LOW(dst_encoding) );
    emit_d8(cbuf, 0x1F );
  %}

  enc_class convert_long_double( eRegL src ) %{
    // push $src.hi
    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
    // push $src.lo
    emit_opcode(cbuf, 0x50+$src$$reg  );
    // fild 64-bits at [SP]
    emit_opcode(cbuf,0xdf);
    emit_d8(cbuf, 0x6C);
    emit_d8(cbuf, 0x24);
    emit_d8(cbuf, 0x00);
    // pop stack
    emit_opcode(cbuf, 0x83); // add  SP, #8
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 0x8);
  %}

  enc_class multiply_con_and_shift_high( eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr ) %{
    // IMUL   EDX:EAX,$src1
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x5, $src1$$reg );
    // SAR    EDX,$cnt-32
    int shift_count = ((int)$cnt$$constant) - 32;
    if (shift_count > 0) {
      emit_opcode(cbuf, 0xC1);
      emit_rm(cbuf, 0x3, 7, $dst$$reg );
      emit_d8(cbuf, shift_count);
    }
  %}

  // this version doesn't have add sp, 8
  enc_class convert_long_double2( eRegL src ) %{
    // push $src.hi
    emit_opcode(cbuf, 0x50+HIGH_FROM_LOW($src$$reg));
    // push $src.lo
    emit_opcode(cbuf, 0x50+$src$$reg  );
    // fild 64-bits at [SP]
    emit_opcode(cbuf,0xdf);
    emit_d8(cbuf, 0x6C);
    emit_d8(cbuf, 0x24);
    emit_d8(cbuf, 0x00);
  %}

  enc_class long_int_multiply( eADXRegL dst, nadxRegI src) %{
    // Basic idea: long = (long)int * (long)int
    // IMUL EDX:EAX, src
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x5, $src$$reg);
  %}

  enc_class long_uint_multiply( eADXRegL dst, nadxRegI src) %{
    // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
    // MUL EDX:EAX, src
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, $src$$reg);
  %}

  enc_class long_multiply( eADXRegL dst, eRegL src, rRegI tmp ) %{
    // Basic idea: lo(result) = lo(x_lo * y_lo)
    //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
    // MOV    $tmp,$src.lo
    encode_Copy( cbuf, $tmp$$reg, $src$$reg );
    // IMUL   $tmp,EDX
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0xAF );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    // MOV    EDX,$src.hi
    encode_Copy( cbuf, HIGH_FROM_LOW($dst$$reg), HIGH_FROM_LOW($src$$reg) );
    // IMUL   EDX,EAX
    emit_opcode( cbuf, 0x0F );
    emit_opcode( cbuf, 0xAF );
    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $dst$$reg );
    // ADD    $tmp,EDX
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    // MUL   EDX:EAX,$src.lo
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, $src$$reg );
    // ADD    EDX,ESI
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, HIGH_FROM_LOW($dst$$reg), $tmp$$reg );
  %}

  enc_class long_multiply_con( eADXRegL dst, immL_127 src, rRegI tmp ) %{
    // Basic idea: lo(result) = lo(src * y_lo)
    //             hi(result) = hi(src * y_lo) + lo(src * y_hi)
    // IMUL   $tmp,EDX,$src
    emit_opcode( cbuf, 0x6B );
    emit_rm( cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($dst$$reg) );
    emit_d8( cbuf, (int)$src$$constant );
    // MOV    EDX,$src
    emit_opcode(cbuf, 0xB8 + EDX_enc);
    emit_d32( cbuf, (int)$src$$constant );
    // MUL   EDX:EAX,EDX
    emit_opcode( cbuf, 0xF7 );
    emit_rm( cbuf, 0x3, 0x4, EDX_enc );
    // ADD    EDX,ESI
    emit_opcode( cbuf, 0x03 );
    emit_rm( cbuf, 0x3, EDX_enc, $tmp$$reg );
  %}

  enc_class long_div( eRegL src1, eRegL src2 ) %{
    // PUSH src1.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
    // PUSH src1.lo
    emit_opcode(cbuf,               0x50+$src1$$reg  );
    // PUSH src2.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
    // PUSH src2.lo
    emit_opcode(cbuf,               0x50+$src2$$reg  );
    // CALL directly to the runtime
    cbuf.set_insts_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::ldiv) - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Restore stack
    emit_opcode(cbuf, 0x83); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 4*4);
  %}

  enc_class long_mod( eRegL src1, eRegL src2 ) %{
    // PUSH src1.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src1$$reg) );
    // PUSH src1.lo
    emit_opcode(cbuf,               0x50+$src1$$reg  );
    // PUSH src2.hi
    emit_opcode(cbuf, HIGH_FROM_LOW(0x50+$src2$$reg) );
    // PUSH src2.lo
    emit_opcode(cbuf,               0x50+$src2$$reg  );
    // CALL directly to the runtime
    cbuf.set_insts_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (CAST_FROM_FN_PTR(address, SharedRuntime::lrem ) - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Restore stack
    emit_opcode(cbuf, 0x83); // add  SP, #framesize
    emit_rm(cbuf, 0x3, 0x00, ESP_enc);
    emit_d8(cbuf, 4*4);
  %}

  enc_class long_cmp_flags0( eRegL src, rRegI tmp ) %{
    // MOV   $tmp,$src.lo
    emit_opcode(cbuf, 0x8B);
    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg);
    // OR    $tmp,$src.hi
    emit_opcode(cbuf, 0x0B);
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg));
  %}

  enc_class long_cmp_flags1( eRegL src1, eRegL src2 ) %{
    // CMP    $src1.lo,$src2.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
    // JNE,s  skip
    emit_cc(cbuf, 0x70, 0x5);
    emit_d8(cbuf,2);
    // CMP    $src1.hi,$src2.hi
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, HIGH_FROM_LOW($src1$$reg), HIGH_FROM_LOW($src2$$reg) );
  %}

  enc_class long_cmp_flags2( eRegL src1, eRegL src2, rRegI tmp ) %{
    // CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $src1$$reg, $src2$$reg );
    // MOV    $tmp,$src1.hi
    emit_opcode( cbuf, 0x8B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src1$$reg) );
    // SBB   $tmp,$src2.hi\t! Compute flags for long compare
    emit_opcode( cbuf, 0x1B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src2$$reg) );
  %}

  enc_class long_cmp_flags3( eRegL src, rRegI tmp ) %{
    // XOR    $tmp,$tmp
    emit_opcode(cbuf,0x33);  // XOR
    emit_rm(cbuf,0x3, $tmp$$reg, $tmp$$reg);
    // CMP    $tmp,$src.lo
    emit_opcode( cbuf, 0x3B );
    emit_rm(cbuf, 0x3, $tmp$$reg, $src$$reg );
    // SBB    $tmp,$src.hi
    emit_opcode( cbuf, 0x1B );
    emit_rm(cbuf, 0x3, $tmp$$reg, HIGH_FROM_LOW($src$$reg) );
  %}

 // Sniff, sniff... smells like Gnu Superoptimizer
  enc_class neg_long( eRegL dst ) %{
    emit_opcode(cbuf,0xF7);    // NEG hi
    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
    emit_opcode(cbuf,0xF7);    // NEG lo
    emit_rm    (cbuf,0x3, 0x3,               $dst$$reg );
    emit_opcode(cbuf,0x83);    // SBB hi,0
    emit_rm    (cbuf,0x3, 0x3, HIGH_FROM_LOW($dst$$reg));
    emit_d8    (cbuf,0 );
  %}


  // Because the transitions from emitted code to the runtime
  // monitorenter/exit helper stubs are so slow it's critical that
  // we inline both the stack-locking fast-path and the inflated fast path.
  //
  // See also: cmpFastLock and cmpFastUnlock.
  //
  // What follows is a specialized inline transliteration of the code
  // in slow_enter() and slow_exit().  If we're concerned about I$ bloat
  // another option would be to emit TrySlowEnter and TrySlowExit methods
  // at startup-time.  These methods would accept arguments as
  // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
  // indications in the icc.ZFlag.  Fast_Lock and Fast_Unlock would simply
  // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
  // In practice, however, the # of lock sites is bounded and is usually small.
  // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
  // if the processor uses simple bimodal branch predictors keyed by EIP
  // Since the helper routines would be called from multiple synchronization
  // sites.
  //
  // An even better approach would be write "MonitorEnter()" and "MonitorExit()"
  // in java - using j.u.c and unsafe - and just bind the lock and unlock sites
  // to those specialized methods.  That'd give us a mostly platform-independent
  // implementation that the JITs could optimize and inline at their pleasure.
  // Done correctly, the only time we'd need to cross to native could would be
  // to park() or unpark() threads.  We'd also need a few more unsafe operators
  // to (a) prevent compiler-JIT reordering of non-volatile accesses, and
  // (b) explicit barriers or fence operations.
  //
  // TODO:
  //
  // *  Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr).
  //    This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals.
  //    Given TLAB allocation, Self is usually manifested in a register, so passing it into
  //    the lock operators would typically be faster than reifying Self.
  //
  // *  Ideally I'd define the primitives as:
  //       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
  //       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
  //    Unfortunately ADLC bugs prevent us from expressing the ideal form.
  //    Instead, we're stuck with a rather awkward and brittle register assignments below.
  //    Furthermore the register assignments are overconstrained, possibly resulting in
  //    sub-optimal code near the synchronization site.
  //
  // *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
  //    Alternately, use a better sp-proximity test.
  //
  // *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
  //    Either one is sufficient to uniquely identify a thread.
  //    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
  //
  // *  Intrinsify notify() and notifyAll() for the common cases where the
  //    object is locked by the calling thread but the waitlist is empty.
  //    avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
  //
  // *  use jccb and jmpb instead of jcc and jmp to improve code density.
  //    But beware of excessive branch density on AMD Opterons.
  //
  // *  Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success
  //    or failure of the fast-path.  If the fast-path fails then we pass
  //    control to the slow-path, typically in C.  In Fast_Lock and
  //    Fast_Unlock we often branch to DONE_LABEL, just to find that C2
  //    will emit a conditional branch immediately after the node.
  //    So we have branches to branches and lots of ICC.ZF games.
  //    Instead, it might be better to have C2 pass a "FailureLabel"
  //    into Fast_Lock and Fast_Unlock.  In the case of success, control
  //    will drop through the node.  ICC.ZF is undefined at exit.
  //    In the case of failure, the node will branch directly to the
  //    FailureLabel


  // obj: object to lock
  // box: on-stack box address (displaced header location) - KILLED
  // rax,: tmp -- KILLED
  // scr: tmp -- KILLED
  enc_class Fast_Lock( eRegP obj, eRegP box, eAXRegI tmp, eRegP scr ) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);
    Register scrReg = as_Register($scr$$reg);

    // Ensure the register assignents are disjoint
    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (objReg != scrReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg != scrReg, "") ;
    guarantee (tmpReg == as_Register(EAX_enc), "") ;

    MacroAssembler masm(&cbuf);

    if (_counters != NULL) {
      masm.atomic_incl(ExternalAddress((address) _counters->total_entry_count_addr()));
    }
    if (EmitSync & 1) {
        // set box->dhw = unused_mark (3)
        // Force all sync thru slow-path: slow_enter() and slow_exit() 
        masm.movptr (Address(boxReg, 0), int32_t(markOopDesc::unused_mark())) ;             
        masm.cmpptr (rsp, (int32_t)0) ;                        
    } else 
    if (EmitSync & 2) { 
        Label DONE_LABEL ;           
        if (UseBiasedLocking) {
           // Note: tmpReg maps to the swap_reg argument and scrReg to the tmp_reg argument.
           masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
        }

        masm.movptr(tmpReg, Address(objReg, 0)) ;          // fetch markword 
        masm.orptr (tmpReg, 0x1);
        masm.movptr(Address(boxReg, 0), tmpReg);           // Anticipate successful CAS 
        if (os::is_MP()) { masm.lock();  }
        masm.cmpxchgptr(boxReg, Address(objReg, 0));          // Updates tmpReg
        masm.jcc(Assembler::equal, DONE_LABEL);
        // Recursive locking
        masm.subptr(tmpReg, rsp);
        masm.andptr(tmpReg, (int32_t) 0xFFFFF003 );
        masm.movptr(Address(boxReg, 0), tmpReg);
        masm.bind(DONE_LABEL) ; 
    } else {  
      // Possible cases that we'll encounter in fast_lock 
      // ------------------------------------------------
      // * Inflated
      //    -- unlocked
      //    -- Locked
      //       = by self
      //       = by other
      // * biased
      //    -- by Self
      //    -- by other
      // * neutral
      // * stack-locked
      //    -- by self
      //       = sp-proximity test hits
      //       = sp-proximity test generates false-negative
      //    -- by other
      //

      Label IsInflated, DONE_LABEL, PopDone ;

      // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
      // order to reduce the number of conditional branches in the most common cases.
      // Beware -- there's a subtle invariant that fetch of the markword
      // at [FETCH], below, will never observe a biased encoding (*101b).
      // If this invariant is not held we risk exclusion (safety) failure.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
        masm.biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, _counters);
      }

      masm.movptr(tmpReg, Address(objReg, 0)) ;         // [FETCH]
      masm.testptr(tmpReg, 0x02) ;                      // Inflated v (Stack-locked or neutral)
      masm.jccb  (Assembler::notZero, IsInflated) ;

      // Attempt stack-locking ...
      masm.orptr (tmpReg, 0x1);
      masm.movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
      if (os::is_MP()) { masm.lock();  }
      masm.cmpxchgptr(boxReg, Address(objReg, 0));           // Updates tmpReg
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jccb (Assembler::equal, DONE_LABEL);

      // Recursive locking
      masm.subptr(tmpReg, rsp);
      masm.andptr(tmpReg, 0xFFFFF003 );
      masm.movptr(Address(boxReg, 0), tmpReg);
      if (_counters != NULL) {
        masm.cond_inc32(Assembler::equal,
                        ExternalAddress((address)_counters->fast_path_entry_count_addr()));
      }
      masm.jmp  (DONE_LABEL) ;

      masm.bind (IsInflated) ;

      // The object is inflated.
      //
      // TODO-FIXME: eliminate the ugly use of manifest constants:
      //   Use markOopDesc::monitor_value instead of "2".
      //   use markOop::unused_mark() instead of "3".
      // The tmpReg value is an objectMonitor reference ORed with
      // markOopDesc::monitor_value (2).   We can either convert tmpReg to an
      // objectmonitor pointer by masking off the "2" bit or we can just
      // use tmpReg as an objectmonitor pointer but bias the objectmonitor
      // field offsets with "-2" to compensate for and annul the low-order tag bit.
      //
      // I use the latter as it avoids AGI stalls.
      // As such, we write "mov r, [tmpReg+OFFSETOF(Owner)-2]"
      // instead of "mov r, [tmpReg+OFFSETOF(Owner)]".
      //
      #define OFFSET_SKEWED(f) ((ObjectMonitor::f ## _offset_in_bytes())-2)

      // boxReg refers to the on-stack BasicLock in the current frame.
      // We'd like to write:
      //   set box->_displaced_header = markOop::unused_mark().  Any non-0 value suffices.
      // This is convenient but results a ST-before-CAS penalty.  The following CAS suffers
      // additional latency as we have another ST in the store buffer that must drain.

      if (EmitSync & 8192) { 
         masm.movptr(Address(boxReg, 0), 3) ;            // results in ST-before-CAS penalty
         masm.get_thread (scrReg) ; 
         masm.movptr(boxReg, tmpReg);                    // consider: LEA box, [tmp-2] 
         masm.movptr(tmpReg, NULL_WORD);                 // consider: xor vs mov
         if (os::is_MP()) { masm.lock(); } 
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
      } else 
      if ((EmitSync & 128) == 0) {                      // avoid ST-before-CAS
         masm.movptr(scrReg, boxReg) ; 
         masm.movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2] 

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form: consider XORL tmpReg,tmpReg
           masm.movptr(tmpReg, NULL_WORD) ; 
         } else { 
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           // Test-And-CAS instead of CAS
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Ideally, I'd manifest "Self" with get_thread and then attempt
         // to CAS the register containing Self into m->Owner.
         // But we don't have enough registers, so instead we can either try to CAS
         // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
         // we later store "Self" into m->Owner.  Transiently storing a stack address
         // (rsp or the address of the box) into  m->owner is harmless.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         if (os::is_MP()) { masm.lock();  }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ; 
         masm.movptr(Address(scrReg, 0), 3) ;          // box->_displaced_header = 3
         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
         masm.get_thread (scrReg) ;                    // beware: clobbers ICCs
         masm.movptr(Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2), scrReg) ; 
         masm.xorptr(boxReg, boxReg) ;                 // set icc.ZFlag = 1 to indicate success
                       
         // If the CAS fails we can either retry or pass control to the slow-path.  
         // We use the latter tactic.  
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      } else {
         masm.movptr(Address(boxReg, 0), 3) ;       // results in ST-before-CAS penalty
         masm.movptr(boxReg, tmpReg) ; 

         // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
         if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
            // prefetchw [eax + Offset(_owner)-2]
            masm.prefetchw(Address(rax, ObjectMonitor::owner_offset_in_bytes()-2));
         }

         if ((EmitSync & 64) == 0) {
           // Optimistic form
           masm.xorptr  (tmpReg, tmpReg) ; 
         } else { 
           // Can suffer RTS->RTO upgrades on shared or cold $ lines
           masm.movptr(tmpReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;   // rax, = m->_owner
           masm.testptr(tmpReg, tmpReg) ;                   // Locked ? 
           masm.jccb  (Assembler::notZero, DONE_LABEL) ;                   
         }

         // Appears unlocked - try to swing _owner from null to non-null.
         // Use either "Self" (in scr) or rsp as thread identity in _owner.
         // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
         masm.get_thread (scrReg) ;
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(scrReg, Address(boxReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;

         // If the CAS fails we can either retry or pass control to the slow-path.
         // We use the latter tactic.
         // Pass the CAS result in the icc.ZFlag into DONE_LABEL
         // If the CAS was successful ...
         //   Self has acquired the lock
         //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
         // Intentional fall-through into DONE_LABEL ...
      }

      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      masm.bind(DONE_LABEL);

      // Avoid branch-to-branch on AMD processors
      // This appears to be superstition.
      if (EmitSync & 32) masm.nop() ;


      // At DONE_LABEL the icc ZFlag is set as follows ...
      // Fast_Unlock uses the same protocol.
      // ZFlag == 1 -> Success
      // ZFlag == 0 -> Failure - force control through the slow-path
    }
  %}

  // obj: object to unlock
  // box: box address (displaced header location), killed.  Must be EAX.
  // rbx,: killed tmp; cannot be obj nor box.
  //
  // Some commentary on balanced locking:
  //
  // Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
  // Methods that don't have provably balanced locking are forced to run in the
  // interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
  // The interpreter provides two properties:
  // I1:  At return-time the interpreter automatically and quietly unlocks any
  //      objects acquired the current activation (frame).  Recall that the
  //      interpreter maintains an on-stack list of locks currently held by
  //      a frame.
  // I2:  If a method attempts to unlock an object that is not held by the
  //      the frame the interpreter throws IMSX.
  //
  // Lets say A(), which has provably balanced locking, acquires O and then calls B().
  // B() doesn't have provably balanced locking so it runs in the interpreter.
  // Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
  // is still locked by A().
  //
  // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
  // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
  // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
  // doesn't specify what will occur if a program engages in such mixed-mode locking, however.

  enc_class Fast_Unlock( nabxRegP obj, eAXRegP box, eRegP tmp) %{

    Register objReg = as_Register($obj$$reg);
    Register boxReg = as_Register($box$$reg);
    Register tmpReg = as_Register($tmp$$reg);

    guarantee (objReg != boxReg, "") ;
    guarantee (objReg != tmpReg, "") ;
    guarantee (boxReg != tmpReg, "") ;
    guarantee (boxReg == as_Register(EAX_enc), "") ;
    MacroAssembler masm(&cbuf);

    if (EmitSync & 4) {
      // Disable - inhibit all inlining.  Force control through the slow-path
      masm.cmpptr (rsp, 0) ; 
    } else 
    if (EmitSync & 8) {
      Label DONE_LABEL ;
      if (UseBiasedLocking) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }
      // classic stack-locking code ...
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      masm.testptr(tmpReg, tmpReg) ;
      masm.jcc   (Assembler::zero, DONE_LABEL) ;
      if (os::is_MP()) { masm.lock(); }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0));          // Uses EAX which is box
      masm.bind(DONE_LABEL);
    } else {
      Label DONE_LABEL, Stacked, CheckSucc, Inflated ;

      // Critically, the biased locking test must have precedence over
      // and appear before the (box->dhw == 0) recursive stack-lock test.
      if (UseBiasedLocking && !UseOptoBiasInlining) {
         masm.biased_locking_exit(objReg, tmpReg, DONE_LABEL);
      }
      
      masm.cmpptr(Address(boxReg, 0), 0) ;            // Examine the displaced header
      masm.movptr(tmpReg, Address(objReg, 0)) ;       // Examine the object's markword
      masm.jccb  (Assembler::zero, DONE_LABEL) ;      // 0 indicates recursive stack-lock

      masm.testptr(tmpReg, 0x02) ;                     // Inflated? 
      masm.jccb  (Assembler::zero, Stacked) ;

      masm.bind  (Inflated) ;
      // It's inflated.
      // Despite our balanced locking property we still check that m->_owner == Self
      // as java routines or native JNI code called by this thread might
      // have released the lock.
      // Refer to the comments in synchronizer.cpp for how we might encode extra
      // state in _succ so we can avoid fetching EntryList|cxq.
      //
      // I'd like to add more cases in fast_lock() and fast_unlock() --
      // such as recursive enter and exit -- but we have to be wary of
      // I$ bloat, T$ effects and BP$ effects.
      //
      // If there's no contention try a 1-0 exit.  That is, exit without
      // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
      // we detect and recover from the race that the 1-0 exit admits.
      //
      // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier
      // before it STs null into _owner, releasing the lock.  Updates
      // to data protected by the critical section must be visible before
      // we drop the lock (and thus before any other thread could acquire
      // the lock and observe the fields protected by the lock).
      // IA32's memory-model is SPO, so STs are ordered with respect to
      // each other and there's no need for an explicit barrier (fence).
      // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.

      masm.get_thread (boxReg) ;
      if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
        // prefetchw [ebx + Offset(_owner)-2]
        masm.prefetchw(Address(rbx, ObjectMonitor::owner_offset_in_bytes()-2));
      }

      // Note that we could employ various encoding schemes to reduce
      // the number of loads below (currently 4) to just 2 or 3.
      // Refer to the comments in synchronizer.cpp.
      // In practice the chain of fetches doesn't seem to impact performance, however.
      if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
         // Attempt to reduce branch density - AMD's branch predictor.
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
         masm.jmpb  (DONE_LABEL) ; 
      } else { 
         masm.xorptr(boxReg, Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2)) ;  
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::recursions_offset_in_bytes()-2)) ;
         masm.jccb  (Assembler::notZero, DONE_LABEL) ; 
         masm.movptr(boxReg, Address (tmpReg, ObjectMonitor::EntryList_offset_in_bytes()-2)) ; 
         masm.orptr(boxReg, Address (tmpReg, ObjectMonitor::cxq_offset_in_bytes()-2)) ; 
         masm.jccb  (Assembler::notZero, CheckSucc) ; 
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
         masm.jmpb  (DONE_LABEL) ; 
      }

      // The Following code fragment (EmitSync & 65536) improves the performance of
      // contended applications and contended synchronization microbenchmarks.
      // Unfortunately the emission of the code - even though not executed - causes regressions
      // in scimark and jetstream, evidently because of $ effects.  Replacing the code
      // with an equal number of never-executed NOPs results in the same regression.
      // We leave it off by default.

      if ((EmitSync & 65536) != 0) {
         Label LSuccess, LGoSlowPath ;

         masm.bind  (CheckSucc) ;

         // Optional pre-test ... it's safe to elide this
         if ((EmitSync & 16) == 0) { 
            masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
            masm.jccb  (Assembler::zero, LGoSlowPath) ; 
         }

         // We have a classic Dekker-style idiom:
         //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
         // There are a number of ways to implement the barrier:
         // (1) lock:andl &m->_owner, 0
         //     is fast, but mask doesn't currently support the "ANDL M,IMM32" form.
         //     LOCK: ANDL [ebx+Offset(_Owner)-2], 0
         //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
         // (2) If supported, an explicit MFENCE is appealing.
         //     In older IA32 processors MFENCE is slower than lock:add or xchg
         //     particularly if the write-buffer is full as might be the case if
         //     if stores closely precede the fence or fence-equivalent instruction.
         //     In more modern implementations MFENCE appears faster, however.
         // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
         //     The $lines underlying the top-of-stack should be in M-state.
         //     The locked add instruction is serializing, of course.
         // (4) Use xchg, which is serializing
         //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
         // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0.
         //     The integer condition codes will tell us if succ was 0.
         //     Since _succ and _owner should reside in the same $line and
         //     we just stored into _owner, it's likely that the $line
         //     remains in M-state for the lock:orl.
         //
         // We currently use (3), although it's likely that switching to (2)
         // is correct for the future.
            
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), NULL_WORD) ; 
         if (os::is_MP()) { 
            if (VM_Version::supports_sse2() && 1 == FenceInstruction) { 
              masm.mfence();
            } else { 
              masm.lock () ; masm.addptr(Address(rsp, 0), 0) ; 
            }
         }
         // Ratify _succ remains non-null
         masm.cmpptr(Address (tmpReg, ObjectMonitor::succ_offset_in_bytes()-2), 0) ; 
         masm.jccb  (Assembler::notZero, LSuccess) ; 

         masm.xorptr(boxReg, boxReg) ;                  // box is really EAX
         if (os::is_MP()) { masm.lock(); }
         masm.cmpxchgptr(rsp, Address(tmpReg, ObjectMonitor::owner_offset_in_bytes()-2));
         masm.jccb  (Assembler::notEqual, LSuccess) ;
         // Since we're low on registers we installed rsp as a placeholding in _owner.
         // Now install Self over rsp.  This is safe as we're transitioning from
         // non-null to non=null
         masm.get_thread (boxReg) ;
         masm.movptr(Address (tmpReg, ObjectMonitor::owner_offset_in_bytes()-2), boxReg) ;
         // Intentional fall-through into LGoSlowPath ...

         masm.bind  (LGoSlowPath) ; 
         masm.orptr(boxReg, 1) ;                      // set ICC.ZF=0 to indicate failure
         masm.jmpb  (DONE_LABEL) ; 

         masm.bind  (LSuccess) ; 
         masm.xorptr(boxReg, boxReg) ;                 // set ICC.ZF=1 to indicate success
         masm.jmpb  (DONE_LABEL) ; 
      }

      masm.bind (Stacked) ;
      // It's not inflated and it's not recursively stack-locked and it's not biased.
      // It must be stack-locked.
      // Try to reset the header to displaced header.
      // The "box" value on the stack is stable, so we can reload
      // and be assured we observe the same value as above.
      masm.movptr(tmpReg, Address(boxReg, 0)) ;
      if (os::is_MP()) {   masm.lock();    }
      masm.cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses EAX which is box
      // Intention fall-thru into DONE_LABEL


      // DONE_LABEL is a hot target - we'd really like to place it at the
      // start of cache line by padding with NOPs.
      // See the AMD and Intel software optimization manuals for the
      // most efficient "long" NOP encodings.
      // Unfortunately none of our alignment mechanisms suffice.
      if ((EmitSync & 65536) == 0) {
         masm.bind (CheckSucc) ;
      }
      masm.bind(DONE_LABEL);

      // Avoid branch to branch on AMD processors
      if (EmitSync & 32768) { masm.nop() ; }
    }
  %}


  enc_class enc_pop_rdx() %{
    emit_opcode(cbuf,0x5A);
  %}

  enc_class enc_rethrow() %{
    cbuf.set_insts_mark();
    emit_opcode(cbuf, 0xE9);        // jmp    entry
    emit_d32_reloc(cbuf, (int)OptoRuntime::rethrow_stub() - ((int)cbuf.insts_end())-4,
                   runtime_call_Relocation::spec(), RELOC_IMM32 );
  %}


  // Convert a double to an int.  Java semantics require we do complex
  // manglelations in the corner cases.  So we set the rounding mode to
  // 'zero', store the darned double down as an int, and reset the
  // rounding mode to 'nearest'.  The hardware throws an exception which
  // patches up the correct value directly to the stack.
  enc_class DPR2I_encoding( regDPR src ) %{
    // Flip to round-to-zero mode.  We attempted to allow invalid-op
    // exceptions here, so that a NAN or other corner-case value will
    // thrown an exception (but normal values get converted at full speed).
    // However, I2C adapters and other float-stack manglers leave pending
    // invalid-op exceptions hanging.  We would have to clear them before
    // enabling them and that is more expensive than just testing for the
    // invalid value Intel stores down in the corner cases.
    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
    // Allocate a word
    emit_opcode(cbuf,0x83);            // SUB ESP,4
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x04);
    // Encoding assumes a double has been pushed into FPR0.
    // Store down the double as an int, popping the FPU stack
    emit_opcode(cbuf,0xDB);            // FISTP [ESP]
    emit_opcode(cbuf,0x1C);
    emit_d8(cbuf,0x24);
    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted int; adjust CPU stack
    emit_opcode(cbuf,0x58);       // POP EAX
    emit_opcode(cbuf,0x3D);       // CMP EAX,imm
    emit_d32   (cbuf,0x80000000); //         0x80000000
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07);       // Size of slow_call
    // Push src onto stack slow-path
    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
    emit_d8    (cbuf,0xC0-1+$src$$reg );
    // CALL directly to the runtime
    cbuf.set_insts_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2i_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}

  enc_class DPR2L_encoding( regDPR src ) %{
    emit_opcode(cbuf,0xD9);            // FLDCW  trunc
    emit_opcode(cbuf,0x2D);
    emit_d32(cbuf,(int)StubRoutines::addr_fpu_cntrl_wrd_trunc());
    // Allocate a word
    emit_opcode(cbuf,0x83);            // SUB ESP,8
    emit_opcode(cbuf,0xEC);
    emit_d8(cbuf,0x08);
    // Encoding assumes a double has been pushed into FPR0.
    // Store down the double as a long, popping the FPU stack
    emit_opcode(cbuf,0xDF);            // FISTP [ESP]
    emit_opcode(cbuf,0x3C);
    emit_d8(cbuf,0x24);
    // Restore the rounding mode; mask the exception
    emit_opcode(cbuf,0xD9);            // FLDCW   std/24-bit mode
    emit_opcode(cbuf,0x2D);
    emit_d32( cbuf, Compile::current()->in_24_bit_fp_mode()
        ? (int)StubRoutines::addr_fpu_cntrl_wrd_24()
        : (int)StubRoutines::addr_fpu_cntrl_wrd_std());

    // Load the converted int; adjust CPU stack
    emit_opcode(cbuf,0x58);       // POP EAX
    emit_opcode(cbuf,0x5A);       // POP EDX
    emit_opcode(cbuf,0x81);       // CMP EDX,imm
    emit_d8    (cbuf,0xFA);       // rdx
    emit_d32   (cbuf,0x80000000); //         0x80000000
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07+4);     // Size of slow_call
    emit_opcode(cbuf,0x85);       // TEST EAX,EAX
    emit_opcode(cbuf,0xC0);       // 2/rax,/rax,
    emit_opcode(cbuf,0x75);       // JNE around_slow_call
    emit_d8    (cbuf,0x07);       // Size of slow_call
    // Push src onto stack slow-path
    emit_opcode(cbuf,0xD9 );      // FLD     ST(i)
    emit_d8    (cbuf,0xC0-1+$src$$reg );
    // CALL directly to the runtime
    cbuf.set_insts_mark();
    emit_opcode(cbuf,0xE8);       // Call into runtime
    emit_d32_reloc(cbuf, (StubRoutines::d2l_wrapper() - cbuf.insts_end()) - 4, runtime_call_Relocation::spec(), RELOC_IMM32 );
    // Carry on here...
  %}

  enc_class FMul_ST_reg( eRegFPR src1 ) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FMUL   ST,$src  /* D8 C8+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC8 + $src1$$reg);
  %}

  enc_class FAdd_ST_reg( eRegFPR src2 ) %{
    // FADDP  ST,src2  /* D8 C0+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src2$$reg);
    //could use FADDP  src2,fpST  /* DE C0+i */
  %}

  enc_class FAddP_reg_ST( eRegFPR src2 ) %{
    // FADDP  src2,ST  /* DE C0+i */
    emit_opcode(cbuf, 0xDE);
    emit_opcode(cbuf, 0xC0 + $src2$$reg);
  %}

  enc_class subFPR_divFPR_encode( eRegFPR src1, eRegFPR src2) %{
    // Operand has been loaded into fp ST (stack top)
      // FSUB   ST,$src1
      emit_opcode(cbuf, 0xD8);
      emit_opcode(cbuf, 0xE0 + $src1$$reg);

      // FDIV
      emit_opcode(cbuf, 0xD8);
      emit_opcode(cbuf, 0xF0 + $src2$$reg);
  %}

  enc_class MulFAddF (eRegFPR src1, eRegFPR src2) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FADD   ST,$src  /* D8 C0+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src1$$reg);

    // FMUL  ST,src2  /* D8 C*+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  %}


  enc_class MulFAddFreverse (eRegFPR src1, eRegFPR src2) %{
    // Operand was loaded from memory into fp ST (stack top)
    // FADD   ST,$src  /* D8 C0+i */
    emit_opcode(cbuf, 0xD8);
    emit_opcode(cbuf, 0xC0 + $src1$$reg);

    // FMULP  src2,ST  /* DE C8+i */
    emit_opcode(cbuf, 0xDE);
    emit_opcode(cbuf, 0xC8 + $src2$$reg);
  %}

  // Atomically load the volatile long
  enc_class enc_loadL_volatile( memory mem, stackSlotL dst ) %{
    emit_opcode(cbuf,0xDF);
    int rm_byte_opcode = 0x05;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    relocInfo::relocType disp_reloc = $mem->disp_reloc(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_reloc);
    store_to_stackslot( cbuf, 0x0DF, 0x07, $dst$$disp );
  %}

  // Volatile Store Long.  Must be atomic, so move it into
  // the FP TOS and then do a 64-bit FIST.  Has to probe the
  // target address before the store (for null-ptr checks)
  // so the memory operand is used twice in the encoding.
  enc_class enc_storeL_volatile( memory mem, stackSlotL src ) %{
    store_to_stackslot( cbuf, 0x0DF, 0x05, $src$$disp );
    cbuf.set_insts_mark();            // Mark start of FIST in case $mem has an oop
    emit_opcode(cbuf,0xDF);
    int rm_byte_opcode = 0x07;
    int base     = $mem$$base;
    int index    = $mem$$index;
    int scale    = $mem$$scale;
    int displace = $mem$$disp;
    relocInfo::relocType disp_reloc = $mem->disp_reloc(); // disp-as-oop when working with static globals
    encode_RegMem(cbuf, rm_byte_opcode, base, index, scale, displace, disp_reloc);
  %}

  // Safepoint Poll.  This polls the safepoint page, and causes an
  // exception if it is not readable. Unfortunately, it kills the condition code
  // in the process
  // We current use TESTL [spp],EDI
  // A better choice might be TESTB [spp + pagesize() - CacheLineSize()],0

  enc_class Safepoint_Poll() %{
    cbuf.relocate(cbuf.insts_mark(), relocInfo::poll_type, 0);
    emit_opcode(cbuf,0x85);
    emit_rm (cbuf, 0x0, 0x7, 0x5);
    emit_d32(cbuf, (intptr_t)os::get_polling_page());
  %}
%}


//----------FRAME--------------------------------------------------------------
// Definition of frame structure and management information.
//
//  S T A C K   L A Y O U T    Allocators stack-slot number
//                             |   (to get allocators register number
//  G  Owned by    |        |  v    add OptoReg::stack0())
//  r   CALLER     |        |
//  o     |        +--------+      pad to even-align allocators stack-slot
//  w     V        |  pad0  |        numbers; owned by CALLER
//  t   -----------+--------+----> Matcher::_in_arg_limit, unaligned
//  h     ^        |   in   |  5
//        |        |  args  |  4   Holes in incoming args owned by SELF
//  |     |        |        |  3
//  |     |        +--------+
//  V     |        | old out|      Empty on Intel, window on Sparc
//        |    old |preserve|      Must be even aligned.
//        |     SP-+--------+----> Matcher::_old_SP, even aligned
//        |        |   in   |  3   area for Intel ret address
//     Owned by    |preserve|      Empty on Sparc.
//       SELF      +--------+
//        |        |  pad2  |  2   pad to align old SP
//        |        +--------+  1
//        |        | locks  |  0
//        |        +--------+----> OptoReg::stack0(), even aligned
//        |        |  pad1  | 11   pad to align new SP
//        |        +--------+
//        |        |        | 10
//        |        | spills |  9   spills
//        V        |        |  8   (pad0 slot for callee)
//      -----------+--------+----> Matcher::_out_arg_limit, unaligned
//        ^        |  out   |  7
//        |        |  args  |  6   Holes in outgoing args owned by CALLEE
//     Owned by    +--------+
//      CALLEE     | new out|  6   Empty on Intel, window on Sparc
//        |    new |preserve|      Must be even-aligned.
//        |     SP-+--------+----> Matcher::_new_SP, even aligned
//        |        |        |
//
// Note 1: Only region 8-11 is determined by the allocator.  Region 0-5 is
//         known from SELF's arguments and the Java calling convention.
//         Region 6-7 is determined per call site.
// Note 2: If the calling convention leaves holes in the incoming argument
//         area, those holes are owned by SELF.  Holes in the outgoing area
//         are owned by the CALLEE.  Holes should not be nessecary in the
//         incoming area, as the Java calling convention is completely under
//         the control of the AD file.  Doubles can be sorted and packed to
//         avoid holes.  Holes in the outgoing arguments may be nessecary for
//         varargs C calling conventions.
// Note 3: Region 0-3 is even aligned, with pad2 as needed.  Region 3-5 is
//         even aligned with pad0 as needed.
//         Region 6 is even aligned.  Region 6-7 is NOT even aligned;
//         region 6-11 is even aligned; it may be padded out more so that
//         the region from SP to FP meets the minimum stack alignment.

frame %{
  // What direction does stack grow in (assumed to be same for C & Java)
  stack_direction(TOWARDS_LOW);

  // These three registers define part of the calling convention
  // between compiled code and the interpreter.
  inline_cache_reg(EAX);                // Inline Cache Register
  interpreter_method_oop_reg(EBX);      // Method Oop Register when calling interpreter

  // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset]
  cisc_spilling_operand_name(indOffset32);

  // Number of stack slots consumed by locking an object
  sync_stack_slots(1);

  // Compiled code's Frame Pointer
  frame_pointer(ESP);
  // Interpreter stores its frame pointer in a register which is
  // stored to the stack by I2CAdaptors.
  // I2CAdaptors convert from interpreted java to compiled java.
  interpreter_frame_pointer(EBP);

  // Stack alignment requirement
  // Alignment size in bytes (128-bit -> 16 bytes)
  stack_alignment(StackAlignmentInBytes);

  // Number of stack slots between incoming argument block and the start of
  // a new frame.  The PROLOG must add this many slots to the stack.  The
  // EPILOG must remove this many slots.  Intel needs one slot for
  // return address and one for rbp, (must save rbp)
  in_preserve_stack_slots(2+VerifyStackAtCalls);

  // Number of outgoing stack slots killed above the out_preserve_stack_slots
  // for calls to C.  Supports the var-args backing area for register parms.
  varargs_C_out_slots_killed(0);

  // The after-PROLOG location of the return address.  Location of
  // return address specifies a type (REG or STACK) and a number
  // representing the register number (i.e. - use a register name) or
  // stack slot.
  // Ret Addr is on stack in slot 0 if no locks or verification or alignment.
  // Otherwise, it is above the locks and verification slot and alignment word
  return_addr(STACK - 1 +
              round_to((Compile::current()->in_preserve_stack_slots() +
                        Compile::current()->fixed_slots()),
                       stack_alignment_in_slots()));

  // Body of function which returns an integer array locating
  // arguments either in registers or in stack slots.  Passed an array
  // of ideal registers called "sig" and a "length" count.  Stack-slot
  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  // arguments for a CALLEE.  Incoming stack arguments are
  // automatically biased by the preserve_stack_slots field above.
  calling_convention %{
    // No difference between ingoing/outgoing just pass false
    SharedRuntime::java_calling_convention(sig_bt, regs, length, false);
  %}


  // Body of function which returns an integer array locating
  // arguments either in registers or in stack slots.  Passed an array
  // of ideal registers called "sig" and a "length" count.  Stack-slot
  // offsets are based on outgoing arguments, i.e. a CALLER setting up
  // arguments for a CALLEE.  Incoming stack arguments are
  // automatically biased by the preserve_stack_slots field above.
  c_calling_convention %{
    // This is obviously always outgoing
    (void) SharedRuntime::c_calling_convention(sig_bt, regs, length);
  %}

  // Location of C & interpreter return values
  c_return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };

    // in SSE2+ mode we want to keep the FPU stack clean so pretend
    // that C functions return float and double results in XMM0.
    if( ideal_reg == Op_RegD && UseSSE>=2 )
      return OptoRegPair(XMM0b_num,XMM0_num);
    if( ideal_reg == Op_RegF && UseSSE>=2 )
      return OptoRegPair(OptoReg::Bad,XMM0_num);

    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}

  // Location of return values
  return_value %{
    assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" );
    static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, EAX_num,      EAX_num,      FPR1L_num,    FPR1L_num, EAX_num };
    static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, FPR1H_num, EDX_num };
    if( ideal_reg == Op_RegD && UseSSE>=2 )
      return OptoRegPair(XMM0b_num,XMM0_num);
    if( ideal_reg == Op_RegF && UseSSE>=1 )
      return OptoRegPair(OptoReg::Bad,XMM0_num);
    return OptoRegPair(hi[ideal_reg],lo[ideal_reg]);
  %}

%}

//----------ATTRIBUTES---------------------------------------------------------
//----------Operand Attributes-------------------------------------------------
op_attrib op_cost(0);        // Required cost attribute

//----------Instruction Attributes---------------------------------------------
ins_attrib ins_cost(100);       // Required cost attribute
ins_attrib ins_size(8);         // Required size attribute (in bits)
ins_attrib ins_short_branch(0); // Required flag: is this instruction a
                                // non-matching short branch variant of some
                                                            // long branch?
ins_attrib ins_alignment(1);    // Required alignment attribute (must be a power of 2)
                                // specifies the alignment that some part of the instruction (not
                                // necessarily the start) requires.  If > 1, a compute_padding()
                                // function must be provided for the instruction

//----------OPERANDS-----------------------------------------------------------
// Operand definitions must precede instruction definitions for correct parsing
// in the ADLC because operands constitute user defined types which are used in
// instruction definitions.

//----------Simple Operands----------------------------------------------------
// Immediate Operands
// Integer Immediate
operand immI() %{
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for test vs zero
operand immI0() %{
  predicate(n->get_int() == 0);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for increment
operand immI1() %{
  predicate(n->get_int() == 1);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for decrement
operand immI_M1() %{
  predicate(n->get_int() == -1);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Valid scale values for addressing modes
operand immI2() %{
  predicate(0 <= n->get_int() && (n->get_int() <= 3));
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI8() %{
  predicate((-128 <= n->get_int()) && (n->get_int() <= 127));
  match(ConI);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI16() %{
  predicate((-32768 <= n->get_int()) && (n->get_int() <= 32767));
  match(ConI);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// Int Immediate non-negative
operand immU31()
%{
  predicate(n->get_int() >= 0);
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Constant for long shifts
operand immI_32() %{
  predicate( n->get_int() == 32 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_1_31() %{
  predicate( n->get_int() >= 1 && n->get_int() <= 31 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_32_63() %{
  predicate( n->get_int() >= 32 && n->get_int() <= 63 );
  match(ConI);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI_1() %{
  predicate( n->get_int() == 1 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_2() %{
  predicate( n->get_int() == 2 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

operand immI_3() %{
  predicate( n->get_int() == 3 );
  match(ConI);

  op_cost(0);
  format %{ %}
  interface(CONST_INTER);
%}

// Pointer Immediate
operand immP() %{
  match(ConP);

  op_cost(10);
  format %{ %}
  interface(CONST_INTER);
%}

// NULL Pointer Immediate
operand immP0() %{
  predicate( n->get_ptr() == 0 );
  match(ConP);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate
operand immL() %{
  match(ConL);

  op_cost(20);
  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate zero
operand immL0() %{
  predicate( n->get_long() == 0L );
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate zero
operand immL_M1() %{
  predicate( n->get_long() == -1L );
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long immediate from 0 to 127.
// Used for a shorter form of long mul by 10.
operand immL_127() %{
  predicate((0 <= n->get_long()) && (n->get_long() <= 127));
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: low 32-bit mask
operand immL_32bits() %{
  predicate(n->get_long() == 0xFFFFFFFFL);
  match(ConL);
  op_cost(0);

  format %{ %}
  interface(CONST_INTER);
%}

// Long Immediate: low 32-bit mask
operand immL32() %{
  predicate(n->get_long() == (int)(n->get_long()));
  match(ConL);
  op_cost(20);

  format %{ %}
  interface(CONST_INTER);
%}

//Double Immediate zero
operand immDPR0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0
  predicate( UseSSE<=1 && n->getd() == 0.0 && !g_isnan(n->getd()) );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate one
operand immDPR1() %{
  predicate( UseSSE<=1 && n->getd() == 1.0 );
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate
operand immDPR() %{
  predicate(UseSSE<=1);
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

operand immD() %{
  predicate(UseSSE>=2);
  match(ConD);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Double Immediate zero
operand immD0() %{
  // Do additional (and counter-intuitive) test against NaN to work around VC++
  // bug that generates code such that NaNs compare equal to 0.0 AND do not
  // compare equal to -0.0.
  predicate( UseSSE>=2 && jlong_cast(n->getd()) == 0 );
  match(ConD);

  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero
operand immFPR0() %{
  predicate(UseSSE == 0 && n->getf() == 0.0F);
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate one
operand immFPR1() %{
  predicate(UseSSE == 0 && n->getf() == 1.0F);
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate
operand immFPR() %{
  predicate( UseSSE == 0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate
operand immF() %{
  predicate(UseSSE >= 1);
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Float Immediate zero.  Zero and not -0.0
operand immF0() %{
  predicate( UseSSE >= 1 && jint_cast(n->getf()) == 0 );
  match(ConF);

  op_cost(5);
  format %{ %}
  interface(CONST_INTER);
%}

// Immediates for special shifts (sign extend)

// Constants for increment
operand immI_16() %{
  predicate( n->get_int() == 16 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

operand immI_24() %{
  predicate( n->get_int() == 24 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for byte-wide masking
operand immI_255() %{
  predicate( n->get_int() == 255 );
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Constant for short-wide masking
operand immI_65535() %{
  predicate(n->get_int() == 65535);
  match(ConI);

  format %{ %}
  interface(CONST_INTER);
%}

// Register Operands
// Integer Register
operand rRegI() %{
  constraint(ALLOC_IN_RC(int_reg));
  match(RegI);
  match(xRegI);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eDIRegI);
  match(eSIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Subset of Integer Register
operand xRegI(rRegI reg) %{
  constraint(ALLOC_IN_RC(int_x_reg));
  match(reg);
  match(eAXRegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eDXRegI);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
operand eAXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  match(rRegI);

  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Special Registers
operand eBXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  match(rRegI);

  format %{ "EBX" %}
  interface(REG_INTER);
%}

operand eCXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  match(rRegI);

  format %{ "ECX" %}
  interface(REG_INTER);
%}

operand eDXRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edx_reg));
  match(reg);
  match(rRegI);

  format %{ "EDX" %}
  interface(REG_INTER);
%}

operand eDIRegI(xRegI reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  match(rRegI);

  format %{ "EDI" %}
  interface(REG_INTER);
%}

operand naxRegI() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegI);
  match(eCXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

operand nadxRegI() %{
  constraint(ALLOC_IN_RC(nadx_reg));
  match(RegI);
  match(eBXRegI);
  match(eCXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

operand ncxRegI() %{
  constraint(ALLOC_IN_RC(ncx_reg));
  match(RegI);
  match(eAXRegI);
  match(eDXRegI);
  match(eSIRegI);
  match(eDIRegI);

  format %{ %}
  interface(REG_INTER);
%}

// // This operand was used by cmpFastUnlock, but conflicted with 'object' reg
// //
operand eSIRegI(xRegI reg) %{
   constraint(ALLOC_IN_RC(esi_reg));
   match(reg);
   match(rRegI);

   format %{ "ESI" %}
   interface(REG_INTER);
%}

// Pointer Register
operand anyRegP() %{
  constraint(ALLOC_IN_RC(any_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);
  match(eRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand eRegP() %{
  constraint(ALLOC_IN_RC(int_reg));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// On windows95, EBP is not safe to use for implicit null tests.
operand eRegP_no_EBP() %{
  constraint(ALLOC_IN_RC(int_reg_no_rbp));
  match(RegP);
  match(eAXRegP);
  match(eBXRegP);
  match(eCXRegP);
  match(eDIRegP);

  op_cost(100);
  format %{ %}
  interface(REG_INTER);
%}

operand naxRegP() %{
  constraint(ALLOC_IN_RC(nax_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eCXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand nabxRegP() %{
  constraint(ALLOC_IN_RC(nabx_reg));
  match(RegP);
  match(eCXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

operand pRegP() %{
  constraint(ALLOC_IN_RC(p_reg));
  match(RegP);
  match(eBXRegP);
  match(eDXRegP);
  match(eSIRegP);
  match(eDIRegP);

  format %{ %}
  interface(REG_INTER);
%}

// Special Registers
// Return a pointer value
operand eAXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(eax_reg));
  match(reg);
  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Used in AtomicAdd
operand eBXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ebx_reg));
  match(reg);
  format %{ "EBX" %}
  interface(REG_INTER);
%}

// Tail-call (interprocedural jump) to interpreter
operand eCXRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(ecx_reg));
  match(reg);
  format %{ "ECX" %}
  interface(REG_INTER);
%}

operand eSIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);
  format %{ "ESI" %}
  interface(REG_INTER);
%}

// Used in rep stosw
operand eDIRegP(eRegP reg) %{
  constraint(ALLOC_IN_RC(edi_reg));
  match(reg);
  format %{ "EDI" %}
  interface(REG_INTER);
%}

operand eBPRegP() %{
  constraint(ALLOC_IN_RC(ebp_reg));
  match(RegP);
  format %{ "EBP" %}
  interface(REG_INTER);
%}

operand eRegL() %{
  constraint(ALLOC_IN_RC(long_reg));
  match(RegL);
  match(eADXRegL);

  format %{ %}
  interface(REG_INTER);
%}

operand eADXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(reg);

  format %{ "EDX:EAX" %}
  interface(REG_INTER);
%}

operand eBCXRegL( eRegL reg ) %{
  constraint(ALLOC_IN_RC(ebcx_reg));
  match(reg);

  format %{ "EBX:ECX" %}
  interface(REG_INTER);
%}

// Special case for integer high multiply
operand eADXRegL_low_only() %{
  constraint(ALLOC_IN_RC(eadx_reg));
  match(RegL);

  format %{ "EAX" %}
  interface(REG_INTER);
%}

// Flags register, used as output of compare instructions
operand eFlagsReg() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS" %}
  interface(REG_INTER);
%}

// Flags register, used as output of FLOATING POINT compare instructions
operand eFlagsRegU() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);

  format %{ "EFLAGS_U" %}
  interface(REG_INTER);
%}

operand eFlagsRegUCF() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  predicate(false);

  format %{ "EFLAGS_U_CF" %}
  interface(REG_INTER);
%}

// Condition Code Register used by long compare
operand flagsReg_long_LTGE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LTGE" %}
  interface(REG_INTER);
%}
operand flagsReg_long_EQNE() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_EQNE" %}
  interface(REG_INTER);
%}
operand flagsReg_long_LEGT() %{
  constraint(ALLOC_IN_RC(int_flags));
  match(RegFlags);
  format %{ "FLAGS_LEGT" %}
  interface(REG_INTER);
%}

// Float register operands
operand regDPR() %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_dbl_reg));
  match(RegD);
  match(regDPR1);
  match(regDPR2);
  format %{ %}
  interface(REG_INTER);
%}

operand regDPR1(regDPR reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_dbl_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

operand regDPR2(regDPR reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_dbl_reg1));
  match(reg);
  format %{ "FPR2" %}
  interface(REG_INTER);
%}

operand regnotDPR1(regDPR reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_dbl_notreg0));
  match(reg);
  format %{ %}
  interface(REG_INTER);
%}

// Float register operands
operand regFPR() %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_flt_reg));
  match(RegF);
  match(regFPR1);
  format %{ %}
  interface(REG_INTER);
%}

// Float register operands
operand regFPR1(regFPR reg) %{
  predicate( UseSSE < 2 );
  constraint(ALLOC_IN_RC(fp_flt_reg0));
  match(reg);
  format %{ "FPR1" %}
  interface(REG_INTER);
%}

// XMM Float register operands
operand regF() %{
  predicate( UseSSE>=1 );
  constraint(ALLOC_IN_RC(float_reg));
  match(RegF);
  format %{ %}
  interface(REG_INTER);
%}

// XMM Double register operands
operand regD() %{
  predicate( UseSSE>=2 );
  constraint(ALLOC_IN_RC(double_reg));
  match(RegD);
  format %{ %}
  interface(REG_INTER);
%}


//----------Memory Operands----------------------------------------------------
// Direct Memory Operand
operand direct(immP addr) %{
  match(addr);

  format %{ "[$addr]" %}
  interface(MEMORY_INTER) %{
    base(0xFFFFFFFF);
    index(0x4);
    scale(0x0);
    disp($addr);
  %}
%}

// Indirect Memory Operand
operand indirect(eRegP reg) %{
  constraint(ALLOC_IN_RC(int_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8(eRegP reg, immI8 off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32(eRegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32X(rRegI reg, immP off) %{
  match(AddP off reg);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset(eRegP reg, rRegI ireg, immI off) %{
  match(AddP (AddP reg ireg) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndex(eRegP reg, rRegI ireg) %{
  match(AddP reg ireg);

  op_cost(10);
  format %{"[$reg + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp(0x0);
  %}
%}

// // -------------------------------------------------------------------------
// // 486 architecture doesn't support "scale * index + offset" with out a base
// // -------------------------------------------------------------------------
// // Scaled Memory Operands
// // Indirect Memory Times Scale Plus Offset Operand
// operand indScaleOffset(immP off, rRegI ireg, immI2 scale) %{
//   match(AddP off (LShiftI ireg scale));
//
//   op_cost(10);
//   format %{"[$off + $ireg << $scale]" %}
//   interface(MEMORY_INTER) %{
//     base(0x4);
//     index($ireg);
//     scale($scale);
//     disp($off);
//   %}
// %}

// Indirect Memory Times Scale Plus Index Register
operand indIndexScale(eRegP reg, rRegI ireg, immI2 scale) %{
  match(AddP reg (LShiftI ireg scale));

  op_cost(10);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset(eRegP reg, immI off, rRegI ireg, immI2 scale) %{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(10);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}

//----------Load Long Memory Operands------------------------------------------
// The load-long idiom will use it's address expression again after loading
// the first word of the long.  If the load-long destination overlaps with
// registers used in the addressing expression, the 2nd half will be loaded
// from a clobbered address.  Fix this by requiring that load-long use
// address registers that do not overlap with the load-long target.

// load-long support
operand load_long_RegP() %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(RegP);
  match(eSIRegP);
  op_cost(100);
  format %{  %}
  interface(REG_INTER);
%}

// Indirect Memory Operand Long
operand load_long_indirect(load_long_RegP reg) %{
  constraint(ALLOC_IN_RC(esi_reg));
  match(reg);

  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand load_long_indOffset32(load_long_RegP reg, immI off) %{
  match(AddP reg off);

  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

opclass load_long_memory(load_long_indirect, load_long_indOffset32);


//----------Special Memory Operands--------------------------------------------
// Stack Slot Operand - This operand is used for loading and storing temporary
//                      values on the stack where a match requires a value to
//                      flow through memory.
operand stackSlotP(sRegP reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotI(sRegI reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotF(sRegF reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotD(sRegD reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

operand stackSlotL(sRegL reg) %{
  constraint(ALLOC_IN_RC(stack_slots));
  // No match rule because this operand is only generated in matching
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base(0x4);   // ESP
    index(0x4);  // No Index
    scale(0x0);  // No Scale
    disp($reg);  // Stack Offset
  %}
%}

//----------Memory Operands - Win95 Implicit Null Variants----------------
// Indirect Memory Operand
operand indirect_win95_safe(eRegP_no_EBP reg)
%{
  constraint(ALLOC_IN_RC(int_reg));
  match(reg);

  op_cost(100);
  format %{ "[$reg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp(0x0);
  %}
%}

// Indirect Memory Plus Short Offset Operand
operand indOffset8_win95_safe(eRegP_no_EBP reg, immI8 off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Long Offset Operand
operand indOffset32_win95_safe(eRegP_no_EBP reg, immI off)
%{
  match(AddP reg off);

  op_cost(100);
  format %{ "[$reg + $off]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index(0x4);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Plus Index Register Plus Offset Operand
operand indIndexOffset_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI off)
%{
  match(AddP (AddP reg ireg) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale(0x0);
    disp($off);
  %}
%}

// Indirect Memory Times Scale Plus Index Register
operand indIndexScale_win95_safe(eRegP_no_EBP reg, rRegI ireg, immI2 scale)
%{
  match(AddP reg (LShiftI ireg scale));

  op_cost(100);
  format %{"[$reg + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp(0x0);
  %}
%}

// Indirect Memory Times Scale Plus Index Register Plus Offset Operand
operand indIndexScaleOffset_win95_safe(eRegP_no_EBP reg, immI off, rRegI ireg, immI2 scale)
%{
  match(AddP (AddP reg (LShiftI ireg scale)) off);

  op_cost(100);
  format %{"[$reg + $off + $ireg << $scale]" %}
  interface(MEMORY_INTER) %{
    base($reg);
    index($ireg);
    scale($scale);
    disp($off);
  %}
%}

//----------Conditional Branch Operands----------------------------------------
// Comparison Op  - This is the operation of the comparison, and is limited to
//                  the following set of codes:
//                  L (<), LE (<=), G (>), GE (>=), E (==), NE (!=)
//
// Other attributes of the comparison, such as unsignedness, are specified
// by the comparison instruction that sets a condition code flags register.
// That result is represented by a flags operand whose subtype is appropriate
// to the unsignedness (etc.) of the comparison.
//
// Later, the instruction which matches both the Comparison Op (a Bool) and
// the flags (produced by the Cmp) specifies the coding of the comparison op
// by matching a specific subtype of Bool operand below, such as cmpOpU.

// Comparision Code
operand cmpOp() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0xC, "l");
    greater_equal(0xD, "ge");
    less_equal(0xE, "le");
    greater(0xF, "g");
    overflow(0x0, "o");
    no_overflow(0x1, "no");
  %}
%}

// Comparison Code, unsigned compare.  Used by FP also, with
// C2 (unordered) turned into GT or LT already.  The other bits
// C0 and C3 are turned into Carry & Zero flags.
operand cmpOpU() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
    overflow(0x0, "o");
    no_overflow(0x1, "no");
  %}
%}

// Floating comparisons that don't require any fixup for the unordered case
operand cmpOpUCF() %{
  match(Bool);
  predicate(n->as_Bool()->_test._test == BoolTest::lt ||
            n->as_Bool()->_test._test == BoolTest::ge ||
            n->as_Bool()->_test._test == BoolTest::le ||
            n->as_Bool()->_test._test == BoolTest::gt);
  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
    overflow(0x0, "o");
    no_overflow(0x1, "no");
  %}
%}


// Floating comparisons that can be fixed up with extra conditional jumps
operand cmpOpUCF2() %{
  match(Bool);
  predicate(n->as_Bool()->_test._test == BoolTest::ne ||
            n->as_Bool()->_test._test == BoolTest::eq);
  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0x2, "b");
    greater_equal(0x3, "nb");
    less_equal(0x6, "be");
    greater(0x7, "nbe");
    overflow(0x0, "o");
    no_overflow(0x1, "no");
  %}
%}

// Comparison Code for FP conditional move
operand cmpOp_fcmov() %{
  match(Bool);

  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
            n->as_Bool()->_test._test != BoolTest::no_overflow);
  format %{ "" %}
  interface(COND_INTER) %{
    equal        (0x0C8);
    not_equal    (0x1C8);
    less         (0x0C0);
    greater_equal(0x1C0);
    less_equal   (0x0D0);
    greater      (0x1D0);
    overflow(0x0, "o"); // not really supported by the instruction
    no_overflow(0x1, "no"); // not really supported by the instruction
  %}
%}

// Comparision Code used in long compares
operand cmpOp_commute() %{
  match(Bool);

  format %{ "" %}
  interface(COND_INTER) %{
    equal(0x4, "e");
    not_equal(0x5, "ne");
    less(0xF, "g");
    greater_equal(0xE, "le");
    less_equal(0xD, "ge");
    greater(0xC, "l");
    overflow(0x0, "o");
    no_overflow(0x1, "no");
  %}
%}

//----------OPERAND CLASSES----------------------------------------------------
// Operand Classes are groups of operands that are used as to simplify
// instruction definitions by not requiring the AD writer to specify separate
// instructions for every form of operand when the instruction accepts
// multiple operand types with the same basic encoding and format.  The classic
// case of this is memory operands.

opclass memory(direct, indirect, indOffset8, indOffset32, indOffset32X, indIndexOffset,
               indIndex, indIndexScale, indIndexScaleOffset);

// Long memory operations are encoded in 2 instructions and a +4 offset.
// This means some kind of offset is always required and you cannot use
// an oop as the offset (done when working on static globals).
opclass long_memory(direct, indirect, indOffset8, indOffset32, indIndexOffset,
                    indIndex, indIndexScale, indIndexScaleOffset);


//----------PIPELINE-----------------------------------------------------------
// Rules which define the behavior of the target architectures pipeline.
pipeline %{

//----------ATTRIBUTES---------------------------------------------------------
attributes %{
  variable_size_instructions;        // Fixed size instructions
  max_instructions_per_bundle = 3;   // Up to 3 instructions per bundle
  instruction_unit_size = 1;         // An instruction is 1 bytes long
  instruction_fetch_unit_size = 16;  // The processor fetches one line
  instruction_fetch_units = 1;       // of 16 bytes

  // List of nop instructions
  nops( MachNop );
%}

//----------RESOURCES----------------------------------------------------------
// Resources are the functional units available to the machine

// Generic P2/P3 pipeline
// 3 decoders, only D0 handles big operands; a "bundle" is the limit of
// 3 instructions decoded per cycle.
// 2 load/store ops per cycle, 1 branch, 1 FPU,
// 2 ALU op, only ALU0 handles mul/div instructions.
resources( D0, D1, D2, DECODE = D0 | D1 | D2,
           MS0, MS1, MEM = MS0 | MS1,
           BR, FPU,
           ALU0, ALU1, ALU = ALU0 | ALU1 );

//----------PIPELINE DESCRIPTION-----------------------------------------------
// Pipeline Description specifies the stages in the machine's pipeline

// Generic P2/P3 pipeline
pipe_desc(S0, S1, S2, S3, S4, S5);

//----------PIPELINE CLASSES---------------------------------------------------
// Pipeline Classes describe the stages in which input and output are
// referenced by the hardware pipeline.

// Naming convention: ialu or fpu
// Then: _reg
// Then: _reg if there is a 2nd register
// Then: _long if it's a pair of instructions implementing a long
// Then: _fat if it requires the big decoder
//   Or: _mem if it requires the big decoder and a memory unit.

// Integer ALU reg operation
pipe_class ialu_reg(rRegI dst) %{
    single_instruction;
    dst    : S4(write);
    dst    : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Long ALU reg operation
pipe_class ialu_reg_long(eRegL dst) %{
    instruction_count(2);
    dst    : S4(write);
    dst    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg operation using big decoder
pipe_class ialu_reg_fat(rRegI dst) %{
    single_instruction;
    dst    : S4(write);
    dst    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S3;        // any alu
%}

// Long ALU reg operation using big decoder
pipe_class ialu_reg_long_fat(eRegL dst) %{
    instruction_count(2);
    dst    : S4(write);
    dst    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S3(2);     // any 2 alus
%}

// Integer ALU reg-reg operation
pipe_class ialu_reg_reg(rRegI dst, rRegI src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Long ALU reg-reg operation
pipe_class ialu_reg_reg_long(eRegL dst, eRegL src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg-reg operation
pipe_class ialu_reg_reg_fat(rRegI dst, memory src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S3;        // any alu
%}

// Long ALU reg-reg operation
pipe_class ialu_reg_reg_long_fat(eRegL dst, eRegL src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S3(2);     // both alus
%}

// Integer ALU reg-mem operation
pipe_class ialu_reg_mem(rRegI dst, memory mem) %{
    single_instruction;
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;        // any mem
%}

// Long ALU reg-mem operation
pipe_class ialu_reg_long_mem(eRegL dst, load_long_memory mem) %{
    instruction_count(2);
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S4(2);     // any 2 alus
    MEM    : S3(2);     // both mems
%}

// Integer mem operation (prefetch)
pipe_class ialu_mem(memory mem)
%{
    single_instruction;
    mem    : S3(read);
    D0     : S0;        // big decoder only
    MEM    : S3;        // any mem
%}

// Integer Store to Memory
pipe_class ialu_mem_reg(memory mem, rRegI src) %{
    single_instruction;
    mem    : S3(read);
    src    : S5(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Long Store to Memory
pipe_class ialu_mem_long_reg(memory mem, eRegL src) %{
    instruction_count(2);
    mem    : S3(read);
    src    : S5(read);
    D0     : S0(2);     // big decoder only; twice
    ALU    : S4(2);     // any 2 alus
    MEM    : S3(2);     // Both mems
%}

// Integer Store to Memory
pipe_class ialu_mem_imm(memory mem) %{
    single_instruction;
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Integer ALU0 reg-reg operation
pipe_class ialu_reg_reg_alu0(rRegI dst, rRegI src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    D0     : S0;        // Big decoder only
    ALU0   : S3;        // only alu0
%}

// Integer ALU0 reg-mem operation
pipe_class ialu_reg_mem_alu0(rRegI dst, memory mem) %{
    single_instruction;
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    ALU0   : S4;        // ALU0 only
    MEM    : S3;        // any mem
%}

// Integer ALU reg-reg operation
pipe_class ialu_cr_reg_reg(eFlagsReg cr, rRegI src1, rRegI src2) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Integer ALU reg-imm operation
pipe_class ialu_cr_reg_imm(eFlagsReg cr, rRegI src1) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    DECODE : S0;        // any decoder
    ALU    : S3;        // any alu
%}

// Integer ALU reg-mem operation
pipe_class ialu_cr_reg_mem(eFlagsReg cr, rRegI src1, memory src2) %{
    single_instruction;
    cr     : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    D0     : S0;        // big decoder only
    ALU    : S4;        // any alu
    MEM    : S3;
%}

// Conditional move reg-reg
pipe_class pipe_cmplt( rRegI p, rRegI q, rRegI y ) %{
    instruction_count(4);
    y      : S4(read);
    q      : S3(read);
    p      : S3(read);
    DECODE : S0(4);     // any decoder
%}

// Conditional move reg-reg
pipe_class pipe_cmov_reg( rRegI dst, rRegI src, eFlagsReg cr ) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
%}

// Conditional move reg-mem
pipe_class pipe_cmov_mem( eFlagsReg cr, rRegI dst, memory src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
    MEM    : S3;
%}

// Conditional move reg-reg long
pipe_class pipe_cmov_reg_long( eFlagsReg cr, eRegL dst, eRegL src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0(2);     // any 2 decoders
%}

// Conditional move double reg-reg
pipe_class pipe_cmovDPR_reg( eFlagsReg cr, regDPR1 dst, regDPR src) %{
    single_instruction;
    dst    : S4(write);
    src    : S3(read);
    cr     : S3(read);
    DECODE : S0;        // any decoder
%}

// Float reg-reg operation
pipe_class fpu_reg(regDPR dst) %{
    instruction_count(2);
    dst    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    FPU    : S3;
%}

// Float reg-reg operation
pipe_class fpu_reg_reg(regDPR dst, regDPR src) %{
    instruction_count(2);
    dst    : S4(write);
    src    : S3(read);
    DECODE : S0(2);     // any 2 decoders
    FPU    : S3;
%}

// Float reg-reg operation
pipe_class fpu_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2) %{
    instruction_count(3);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    DECODE : S0(3);     // any 3 decoders
    FPU    : S3(2);
%}

// Float reg-reg operation
pipe_class fpu_reg_reg_reg_reg(regDPR dst, regDPR src1, regDPR src2, regDPR src3) %{
    instruction_count(4);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    src3   : S3(read);
    DECODE : S0(4);     // any 3 decoders
    FPU    : S3(2);
%}

// Float reg-reg operation
pipe_class fpu_reg_mem_reg_reg(regDPR dst, memory src1, regDPR src2, regDPR src3) %{
    instruction_count(4);
    dst    : S4(write);
    src1   : S3(read);
    src2   : S3(read);
    src3   : S3(read);
    DECODE : S1(3);     // any 3 decoders
    D0     : S0;        // Big decoder only
    FPU    : S3(2);
    MEM    : S3;
%}

// Float reg-mem operation
pipe_class fpu_reg_mem(regDPR dst, memory mem) %{
    instruction_count(2);
    dst    : S5(write);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    DECODE : S1;        // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float reg-mem operation
pipe_class fpu_reg_reg_mem(regDPR dst, regDPR src1, memory mem) %{
    instruction_count(3);
    dst    : S5(write);
    src1   : S3(read);
    mem    : S3(read);
    D0     : S0;        // big decoder only
    DECODE : S1(2);     // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float mem-reg operation
pipe_class fpu_mem_reg(memory mem, regDPR src) %{
    instruction_count(2);
    src    : S5(read);
    mem    : S3(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S1;        // big decoder only
    FPU    : S4;
    MEM    : S3;        // any mem
%}

pipe_class fpu_mem_reg_reg(memory mem, regDPR src1, regDPR src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    mem    : S3(read);
    DECODE : S0(2);     // any decoder for FPU PUSH
    D0     : S1;        // big decoder only
    FPU    : S4;
    MEM    : S3;        // any mem
%}

pipe_class fpu_mem_reg_mem(memory mem, regDPR src1, memory src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    mem    : S4(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S0(2);     // big decoder only
    FPU    : S4;
    MEM    : S3(2);     // any mem
%}

pipe_class fpu_mem_mem(memory dst, memory src1) %{
    instruction_count(2);
    src1   : S3(read);
    dst    : S4(read);
    D0     : S0(2);     // big decoder only
    MEM    : S3(2);     // any mem
%}

pipe_class fpu_mem_mem_mem(memory dst, memory src1, memory src2) %{
    instruction_count(3);
    src1   : S3(read);
    src2   : S3(read);
    dst    : S4(read);
    D0     : S0(3);     // big decoder only
    FPU    : S4;
    MEM    : S3(3);     // any mem
%}

pipe_class fpu_mem_reg_con(memory mem, regDPR src1) %{
    instruction_count(3);
    src1   : S4(read);
    mem    : S4(read);
    DECODE : S0;        // any decoder for FPU PUSH
    D0     : S0(2);     // big decoder only
    FPU    : S4;
    MEM    : S3(2);     // any mem
%}

// Float load constant
pipe_class fpu_reg_con(regDPR dst) %{
    instruction_count(2);
    dst    : S5(write);
    D0     : S0;        // big decoder only for the load
    DECODE : S1;        // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// Float load constant
pipe_class fpu_reg_reg_con(regDPR dst, regDPR src) %{
    instruction_count(3);
    dst    : S5(write);
    src    : S3(read);
    D0     : S0;        // big decoder only for the load
    DECODE : S1(2);     // any decoder for FPU POP
    FPU    : S4;
    MEM    : S3;        // any mem
%}

// UnConditional branch
pipe_class pipe_jmp( label labl ) %{
    single_instruction;
    BR   : S3;
%}

// Conditional branch
pipe_class pipe_jcc( cmpOp cmp, eFlagsReg cr, label labl ) %{
    single_instruction;
    cr    : S1(read);
    BR    : S3;
%}

// Allocation idiom
pipe_class pipe_cmpxchg( eRegP dst, eRegP heap_ptr ) %{
    instruction_count(1); force_serialization;
    fixed_latency(6);
    heap_ptr : S3(read);
    DECODE   : S0(3);
    D0       : S2;
    MEM      : S3;
    ALU      : S3(2);
    dst      : S5(write);
    BR       : S5;
%}

// Generic big/slow expanded idiom
pipe_class pipe_slow(  ) %{
    instruction_count(10); multiple_bundles; force_serialization;
    fixed_latency(100);
    D0  : S0(2);
    MEM : S3(2);
%}

// The real do-nothing guy
pipe_class empty( ) %{
    instruction_count(0);
%}

// Define the class for the Nop node
define %{
   MachNop = empty;
%}

%}

//----------INSTRUCTIONS-------------------------------------------------------
//
// match      -- States which machine-independent subtree may be replaced
//               by this instruction.
// ins_cost   -- The estimated cost of this instruction is used by instruction
//               selection to identify a minimum cost tree of machine
//               instructions that matches a tree of machine-independent
//               instructions.
// format     -- A string providing the disassembly for this instruction.
//               The value of an instruction's operand may be inserted
//               by referring to it with a '$' prefix.
// opcode     -- Three instruction opcodes may be provided.  These are referred
//               to within an encode class as $primary, $secondary, and $tertiary
//               respectively.  The primary opcode is commonly used to
//               indicate the type of machine instruction, while secondary
//               and tertiary are often used for prefix options or addressing
//               modes.
// ins_encode -- A list of encode classes with parameters. The encode class
//               name must have been defined in an 'enc_class' specification
//               in the encode section of the architecture description.

//----------BSWAP-Instruction--------------------------------------------------
instruct bytes_reverse_int(rRegI dst) %{
  match(Set dst (ReverseBytesI dst));

  format %{ "BSWAP  $dst" %}
  opcode(0x0F, 0xC8);
  ins_encode( OpcP, OpcSReg(dst) );
  ins_pipe( ialu_reg );
%}

instruct bytes_reverse_long(eRegL dst) %{
  match(Set dst (ReverseBytesL dst));

  format %{ "BSWAP  $dst.lo\n\t"
            "BSWAP  $dst.hi\n\t"
            "XCHG   $dst.lo $dst.hi" %}

  ins_cost(125);
  ins_encode( bswap_long_bytes(dst) );
  ins_pipe( ialu_reg_reg);
%}

instruct bytes_reverse_unsigned_short(rRegI dst, eFlagsReg cr) %{
  match(Set dst (ReverseBytesUS dst));
  effect(KILL cr);

  format %{ "BSWAP  $dst\n\t" 
            "SHR    $dst,16\n\t" %}
  ins_encode %{
    __ bswapl($dst$$Register);
    __ shrl($dst$$Register, 16); 
  %}
  ins_pipe( ialu_reg );
%}

instruct bytes_reverse_short(rRegI dst, eFlagsReg cr) %{
  match(Set dst (ReverseBytesS dst));
  effect(KILL cr);

  format %{ "BSWAP  $dst\n\t" 
            "SAR    $dst,16\n\t" %}
  ins_encode %{
    __ bswapl($dst$$Register);
    __ sarl($dst$$Register, 16); 
  %}
  ins_pipe( ialu_reg );
%}


//---------- Zeros Count Instructions ------------------------------------------

instruct countLeadingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
  predicate(UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosI src));
  effect(KILL cr);

  format %{ "LZCNT  $dst, $src\t# count leading zeros (int)" %}
  ins_encode %{
    __ lzcntl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg);
%}

instruct countLeadingZerosI_bsr(rRegI dst, rRegI src, eFlagsReg cr) %{
  predicate(!UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosI src));
  effect(KILL cr);

  format %{ "BSR    $dst, $src\t# count leading zeros (int)\n\t"
            "JNZ    skip\n\t"
            "MOV    $dst, -1\n"
      "skip:\n\t"
            "NEG    $dst\n\t"
            "ADD    $dst, 31" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label skip;
    __ bsrl(Rdst, Rsrc);
    __ jccb(Assembler::notZero, skip);
    __ movl(Rdst, -1);
    __ bind(skip);
    __ negl(Rdst);
    __ addl(Rdst, BitsPerInt - 1);
  %}
  ins_pipe(ialu_reg);
%}

instruct countLeadingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
  predicate(UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "LZCNT  $dst, $src.hi\t# count leading zeros (long)\n\t"
            "JNC    done\n\t"
            "LZCNT  $dst, $src.lo\n\t"
            "ADD    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label done;
    __ lzcntl(Rdst, HIGH_FROM_LOW(Rsrc));
    __ jccb(Assembler::carryClear, done);
    __ lzcntl(Rdst, Rsrc);
    __ addl(Rdst, BitsPerInt);
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}

instruct countLeadingZerosL_bsr(rRegI dst, eRegL src, eFlagsReg cr) %{
  predicate(!UseCountLeadingZerosInstruction);
  match(Set dst (CountLeadingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "BSR    $dst, $src.hi\t# count leading zeros (long)\n\t"
            "JZ     msw_is_zero\n\t"
            "ADD    $dst, 32\n\t"
            "JMP    not_zero\n"
      "msw_is_zero:\n\t"
            "BSR    $dst, $src.lo\n\t"
            "JNZ    not_zero\n\t"
            "MOV    $dst, -1\n"
      "not_zero:\n\t"
            "NEG    $dst\n\t"
            "ADD    $dst, 63\n" %}
 ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label msw_is_zero;
    Label not_zero;
    __ bsrl(Rdst, HIGH_FROM_LOW(Rsrc));
    __ jccb(Assembler::zero, msw_is_zero);
    __ addl(Rdst, BitsPerInt);
    __ jmpb(not_zero);
    __ bind(msw_is_zero);
    __ bsrl(Rdst, Rsrc);
    __ jccb(Assembler::notZero, not_zero);
    __ movl(Rdst, -1);
    __ bind(not_zero);
    __ negl(Rdst);
    __ addl(Rdst, BitsPerLong - 1);
  %}
  ins_pipe(ialu_reg);
%}

instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (CountTrailingZerosI src));
  effect(KILL cr);

  format %{ "BSF    $dst, $src\t# count trailing zeros (int)\n\t"
            "JNZ    done\n\t"
            "MOV    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Label done;
    __ bsfl(Rdst, $src$$Register);
    __ jccb(Assembler::notZero, done);
    __ movl(Rdst, BitsPerInt);
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}

instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (CountTrailingZerosL src));
  effect(TEMP dst, KILL cr);

  format %{ "BSF    $dst, $src.lo\t# count trailing zeros (long)\n\t"
            "JNZ    done\n\t"
            "BSF    $dst, $src.hi\n\t"
            "JNZ    msw_not_zero\n\t"
            "MOV    $dst, 32\n"
      "msw_not_zero:\n\t"
            "ADD    $dst, 32\n"
      "done:" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    Register Rsrc = $src$$Register;
    Label msw_not_zero;
    Label done;
    __ bsfl(Rdst, Rsrc);
    __ jccb(Assembler::notZero, done);
    __ bsfl(Rdst, HIGH_FROM_LOW(Rsrc));
    __ jccb(Assembler::notZero, msw_not_zero);
    __ movl(Rdst, BitsPerInt);
    __ bind(msw_not_zero);
    __ addl(Rdst, BitsPerInt);
    __ bind(done);
  %}
  ins_pipe(ialu_reg);
%}


//---------- Population Count Instructions -------------------------------------

instruct popCountI(rRegI dst, rRegI src, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountI src));
  effect(KILL cr);

  format %{ "POPCNT $dst, $src" %}
  ins_encode %{
    __ popcntl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg);
%}

instruct popCountI_mem(rRegI dst, memory mem, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountI (LoadI mem)));
  effect(KILL cr);

  format %{ "POPCNT $dst, $mem" %}
  ins_encode %{
    __ popcntl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg);
%}

// Note: Long.bitCount(long) returns an int.
instruct popCountL(rRegI dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountL src));
  effect(KILL cr, TEMP tmp, TEMP dst);

  format %{ "POPCNT $dst, $src.lo\n\t"
            "POPCNT $tmp, $src.hi\n\t"
            "ADD    $dst, $tmp" %}
  ins_encode %{
    __ popcntl($dst$$Register, $src$$Register);
    __ popcntl($tmp$$Register, HIGH_FROM_LOW($src$$Register));
    __ addl($dst$$Register, $tmp$$Register);
  %}
  ins_pipe(ialu_reg);
%}

// Note: Long.bitCount(long) returns an int.
instruct popCountL_mem(rRegI dst, memory mem, rRegI tmp, eFlagsReg cr) %{
  predicate(UsePopCountInstruction);
  match(Set dst (PopCountL (LoadL mem)));
  effect(KILL cr, TEMP tmp, TEMP dst);

  format %{ "POPCNT $dst, $mem\n\t"
            "POPCNT $tmp, $mem+4\n\t"
            "ADD    $dst, $tmp" %}
  ins_encode %{
    //__ popcntl($dst$$Register, $mem$$Address$$first);
    //__ popcntl($tmp$$Register, $mem$$Address$$second);
    __ popcntl($dst$$Register, Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none));
    __ popcntl($tmp$$Register, Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none));
    __ addl($dst$$Register, $tmp$$Register);
  %}
  ins_pipe(ialu_reg);
%}


//----------Load/Store/Move Instructions---------------------------------------
//----------Load Instructions--------------------------------------------------
// Load Byte (8bit signed)
instruct loadB(xRegI dst, memory mem) %{
  match(Set dst (LoadB mem));

  ins_cost(125);
  format %{ "MOVSX8 $dst,$mem\t# byte" %}

  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Byte (8bit signed) into Long Register
instruct loadB2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadB mem)));
  effect(KILL cr);

  ins_cost(375);
  format %{ "MOVSX8 $dst.lo,$mem\t# byte -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,7" %}

  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 7); // 24+1 MSB are already signed extended.
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Byte (8bit UNsigned)
instruct loadUB(xRegI dst, memory mem) %{
  match(Set dst (LoadUB mem));

  ins_cost(125);
  format %{ "MOVZX8 $dst,$mem\t# ubyte -> int" %}

  ins_encode %{
    __ movzbl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Byte (8 bit UNsigned) into Long Register
instruct loadUB2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadUB mem)));
  effect(KILL cr);

  ins_cost(250);
  format %{ "MOVZX8 $dst.lo,$mem\t# ubyte -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Byte (8 bit UNsigned) with mask into Long Register
instruct loadUB2L_immI8(eRegL dst, memory mem, immI8 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUB mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX8 $dst.lo,$mem\t# ubyte & 8-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Short (16bit signed)
instruct loadS(rRegI dst, memory mem) %{
  match(Set dst (LoadS mem));

  ins_cost(125);
  format %{ "MOVSX  $dst,$mem\t# short" %}

  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Short (16 bit signed) to Byte (8 bit signed)
instruct loadS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# short -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Short (16bit signed) into Long Register
instruct loadS2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadS mem)));
  effect(KILL cr);

  ins_cost(375);
  format %{ "MOVSX  $dst.lo,$mem\t# short -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,15" %}

  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 15); // 16+1 MSB are already signed extended.
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16bit unsigned)
instruct loadUS(rRegI dst, memory mem) %{
  match(Set dst (LoadUS mem));

  ins_cost(125);
  format %{ "MOVZX  $dst,$mem\t# ushort/char -> int" %}

  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed)
instruct loadUS2B(rRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# ushort -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) into Long Register
instruct loadUS2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadUS mem)));
  effect(KILL cr);

  ins_cost(250);
  format %{ "MOVZX  $dst.lo,$mem\t# ushort/char -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);
    __ xorl(HIGH_FROM_LOW($dst$$Register), HIGH_FROM_LOW($dst$$Register));
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) with mask 0xFF into Long Register
instruct loadUS2L_immI_255(eRegL dst, memory mem, immI_255 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX8 $dst.lo,$mem\t# ushort/char & 0xFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Short/Char (16 bit UNsigned) with a 16-bit mask into Long Register
instruct loadUS2L_immI16(eRegL dst, memory mem, immI16 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadUS mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX  $dst.lo, $mem\t# ushort/char & 16-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzwl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer
instruct loadI(rRegI dst, memory mem) %{
  match(Set dst (LoadI mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem\t# int" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Byte (8 bit signed)
instruct loadI2B(rRegI dst, memory mem, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# int -> byte" %}
  ins_encode %{
    __ movsbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Unsigned Byte (8 bit UNsigned)
instruct loadI2UB(rRegI dst, memory mem, immI_255 mask) %{
  match(Set dst (AndI (LoadI mem) mask));

  ins_cost(125);
  format %{ "MOVZX  $dst, $mem\t# int -> ubyte" %}
  ins_encode %{
    __ movzbl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Short (16 bit signed)
instruct loadI2S(rRegI dst, memory mem, immI_16 sixteen) %{
  match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen));

  ins_cost(125);
  format %{ "MOVSX  $dst, $mem\t# int -> short" %}
  ins_encode %{
    __ movswl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer (32 bit signed) to Unsigned Short/Char (16 bit UNsigned)
instruct loadI2US(rRegI dst, memory mem, immI_65535 mask) %{
  match(Set dst (AndI (LoadI mem) mask));

  ins_cost(125);
  format %{ "MOVZX  $dst, $mem\t# int -> ushort/char" %}
  ins_encode %{
    __ movzwl($dst$$Register, $mem$$Address);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer into Long Register
instruct loadI2L(eRegL dst, memory mem, eFlagsReg cr) %{
  match(Set dst (ConvI2L (LoadI mem)));
  effect(KILL cr);

  ins_cost(375);
  format %{ "MOV    $dst.lo,$mem\t# int -> long\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "SAR    $dst.hi,31" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
    __ movl(HIGH_FROM_LOW($dst$$Register), $dst$$Register); // This is always a different register.
    __ sarl(HIGH_FROM_LOW($dst$$Register), 31);
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Integer with mask 0xFF into Long Register
instruct loadI2L_immI_255(eRegL dst, memory mem, immI_255 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX8 $dst.lo,$mem\t# int & 0xFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzbl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer with mask 0xFFFF into Long Register
instruct loadI2L_immI_65535(eRegL dst, memory mem, immI_65535 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOVZX  $dst.lo,$mem\t# int & 0xFFFF -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movzwl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Integer with 31-bit mask into Long Register
instruct loadI2L_immU31(eRegL dst, memory mem, immU31 mask, eFlagsReg cr) %{
  match(Set dst (ConvI2L (AndI (LoadI mem) mask)));
  effect(KILL cr);

  format %{ "MOV    $dst.lo,$mem\t# int & 31-bit mask -> long\n\t"
            "XOR    $dst.hi,$dst.hi\n\t"
            "AND    $dst.lo,$mask" %}
  ins_encode %{
    Register Rdst = $dst$$Register;
    __ movl(Rdst, $mem$$Address);
    __ xorl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rdst));
    __ andl(Rdst, $mask$$constant);
  %}
  ins_pipe(ialu_reg_mem);
%}

// Load Unsigned Integer into Long Register
instruct loadUI2L(eRegL dst, memory mem, immL_32bits mask, eFlagsReg cr) %{
  match(Set dst (AndL (ConvI2L (LoadI mem)) mask));
  effect(KILL cr);

  ins_cost(250);
  format %{ "MOV    $dst.lo,$mem\t# uint -> long\n\t"
            "XOR    $dst.hi,$dst.hi" %}

  ins_encode %{
    __ movl($dst$$Register, $mem$$Address);
    __ xorl(HIGH_FROM_LOW($dst$$Register), HIGH_FROM_LOW($dst$$Register));
  %}

  ins_pipe(ialu_reg_mem);
%}

// Load Long.  Cannot clobber address while loading, so restrict address
// register to ESI
instruct loadL(eRegL dst, load_long_memory mem) %{
  predicate(!((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));

  ins_cost(250);
  format %{ "MOV    $dst.lo,$mem\t# long\n\t"
            "MOV    $dst.hi,$mem+4" %}

  ins_encode %{
    Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none);
    Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none);
    __ movl($dst$$Register, Amemlo);
    __ movl(HIGH_FROM_LOW($dst$$Register), Amemhi);
  %}

  ins_pipe(ialu_reg_long_mem);
%}

// Volatile Load Long.  Must be atomic, so do 64-bit FILD
// then store it down to the stack and reload on the int
// side.
instruct loadL_volatile(stackSlotL dst, memory mem) %{
  predicate(UseSSE<=1 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));

  ins_cost(200);
  format %{ "FILD   $mem\t# Atomic volatile long load\n\t"
            "FISTp  $dst" %}
  ins_encode(enc_loadL_volatile(mem,dst));
  ins_pipe( fpu_reg_mem );
%}

instruct loadLX_volatile(stackSlotL dst, memory mem, regD tmp) %{
  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));
  effect(TEMP tmp);
  ins_cost(180);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVSD  $dst,$tmp" %}
  ins_encode %{
    __ movdbl($tmp$$XMMRegister, $mem$$Address);
    __ movdbl(Address(rsp, $dst$$disp), $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadLX_reg_volatile(eRegL dst, memory mem, regD tmp) %{
  predicate(UseSSE>=2 && ((LoadLNode*)n)->require_atomic_access());
  match(Set dst (LoadL mem));
  effect(TEMP tmp);
  ins_cost(160);
  format %{ "MOVSD  $tmp,$mem\t# Atomic volatile long load\n\t"
            "MOVD   $dst.lo,$tmp\n\t"
            "PSRLQ  $tmp,32\n\t"
            "MOVD   $dst.hi,$tmp" %}
  ins_encode %{
    __ movdbl($tmp$$XMMRegister, $mem$$Address);
    __ movdl($dst$$Register, $tmp$$XMMRegister);
    __ psrlq($tmp$$XMMRegister, 32);
    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Load Range
instruct loadRange(rRegI dst, memory mem) %{
  match(Set dst (LoadRange mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}


// Load Pointer
instruct loadP(eRegP dst, memory mem) %{
  match(Set dst (LoadP mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Klass Pointer
instruct loadKlass(eRegP dst, memory mem) %{
  match(Set dst (LoadKlass mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Load Double
instruct loadDPR(regDPR dst, memory mem) %{
  predicate(UseSSE<=1);
  match(Set dst (LoadD mem));

  ins_cost(150);
  format %{ "FLD_D  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_DPR(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Double to XMM
instruct loadD(regD dst, memory mem) %{
  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
  match(Set dst (LoadD mem));
  ins_cost(145);
  format %{ "MOVSD  $dst,$mem" %}
  ins_encode %{
    __ movdbl ($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

instruct loadD_partial(regD dst, memory mem) %{
  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
  match(Set dst (LoadD mem));
  ins_cost(145);
  format %{ "MOVLPD $dst,$mem" %}
  ins_encode %{
    __ movdbl ($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Load to XMM register (single-precision floating point)
// MOVSS instruction
instruct loadF(regF dst, memory mem) %{
  predicate(UseSSE>=1);
  match(Set dst (LoadF mem));
  ins_cost(145);
  format %{ "MOVSS  $dst,$mem" %}
  ins_encode %{
    __ movflt ($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Load Float
instruct loadFPR(regFPR dst, memory mem) %{
  predicate(UseSSE==0);
  match(Set dst (LoadF mem));

  ins_cost(150);
  format %{ "FLD_S  ST,$mem\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Effective Address
instruct leaP8(eRegP dst, indOffset8 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaP32(eRegP dst, indOffset32 mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxOff(eRegP dst, indIndexOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxScale(eRegP dst, indIndexScale mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

instruct leaPIdxScaleOff(eRegP dst, indIndexScaleOffset mem) %{
  match(Set dst mem);

  ins_cost(110);
  format %{ "LEA    $dst,$mem" %}
  opcode(0x8D);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_reg_fat );
%}

// Load Constant
instruct loadConI(rRegI dst, immI src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  ins_encode( LdImmI(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Load Constant zero
instruct loadConI0(rRegI dst, immI0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

  ins_cost(50);
  format %{ "XOR    $dst,$dst" %}
  opcode(0x33);  /* + rd */
  ins_encode( OpcP, RegReg( dst, dst ) );
  ins_pipe( ialu_reg );
%}

instruct loadConP(eRegP dst, immP src) %{
  match(Set dst src);

  format %{ "MOV    $dst,$src" %}
  opcode(0xB8);  /* + rd */
  ins_encode( LdImmP(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

instruct loadConL(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(200);
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "MOV    $dst.hi,$src.hi" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src), LdImmL_Hi(dst, src) );
  ins_pipe( ialu_reg_long_fat );
%}

instruct loadConL0(eRegL dst, immL0 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);
  ins_cost(150);
  format %{ "XOR    $dst.lo,$dst.lo\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33,0x33);
  ins_encode( RegReg_Lo(dst,dst), RegReg_Hi(dst, dst) );
  ins_pipe( ialu_reg_long );
%}

// The instruction usage is guarded by predicate in operand immFPR().
instruct loadConFPR(regFPR dst, immFPR con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "FLD_S  ST,[$constantaddress]\t# load from constant table: float=$con\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fld_s($constantaddress($con));
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immFPR0().
instruct loadConFPR0(regFPR dst, immFPR0 con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "FLDZ   ST\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fldz();
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immFPR1().
instruct loadConFPR1(regFPR dst, immFPR1 con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "FLD1   ST\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fld1();
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immF().
instruct loadConF(regF dst, immF con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "MOVSS  $dst,[$constantaddress]\t# load from constant table: float=$con" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// The instruction usage is guarded by predicate in operand immF0().
instruct loadConF0(regF dst, immF0 src) %{
  match(Set dst src);
  ins_cost(100);
  format %{ "XORPS  $dst,$dst\t# float 0.0" %}
  ins_encode %{
    __ xorps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow);
%}

// The instruction usage is guarded by predicate in operand immDPR().
instruct loadConDPR(regDPR dst, immDPR con) %{
  match(Set dst con);
  ins_cost(125);

  format %{ "FLD_D  ST,[$constantaddress]\t# load from constant table: double=$con\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fld_d($constantaddress($con));
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immDPR0().
instruct loadConDPR0(regDPR dst, immDPR0 con) %{
  match(Set dst con);
  ins_cost(125);

  format %{ "FLDZ   ST\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fldz();
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immDPR1().
instruct loadConDPR1(regDPR dst, immDPR1 con) %{
  match(Set dst con);
  ins_cost(125);

  format %{ "FLD1   ST\n\t"
            "FSTP   $dst" %}
  ins_encode %{
    __ fld1();
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_con);
%}

// The instruction usage is guarded by predicate in operand immD().
instruct loadConD(regD dst, immD con) %{
  match(Set dst con);
  ins_cost(125);
  format %{ "MOVSD  $dst,[$constantaddress]\t# load from constant table: double=$con" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, $constantaddress($con));
  %}
  ins_pipe(pipe_slow);
%}

// The instruction usage is guarded by predicate in operand immD0().
instruct loadConD0(regD dst, immD0 src) %{
  match(Set dst src);
  ins_cost(100);
  format %{ "XORPD  $dst,$dst\t# double 0.0" %}
  ins_encode %{
    __ xorpd ($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Load Stack Slot
instruct loadSSI(rRegI dst, stackSlotI src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );
%}

instruct loadSSL(eRegL dst, stackSlotL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem( dst, src ), OpcS, RegMem_Hi( dst, src ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Load Stack Slot
instruct loadSSP(eRegP dst, stackSlotP src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "MOV    $dst,$src" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,src));
  ins_pipe( ialu_reg_mem );
%}

// Load Stack Slot
instruct loadSSF(regFPR dst, stackSlotF src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Load Stack Slot
instruct loadSSD(regDPR dst, stackSlotD src) %{
  match(Set dst src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDD);               /* DD /0, FLD m64real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_DPR(dst) );
  ins_pipe( fpu_reg_mem );
%}

// Prefetch instructions.
// Must be safe to execute with invalid address (cannot fault).

instruct prefetchr0( memory mem ) %{
  predicate(UseSSE==0 && !VM_Version::supports_3dnow_prefetch());
  match(PrefetchRead mem);
  ins_cost(0);
  size(0);
  format %{ "PREFETCHR (non-SSE is empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

instruct prefetchr( memory mem ) %{
  predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch() || ReadPrefetchInstr==3);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHR $mem\t! Prefetch into level 1 cache for read" %}
  ins_encode %{
    __ prefetchr($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchrNTA( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==0);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for read" %}
  ins_encode %{
    __ prefetchnta($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchrT0( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==1);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHT0 $mem\t! Prefetch into L1 and L2 caches for read" %}
  ins_encode %{
    __ prefetcht0($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchrT2( memory mem ) %{
  predicate(UseSSE>=1 && ReadPrefetchInstr==2);
  match(PrefetchRead mem);
  ins_cost(100);

  format %{ "PREFETCHT2 $mem\t! Prefetch into L2 cache for read" %}
  ins_encode %{
    __ prefetcht2($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchw0( memory mem ) %{
  predicate(UseSSE==0 && !VM_Version::supports_3dnow_prefetch());
  match(PrefetchWrite mem);
  ins_cost(0);
  size(0);
  format %{ "Prefetch (non-SSE is empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

instruct prefetchw( memory mem ) %{
  predicate(UseSSE==0 && VM_Version::supports_3dnow_prefetch());
  match( PrefetchWrite mem );
  ins_cost(100);

  format %{ "PREFETCHW $mem\t! Prefetch into L1 cache and mark modified" %}
  ins_encode %{
    __ prefetchw($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchwNTA( memory mem ) %{
  predicate(UseSSE>=1);
  match(PrefetchWrite mem);
  ins_cost(100);

  format %{ "PREFETCHNTA $mem\t! Prefetch into non-temporal cache for write" %}
  ins_encode %{
    __ prefetchnta($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

// Prefetch instructions for allocation.

instruct prefetchAlloc0( memory mem ) %{
  predicate(UseSSE==0 && AllocatePrefetchInstr!=3);
  match(PrefetchAllocation mem);
  ins_cost(0);
  size(0);
  format %{ "Prefetch allocation (non-SSE is empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

instruct prefetchAlloc( memory mem ) %{
  predicate(AllocatePrefetchInstr==3);
  match( PrefetchAllocation mem );
  ins_cost(100);

  format %{ "PREFETCHW $mem\t! Prefetch allocation into L1 cache and mark modified" %}
  ins_encode %{
    __ prefetchw($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchAllocNTA( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==0);
  match(PrefetchAllocation mem);
  ins_cost(100);

  format %{ "PREFETCHNTA $mem\t! Prefetch allocation into non-temporal cache for write" %}
  ins_encode %{
    __ prefetchnta($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchAllocT0( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==1);
  match(PrefetchAllocation mem);
  ins_cost(100);

  format %{ "PREFETCHT0 $mem\t! Prefetch allocation into L1 and L2 caches for write" %}
  ins_encode %{
    __ prefetcht0($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

instruct prefetchAllocT2( memory mem ) %{
  predicate(UseSSE>=1 && AllocatePrefetchInstr==2);
  match(PrefetchAllocation mem);
  ins_cost(100);

  format %{ "PREFETCHT2 $mem\t! Prefetch allocation into L2 cache for write" %}
  ins_encode %{
    __ prefetcht2($mem$$Address);
  %}
  ins_pipe(ialu_mem);
%}

//----------Store Instructions-------------------------------------------------

// Store Byte
instruct storeB(memory mem, xRegI src) %{
  match(Set mem (StoreB mem src));

  ins_cost(125);
  format %{ "MOV8   $mem,$src" %}
  opcode(0x88);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Char/Short
instruct storeC(memory mem, rRegI src) %{
  match(Set mem (StoreC mem src));

  ins_cost(125);
  format %{ "MOV16  $mem,$src" %}
  opcode(0x89, 0x66);
  ins_encode( OpcS, OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer
instruct storeI(memory mem, rRegI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Long
instruct storeL(long_memory mem, eRegL src) %{
  predicate(!((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));

  ins_cost(200);
  format %{ "MOV    $mem,$src.lo\n\t"
            "MOV    $mem+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, mem ), OpcS, RegMem_Hi( src, mem ) );
  ins_pipe( ialu_mem_long_reg );
%}

// Store Long to Integer
instruct storeL2I(memory mem, eRegL src) %{
  match(Set mem (StoreI mem (ConvL2I src)));

  format %{ "MOV    $mem,$src.lo\t# long -> int" %}
  ins_encode %{
    __ movl($mem$$Address, $src$$Register);
  %}
  ins_pipe(ialu_mem_reg);
%}

// Volatile Store Long.  Must be atomic, so move it into
// the FP TOS and then do a 64-bit FIST.  Has to probe the
// target address before the store (for null-ptr checks)
// so the memory operand is used twice in the encoding.
instruct storeL_volatile(memory mem, stackSlotL src, eFlagsReg cr ) %{
  predicate(UseSSE<=1 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( KILL cr );
  ins_cost(400);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "FILD   $src\n\t"
            "FISTp  $mem\t # 64-bit atomic volatile long store" %}
  opcode(0x3B);
  ins_encode( OpcP, RegMem( EAX, mem ), enc_storeL_volatile(mem,src));
  ins_pipe( fpu_reg_mem );
%}

instruct storeLX_volatile(memory mem, stackSlotL src, regD tmp, eFlagsReg cr) %{
  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( TEMP tmp, KILL cr );
  ins_cost(380);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "MOVSD  $tmp,$src\n\t"
            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  ins_encode %{
    __ cmpl(rax, $mem$$Address);
    __ movdbl($tmp$$XMMRegister, Address(rsp, $src$$disp));
    __ movdbl($mem$$Address, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct storeLX_reg_volatile(memory mem, eRegL src, regD tmp2, regD tmp, eFlagsReg cr) %{
  predicate(UseSSE>=2 && ((StoreLNode*)n)->require_atomic_access());
  match(Set mem (StoreL mem src));
  effect( TEMP tmp2 , TEMP tmp, KILL cr );
  ins_cost(360);
  format %{ "CMP    $mem,EAX\t# Probe address for implicit null check\n\t"
            "MOVD   $tmp,$src.lo\n\t"
            "MOVD   $tmp2,$src.hi\n\t"
            "PUNPCKLDQ $tmp,$tmp2\n\t"
            "MOVSD  $mem,$tmp\t # 64-bit atomic volatile long store" %}
  ins_encode %{
    __ cmpl(rax, $mem$$Address);
    __ movdl($tmp$$XMMRegister, $src$$Register);
    __ movdl($tmp2$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdbl($mem$$Address, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store Pointer; for storing unknown oops and raw pointers
instruct storeP(memory mem, anyRegP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(125);
  format %{ "MOV    $mem,$src" %}
  opcode(0x89);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer Immediate
instruct storeImmI(memory mem, immI src) %{
  match(Set mem (StoreI mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Short/Char Immediate
instruct storeImmI16(memory mem, immI16 src) %{
  predicate(UseStoreImmI16);
  match(Set mem (StoreC mem src));

  ins_cost(150);
  format %{ "MOV16  $mem,$src" %}
  opcode(0xC7);     /* C7 /0 Same as 32 store immediate with prefix */
  ins_encode( SizePrefix, OpcP, RMopc_Mem(0x00,mem),  Con16( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Pointer Immediate; null pointers or constant oops that do not
// need card-mark barriers.
instruct storeImmP(memory mem, immP src) %{
  match(Set mem (StoreP mem src));

  ins_cost(150);
  format %{ "MOV    $mem,$src" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Byte Immediate
instruct storeImmB(memory mem, immI8 src) %{
  match(Set mem (StoreB mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store CMS card-mark Immediate
instruct storeImmCM(memory mem, immI8 src) %{
  match(Set mem (StoreCM mem src));

  ins_cost(150);
  format %{ "MOV8   $mem,$src\t! CMS card-mark imm0" %}
  opcode(0xC6);               /* C6 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con8or32( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Double
instruct storeDPR( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem src));

  ins_cost(100);
  format %{ "FST_D  $mem,$src" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FPR_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store double does rounding on x86
instruct storeDPR_rounded( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreD mem (RoundDouble src)));

  ins_cost(100);
  format %{ "FST_D  $mem,$src\t# round" %}
  opcode(0xDD);       /* DD /2 */
  ins_encode( enc_FPR_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store XMM register to memory (double-precision floating points)
// MOVSD instruction
instruct storeD(memory mem, regD src) %{
  predicate(UseSSE>=2);
  match(Set mem (StoreD mem src));
  ins_cost(95);
  format %{ "MOVSD  $mem,$src" %}
  ins_encode %{
    __ movdbl($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store XMM register to memory (single-precision floating point)
// MOVSS instruction
instruct storeF(memory mem, regF src) %{
  predicate(UseSSE>=1);
  match(Set mem (StoreF mem src));
  ins_cost(95);
  format %{ "MOVSS  $mem,$src" %}
  ins_encode %{
    __ movflt($mem$$Address, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Store Float
instruct storeFPR( memory mem, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set mem (StoreF mem src));

  ins_cost(100);
  format %{ "FST_S  $mem,$src" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FPR_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float does rounding on x86
instruct storeFPR_rounded( memory mem, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set mem (StoreF mem (RoundFloat src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FPR_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store Float does rounding on x86
instruct storeFPR_Drounded( memory mem, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set mem (StoreF mem (ConvD2F src)));

  ins_cost(100);
  format %{ "FST_S  $mem,$src\t# D-round" %}
  opcode(0xD9);       /* D9 /2 */
  ins_encode( enc_FPR_store(mem,src) );
  ins_pipe( fpu_mem_reg );
%}

// Store immediate Float value (it is faster than store from FPU register)
// The instruction usage is guarded by predicate in operand immFPR().
instruct storeFPR_imm( memory mem, immFPR src) %{
  match(Set mem (StoreF mem src));

  ins_cost(50);
  format %{ "MOV    $mem,$src\t# store float" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32FPR_as_bits( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store immediate Float value (it is faster than store from XMM register)
// The instruction usage is guarded by predicate in operand immF().
instruct storeF_imm( memory mem, immF src) %{
  match(Set mem (StoreF mem src));

  ins_cost(50);
  format %{ "MOV    $mem,$src\t# store float" %}
  opcode(0xC7);               /* C7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),  Con32F_as_bits( src ));
  ins_pipe( ialu_mem_imm );
%}

// Store Integer to stack slot
instruct storeSSI(stackSlotI dst, rRegI src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Integer to stack slot
instruct storeSSP(stackSlotP dst, eRegP src) %{
  match(Set dst src);

  ins_cost(100);
  format %{ "MOV    $dst,$src" %}
  opcode(0x89);
  ins_encode( OpcPRegSS( dst, src ) );
  ins_pipe( ialu_mem_reg );
%}

// Store Long to stack slot
instruct storeSSL(stackSlotL dst, eRegL src) %{
  match(Set dst src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}

//----------MemBar Instructions-----------------------------------------------
// Memory barrier flavors

instruct membar_acquire() %{
  match(MemBarAcquire);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-acquire ! (empty encoding)" %}
  ins_encode();
  ins_pipe(empty);
%}

instruct membar_acquire_lock() %{
  match(MemBarAcquireLock);
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-acquire (prior CMPXCHG in FastLock so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_release() %{
  match(MemBarRelease);
  ins_cost(400);

  size(0);
  format %{ "MEMBAR-release ! (empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_release_lock() %{
  match(MemBarReleaseLock);
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-release (a FastUnlock follows so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_volatile(eFlagsReg cr) %{
  match(MemBarVolatile);
  effect(KILL cr);
  ins_cost(400);

  format %{ 
    $$template
    if (os::is_MP()) {
      $$emit$$"LOCK ADDL [ESP + #0], 0\t! membar_volatile"
    } else {
      $$emit$$"MEMBAR-volatile ! (empty encoding)"
    }
  %}
  ins_encode %{
    __ membar(Assembler::StoreLoad);
  %}
  ins_pipe(pipe_slow);
%}

instruct unnecessary_membar_volatile() %{
  match(MemBarVolatile);
  predicate(Matcher::post_store_load_barrier(n));
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-volatile (unnecessary so empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

instruct membar_storestore() %{
  match(MemBarStoreStore);
  ins_cost(0);

  size(0);
  format %{ "MEMBAR-storestore (empty encoding)" %}
  ins_encode( );
  ins_pipe(empty);
%}

//----------Move Instructions--------------------------------------------------
instruct castX2P(eAXRegP dst, eAXRegI src) %{
  match(Set dst (CastX2P src));
  format %{ "# X2P  $dst, $src" %}
  ins_encode( /*empty encoding*/ );
  ins_cost(0);
  ins_pipe(empty);
%}

instruct castP2X(rRegI dst, eRegP src ) %{
  match(Set dst (CastP2X src));
  ins_cost(50);
  format %{ "MOV    $dst, $src\t# CastP2X" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

//----------Conditional Move---------------------------------------------------
// Conditional move
instruct jmovI_reg(cmpOp cop, eFlagsReg cr, rRegI dst, rRegI src) %{
  predicate(!VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "J$cop,us skip\t# signed cmove\n\t"
            "MOV    $dst,$src\n"
      "skip:" %}
  ins_encode %{
    Label Lskip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
    __ movl($dst$$Register, $src$$Register);
    __ bind(Lskip);
  %}
  ins_pipe( pipe_cmov_reg );
%}

instruct jmovI_regU(cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src) %{
  predicate(!VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "J$cop,us skip\t# unsigned cmove\n\t"
            "MOV    $dst,$src\n"
      "skip:" %}
  ins_encode %{
    Label Lskip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), Lskip);
    __ movl($dst$$Register, $src$$Register);
    __ bind(Lskip);
  %}
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovI_reg(rRegI dst, rRegI src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovI_regU( cmpOpU cop, eFlagsRegU cr, rRegI dst, rRegI src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovI_regUCF( cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, rRegI src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovI_regU(cop, cr, dst, src);
  %}
%}

// Conditional move
instruct cmovI_mem(cmpOp cop, eFlagsReg cr, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Conditional move
instruct cmovI_memU(cmpOpU cop, eFlagsRegU cr, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cop $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

instruct cmovI_memUCF(cmpOpUCF cop, eFlagsRegUCF cr, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveI (Binary cop cr) (Binary dst (LoadI src))));
  ins_cost(250);
  expand %{
    cmovI_memU(cop, cr, dst, src);
  %}
%}

// Conditional move
instruct cmovP_reg(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move (non-P6 version)
// Note:  a CMoveP is generated for  stubs and native wrappers
//        regardless of whether we are on a P6, so we
//        emulate a cmov here
instruct cmovP_reg_nonP6(eRegP dst, eRegP src, eFlagsReg cr, cmpOp cop ) %{
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(300);
  format %{ "Jn$cop   skip\n\t"
          "MOV    $dst,$src\t# pointer\n"
      "skip:" %}
  opcode(0x8b);
  ins_encode( enc_cmov_branch(cop, 0x2), OpcP, RegReg(dst, src));
  ins_pipe( pipe_cmov_reg );
%}

// Conditional move
instruct cmovP_regU(cmpOpU cop, eFlagsRegU cr, eRegP dst, eRegP src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst,$src\t# ptr" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovP_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegP dst, eRegP src ) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveP (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovP_regU(cop, cr, dst, src);
  %}
%}

// DISABLED: Requires the ADLC to emit a bottom_type call that
// correctly meets the two pointer arguments; one is an incoming
// register but the other is a memory operand.  ALSO appears to
// be buggy with implicit null checks.
//
//// Conditional move
//instruct cmovP_mem(cmpOp cop, eFlagsReg cr, eRegP dst, memory src) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
//  ins_cost(250);
//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
//  opcode(0x0F,0x40);
//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
//  ins_pipe( pipe_cmov_mem );
//%}
//
//// Conditional move
//instruct cmovP_memU(cmpOpU cop, eFlagsRegU cr, eRegP dst, memory src) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set dst (CMoveP (Binary cop cr) (Binary dst (LoadP src))));
//  ins_cost(250);
//  format %{ "CMOV$cop $dst,$src\t# ptr" %}
//  opcode(0x0F,0x40);
//  ins_encode( enc_cmov(cop), RegMem( dst, src ) );
//  ins_pipe( pipe_cmov_mem );
//%}

// Conditional move
instruct fcmovDPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regDPR1 dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# double" %}
  opcode(0xDA);
  ins_encode( enc_cmov_dpr(cop,src) );
  ins_pipe( pipe_cmovDPR_reg );
%}

// Conditional move
instruct fcmovFPR_regU(cmpOp_fcmov cop, eFlagsRegU cr, regFPR1 dst, regFPR src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "FCMOV$cop $dst,$src\t# float" %}
  opcode(0xDA);
  ins_encode( enc_cmov_dpr(cop,src) );
  ins_pipe( pipe_cmovDPR_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
instruct fcmovDPR_regS(cmpOp cop, eFlagsReg cr, regDPR dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOV    $dst,$src\t# double\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_DPR(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovDPR_reg );
%}

// Float CMOV on Intel doesn't handle *signed* compares, only unsigned.
instruct fcmovFPR_regS(cmpOp cop, eFlagsReg cr, regFPR dst, regFPR src) %{
  predicate(UseSSE==0);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop    skip\n\t"
            "MOV    $dst,$src\t# float\n"
      "skip:" %}
  opcode (0xdd, 0x3);     /* DD D8+i or DD /3 */
  ins_encode( enc_cmov_branch( cop, 0x4 ), Push_Reg_FPR(src), OpcP, RegOpc(dst) );
  ins_pipe( pipe_cmovDPR_reg );
%}

// No CMOVE with SSE/SSE2
instruct fcmovF_regS(cmpOp cop, eFlagsReg cr, regF dst, regF src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}

// No CMOVE with SSE/SSE2
instruct fcmovD_regS(cmpOp cop, eFlagsReg cr, regD dst, regD src) %{
  predicate (UseSSE>=2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSD  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}

// unsigned version
instruct fcmovF_regU(cmpOpU cop, eFlagsRegU cr, regF dst, regF src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSS  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movflt($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}

instruct fcmovF_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regF dst, regF src) %{
  predicate (UseSSE>=1);
  match(Set dst (CMoveF (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regU(cop, cr, dst, src);
  %}
%}

// unsigned version
instruct fcmovD_regU(cmpOpU cop, eFlagsRegU cr, regD dst, regD src) %{
  predicate (UseSSE>=2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "Jn$cop   skip\n\t"
            "MOVSD  $dst,$src\t# float\n"
      "skip:" %}
  ins_encode %{
    Label skip;
    // Invert sense of branch from sense of CMOV
    __ jccb((Assembler::Condition)($cop$$cmpcode^1), skip);
    __ movdbl($dst$$XMMRegister, $src$$XMMRegister);
    __ bind(skip);
  %}
  ins_pipe( pipe_slow );
%}

instruct fcmovD_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, regD dst, regD src) %{
  predicate (UseSSE>=2);
  match(Set dst (CMoveD (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regU(cop, cr, dst, src);
  %}
%}

instruct cmovL_reg(cmpOp cop, eFlagsReg cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovL_regU(cmpOpU cop, eFlagsRegU cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cop $dst.lo,$src.lo\n\t"
            "CMOV$cop $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cop), RegReg_Lo2( dst, src ), enc_cmov(cop), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovL_regUCF(cmpOpUCF cop, eFlagsRegUCF cr, eRegL dst, eRegL src) %{
  predicate(VM_Version::supports_cmov() );
  match(Set dst (CMoveL (Binary cop cr) (Binary dst src)));
  ins_cost(200);
  expand %{
    cmovL_regU(cop, cr, dst, src);
  %}
%}

//----------Arithmetic Instructions--------------------------------------------
//----------Addition Instructions----------------------------------------------

instruct addExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
%{
  match(AddExactI dst src);
  effect(DEF cr);

  format %{ "ADD    $dst, $src\t# addExact int" %}
  ins_encode %{
    __ addl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}

instruct addExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
%{
  match(AddExactI dst src);
  effect(DEF cr);

  format %{ "ADD    $dst, $src\t# addExact int" %}
  ins_encode %{
    __ addl($dst$$Register, $src$$constant);
  %}
  ins_pipe(ialu_reg_reg);
%}

instruct addExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
%{
  match(AddExactI dst (LoadI src));
  effect(DEF cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src\t# addExact int" %}
  ins_encode %{
    __ addl($dst$$Register, $src$$Address);
  %}
  ins_pipe( ialu_reg_mem );
%}


// Integer Addition Instructions
instruct addI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct addI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddI dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81, 0x00); /* /0 id */
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "INC    $dst" %}
  opcode(0x40); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}

instruct leaI_eReg_immI(rRegI dst, rRegI src0, immI src1) %{
  match(Set dst (AddI src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

instruct leaP_eReg_immI(eRegP dst, eRegP src0, immI src1) %{
  match(Set dst (AddP src0 src1));
  ins_cost(110);

  format %{ "LEA    $dst,[$src0 + $src1]\t# ptr" %}
  opcode(0x8D); /* 0x8D /r */
  ins_encode( OpcP, RegLea( dst, src0, src1 ) );
  ins_pipe( ialu_reg_reg );
%}

instruct decI_eReg(rRegI dst, immI_M1 src, eFlagsReg cr) %{
  predicate(UseIncDec);
  match(Set dst (AddI dst src));
  effect(KILL cr);

  size(1);
  format %{ "DEC    $dst" %}
  opcode(0x48); /*  */
  ins_encode( Opc_plus( primary, dst ) );
  ins_pipe( ialu_reg );
%}

instruct addP_eReg(eRegP dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  size(2);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct addP_eReg_imm(eRegP dst, immI src, eFlagsReg cr) %{
  match(Set dst (AddP dst src));
  effect(KILL cr);

  format %{ "ADD    $dst,$src" %}
  opcode(0x81,0x00); /* Opcode 81 /0 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct addI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AddI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x03);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

instruct addI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "ADD    $dst,$src" %}
  opcode(0x01);  /* Opcode 01 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Add Memory with Immediate
instruct addI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "ADD    $dst,$src" %}
  opcode(0x81);               /* Opcode 81 /0 id */
  ins_encode( OpcSE( src ), RMopc_Mem(0x00,dst), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

instruct incI_mem(memory dst, immI1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "INC    $dst" %}
  opcode(0xFF);               /* Opcode FF /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,dst));
  ins_pipe( ialu_mem_imm );
%}

instruct decI_mem(memory dst, immI_M1 src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AddI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "DEC    $dst" %}
  opcode(0xFF);               /* Opcode FF /1 */
  ins_encode( OpcP, RMopc_Mem(0x01,dst));
  ins_pipe( ialu_mem_imm );
%}


instruct checkCastPP( eRegP dst ) %{
  match(Set dst (CheckCastPP dst));

  size(0);
  format %{ "#checkcastPP of $dst" %}
  ins_encode( /*empty encoding*/ );
  ins_pipe( empty );
%}

instruct castPP( eRegP dst ) %{
  match(Set dst (CastPP dst));
  format %{ "#castPP of $dst" %}
  ins_encode( /*empty encoding*/ );
  ins_pipe( empty );
%}

instruct castII( rRegI dst ) %{
  match(Set dst (CastII dst));
  format %{ "#castII of $dst" %}
  ins_encode( /*empty encoding*/ );
  ins_cost(0);
  ins_pipe( empty );
%}


// Load-locked - same as a regular pointer load when used with compare-swap
instruct loadPLocked(eRegP dst, memory mem) %{
  match(Set dst (LoadPLocked mem));

  ins_cost(125);
  format %{ "MOV    $dst,$mem\t# Load ptr. locked" %}
  opcode(0x8B);
  ins_encode( OpcP, RegMem(dst,mem));
  ins_pipe( ialu_reg_mem );
%}

// Conditional-store of the updated heap-top.
// Used during allocation of the shared heap.
// Sets flags (EQ) on success.  Implemented with a CMPXCHG on Intel.
instruct storePConditional( memory heap_top_ptr, eAXRegP oldval, eRegP newval, eFlagsReg cr ) %{
  match(Set cr (StorePConditional heap_top_ptr (Binary oldval newval)));
  // EAX is killed if there is contention, but then it's also unused.
  // In the common case of no contention, EAX holds the new oop address.
  format %{ "CMPXCHG $heap_top_ptr,$newval\t# If EAX==$heap_top_ptr Then store $newval into $heap_top_ptr" %}
  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval,heap_top_ptr) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of an int value.
// ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG on Intel.
instruct storeIConditional( memory mem, eAXRegI oldval, rRegI newval, eFlagsReg cr ) %{
  match(Set cr (StoreIConditional mem (Binary oldval newval)));
  effect(KILL oldval);
  format %{ "CMPXCHG $mem,$newval\t# If EAX==$mem Then store $newval into $mem" %}
  ins_encode( lock_prefix, Opcode(0x0F), Opcode(0xB1), RegMem(newval, mem) );
  ins_pipe( pipe_cmpxchg );
%}

// Conditional-store of a long value.
// ZF flag is set on success, reset otherwise.  Implemented with a CMPXCHG8 on Intel.
instruct storeLConditional( memory mem, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  match(Set cr (StoreLConditional mem (Binary oldval newval)));
  effect(KILL oldval);
  format %{ "XCHG   EBX,ECX\t# correct order for CMPXCHG8 instruction\n\t"
            "CMPXCHG8 $mem,ECX:EBX\t# If EDX:EAX==$mem Then store ECX:EBX into $mem\n\t"
            "XCHG   EBX,ECX"
  %}
  ins_encode %{
    // Note: we need to swap rbx, and rcx before and after the
    //       cmpxchg8 instruction because the instruction uses
    //       rcx as the high order word of the new value to store but
    //       our register encoding uses rbx.
    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
    if( os::is_MP() )
      __ lock();
    __ cmpxchg8($mem$$Address);
    __ xchgl(as_Register(EBX_enc), as_Register(ECX_enc));
  %}
  ins_pipe( pipe_cmpxchg );
%}

// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them

instruct compareAndSwapL( rRegI res, eSIRegP mem_ptr, eADXRegL oldval, eBCXRegL newval, eFlagsReg cr ) %{
  predicate(VM_Version::supports_cx8());
  match(Set res (CompareAndSwapL mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG8 [$mem_ptr],$newval\t# If EDX:EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg8(mem_ptr),
              enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

instruct compareAndSwapP( rRegI res,  pRegP mem_ptr, eAXRegP oldval, eCXRegP newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapP mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

instruct compareAndSwapI( rRegI res, pRegP mem_ptr, eAXRegI oldval, eCXRegI newval, eFlagsReg cr) %{
  match(Set res (CompareAndSwapI mem_ptr (Binary oldval newval)));
  effect(KILL cr, KILL oldval);
  format %{ "CMPXCHG [$mem_ptr],$newval\t# If EAX==[$mem_ptr] Then store $newval into [$mem_ptr]\n\t"
            "MOV    $res,0\n\t"
            "JNE,s  fail\n\t"
            "MOV    $res,1\n"
          "fail:" %}
  ins_encode( enc_cmpxchg(mem_ptr), enc_flags_ne_to_boolean(res) );
  ins_pipe( pipe_cmpxchg );
%}

instruct xaddI_no_res( memory mem, Universe dummy, immI add, eFlagsReg cr) %{
  predicate(n->as_LoadStore()->result_not_used());
  match(Set dummy (GetAndAddI mem add));
  effect(KILL cr);
  format %{ "ADDL  [$mem],$add" %}
  ins_encode %{
    if (os::is_MP()) { __ lock(); }
    __ addl($mem$$Address, $add$$constant);
  %}
  ins_pipe( pipe_cmpxchg );
%}

instruct xaddI( memory mem, rRegI newval, eFlagsReg cr) %{
  match(Set newval (GetAndAddI mem newval));
  effect(KILL cr);
  format %{ "XADDL  [$mem],$newval" %}
  ins_encode %{
    if (os::is_MP()) { __ lock(); }
    __ xaddl($mem$$Address, $newval$$Register);
  %}
  ins_pipe( pipe_cmpxchg );
%}

instruct xchgI( memory mem, rRegI newval) %{
  match(Set newval (GetAndSetI mem newval));
  format %{ "XCHGL  $newval,[$mem]" %}
  ins_encode %{
    __ xchgl($newval$$Register, $mem$$Address);
  %}
  ins_pipe( pipe_cmpxchg );
%}

instruct xchgP( memory mem, pRegP newval) %{
  match(Set newval (GetAndSetP mem newval));
  format %{ "XCHGL  $newval,[$mem]" %}
  ins_encode %{
    __ xchgl($newval$$Register, $mem$$Address);
  %}
  ins_pipe( pipe_cmpxchg );
%}

//----------Subtraction Instructions-------------------------------------------

instruct subExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
%{
  match(SubExactI dst src);
  effect(DEF cr);

  format %{ "SUB    $dst, $src\t# subExact int" %}
  ins_encode %{
    __ subl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}

instruct subExactI_eReg_imm(eAXRegI dst, immI src, eFlagsReg cr)
%{
  match(SubExactI dst src);
  effect(DEF cr);

  format %{ "SUB    $dst, $src\t# subExact int" %}
  ins_encode %{
    __ subl($dst$$Register, $src$$constant);
  %}
  ins_pipe(ialu_reg_reg);
%}

instruct subExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
%{
  match(SubExactI dst (LoadI src));
  effect(DEF cr);

  ins_cost(125);
  format %{ "SUB    $dst,$src\t# subExact int" %}
  ins_encode %{
    __ subl($dst$$Register, $src$$Address);
  %}
  ins_pipe( ialu_reg_mem );
%}

// Integer Subtraction Instructions
instruct subI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct subI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (SubI dst src));
  effect(KILL cr);

  format %{ "SUB    $dst,$src" %}
  opcode(0x81,0x05);  /* Opcode 81 /5 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

instruct subI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (SubI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

instruct subI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (SubI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "SUB    $dst,$src" %}
  opcode(0x29);  /* Opcode 29 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Subtract from a pointer
instruct subP_eReg(eRegP dst, rRegI src, immI0 zero, eFlagsReg cr) %{
  match(Set dst (AddP dst (SubI zero src)));
  effect(KILL cr);

  size(2);
  format %{ "SUB    $dst,$src" %}
  opcode(0x2B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct negI_eReg(rRegI dst, immI0 zero, eFlagsReg cr) %{
  match(Set dst (SubI zero dst));
  effect(KILL cr);

  size(2);
  format %{ "NEG    $dst" %}
  opcode(0xF7,0x03);  // Opcode F7 /3
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

instruct negExactI_eReg(eAXRegI dst, eFlagsReg cr) %{
  match(NegExactI dst);
  effect(DEF cr);

  format %{ "NEG    $dst\t# negExact int"%}
  ins_encode %{
    __ negl($dst$$Register);
  %}
  ins_pipe(ialu_reg);
%}

//----------Multiplication/Division Instructions-------------------------------
// Integer Multiplication Instructions
// Multiply Register
instruct mulI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (MulI dst src));
  effect(KILL cr);

  size(3);
  ins_cost(300);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Multiply 32-bit Immediate
instruct mulI_eReg_imm(rRegI dst, rRegI src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI src imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegReg( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

instruct loadConL_low_only(eADXRegL_low_only dst, immL32 src, eFlagsReg cr) %{
  match(Set dst src);
  effect(KILL cr);

  // Note that this is artificially increased to make it more expensive than loadConL
  ins_cost(250);
  format %{ "MOV    EAX,$src\t// low word only" %}
  opcode(0xB8);
  ins_encode( LdImmL_Lo(dst, src) );
  ins_pipe( ialu_reg_fat );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
//  (special case for shift by 32)
instruct mulI_imm_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(0*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}

// Multiply by 32-bit Immediate, taking the shifted high order results
instruct mulI_imm_RShift_high(eDXRegI dst, nadxRegI src1, eADXRegL_low_only src2, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (ConvL2I (RShiftL (MulL (ConvI2L src1) src2) cnt)));
  predicate( _kids[0]->_kids[0]->_kids[1]->_leaf->Opcode() == Op_ConL &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() >= min_jint &&
             _kids[0]->_kids[0]->_kids[1]->_leaf->as_Type()->type()->is_long()->get_con() <= max_jint );
  effect(USE src1, KILL cr);

  // Note that this is adjusted by 150 to compensate for the overcosting of loadConL_low_only
  ins_cost(1*100 + 1*400 - 150);
  format %{ "IMUL   EDX:EAX,$src1\n\t"
            "SAR    EDX,$cnt-32" %}
  ins_encode( multiply_con_and_shift_high( dst, src1, src2, cnt, cr ) );
  ins_pipe( pipe_slow );
%}

// Multiply Memory 32-bit Immediate
instruct mulI_mem_imm(rRegI dst, memory src, immI imm, eFlagsReg cr) %{
  match(Set dst (MulI (LoadI src) imm));
  effect(KILL cr);

  ins_cost(300);
  format %{ "IMUL   $dst,$src,$imm" %}
  opcode(0x69);  /* 69 /r id */
  ins_encode( OpcSE(imm), RegMem( dst, src ), Con8or32( imm ) );
  ins_pipe( ialu_reg_mem_alu0 );
%}

// Multiply Memory
instruct mulI(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (MulI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(350);
  format %{ "IMUL   $dst,$src" %}
  opcode(0xAF, 0x0F);
  ins_encode( OpcS, OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem_alu0 );
%}

// Multiply Register Int to Long
instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
  // Basic Idea: long = (long)int * (long)int
  match(Set dst (MulL (ConvI2L src) (ConvI2L src1)));
  effect(DEF dst, USE src, USE src1, KILL flags);

  ins_cost(300);
  format %{ "IMUL   $dst,$src1" %}

  ins_encode( long_int_multiply( dst, src1 ) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

instruct mulIS_eReg(eADXRegL dst, immL_32bits mask, eFlagsReg flags, eAXRegI src, nadxRegI src1) %{
  // Basic Idea:  long = (int & 0xffffffffL) * (int & 0xffffffffL)
  match(Set dst (MulL (AndL (ConvI2L src) mask) (AndL (ConvI2L src1) mask)));
  effect(KILL flags);

  ins_cost(300);
  format %{ "MUL    $dst,$src1" %}

  ins_encode( long_uint_multiply(dst, src1) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Multiply Register Long
instruct mulL_eReg(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(4*100+3*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  format %{ "MOV    $tmp,$src.lo\n\t"
            "IMUL   $tmp,EDX\n\t"
            "MOV    EDX,$src.hi\n\t"
            "IMUL   EDX,EAX\n\t"
            "ADD    $tmp,EDX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode( long_multiply( dst, src, tmp ) );
  ins_pipe( pipe_slow );
%}

// Multiply Register Long where the left operand's high 32 bits are zero
instruct mulL_eReg_lhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
  predicate(is_operand_hi32_zero(n->in(1)));
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_lo * y_hi) where lo(x_hi * y_lo) = 0 because x_hi = 0
  format %{ "MOV    $tmp,$src.hi\n\t"
            "IMUL   $tmp,EAX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode %{
    __ movl($tmp$$Register, HIGH_FROM_LOW($src$$Register));
    __ imull($tmp$$Register, rax);
    __ mull($src$$Register);
    __ addl(rdx, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Multiply Register Long where the right operand's high 32 bits are zero
instruct mulL_eReg_rhi0(eADXRegL dst, eRegL src, rRegI tmp, eFlagsReg cr) %{
  predicate(is_operand_hi32_zero(n->in(2)));
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) where lo(x_lo * y_hi) = 0 because y_hi = 0
  format %{ "MOV    $tmp,$src.lo\n\t"
            "IMUL   $tmp,EDX\n\t"
            "MUL    EDX:EAX,$src.lo\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode %{
    __ movl($tmp$$Register, $src$$Register);
    __ imull($tmp$$Register, rdx);
    __ mull($src$$Register);
    __ addl(rdx, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Multiply Register Long where the left and the right operands' high 32 bits are zero
instruct mulL_eReg_hi0(eADXRegL dst, eRegL src, eFlagsReg cr) %{
  predicate(is_operand_hi32_zero(n->in(1)) && is_operand_hi32_zero(n->in(2)));
  match(Set dst (MulL dst src));
  effect(KILL cr);
  ins_cost(1*400);
// Basic idea: lo(result) = lo(x_lo * y_lo)
//             hi(result) = hi(x_lo * y_lo) where lo(x_hi * y_lo) = 0 and lo(x_lo * y_hi) = 0 because x_hi = 0 and y_hi = 0
  format %{ "MUL    EDX:EAX,$src.lo\n\t" %}
  ins_encode %{
    __ mull($src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Multiply Register Long by small constant
instruct mulL_eReg_con(eADXRegL dst, immL_127 src, rRegI tmp, eFlagsReg cr) %{
  match(Set dst (MulL dst src));
  effect(KILL cr, TEMP tmp);
  ins_cost(2*100+2*400);
  size(12);
// Basic idea: lo(result) = lo(src * EAX)
//             hi(result) = hi(src * EAX) + lo(src * EDX)
  format %{ "IMUL   $tmp,EDX,$src\n\t"
            "MOV    EDX,$src\n\t"
            "MUL    EDX\t# EDX*EAX -> EDX:EAX\n\t"
            "ADD    EDX,$tmp" %}
  ins_encode( long_multiply_con( dst, src, tmp ) );
  ins_pipe( pipe_slow );
%}

instruct mulExactI_eReg(eAXRegI dst, rRegI src, eFlagsReg cr)
%{
  match(MulExactI dst src);
  effect(DEF cr);

  ins_cost(300);
  format %{ "IMUL   $dst, $src\t# mulExact int" %}
  ins_encode %{
    __ imull($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg_alu0);
%}

instruct mulExactI_eReg_imm(eAXRegI dst, rRegI src, immI imm, eFlagsReg cr)
%{
  match(MulExactI src imm);
  effect(DEF cr);

  ins_cost(300);
  format %{ "IMUL   $dst, $src, $imm\t# mulExact int" %}
  ins_encode %{
    __ imull($dst$$Register, $src$$Register, $imm$$constant);
  %}
  ins_pipe(ialu_reg_reg_alu0);
%}

instruct mulExactI_eReg_mem(eAXRegI dst, memory src, eFlagsReg cr)
%{
  match(MulExactI dst (LoadI src));
  effect(DEF cr);

  ins_cost(350);
  format %{ "IMUL   $dst, $src\t# mulExact int" %}
  ins_encode %{
    __ imull($dst$$Register, $src$$Address);
  %}
  ins_pipe(ialu_reg_mem_alu0);
%}


// Integer DIV with Register
instruct divI_eReg(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  match(Set rax (DivI rax div));
  effect(KILL rdx, KILL cr);
  size(26);
  ins_cost(30*100+10*100);
  format %{ "CMP    EAX,0x80000000\n\t"
            "JNE,s  normal\n\t"
            "XOR    EDX,EDX\n\t"
            "CMP    ECX,-1\n\t"
            "JE,s   done\n"
    "normal: CDQ\n\t"
            "IDIV   $div\n\t"
    "done:"        %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Divide Register Long
instruct divL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (DivL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::ldiv\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_div(src1,src2) );
  ins_pipe( pipe_slow );
%}

// Integer DIVMOD with Register, both quotient and mod results
instruct divModI_eReg_divmod(eAXRegI rax, eDXRegI rdx, eCXRegI div, eFlagsReg cr) %{
  match(DivModI rax div);
  effect(KILL cr);
  size(26);
  ins_cost(30*100+10*100);
  format %{ "CMP    EAX,0x80000000\n\t"
            "JNE,s  normal\n\t"
            "XOR    EDX,EDX\n\t"
            "CMP    ECX,-1\n\t"
            "JE,s   done\n"
    "normal: CDQ\n\t"
            "IDIV   $div\n\t"
    "done:"        %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( pipe_slow );
%}

// Integer MOD with Register
instruct modI_eReg(eDXRegI rdx, eAXRegI rax, eCXRegI div, eFlagsReg cr) %{
  match(Set rdx (ModI rax div));
  effect(KILL rax, KILL cr);

  size(26);
  ins_cost(300);
  format %{ "CDQ\n\t"
            "IDIV   $div" %}
  opcode(0xF7, 0x7);  /* Opcode F7 /7 */
  ins_encode( cdq_enc, OpcP, RegOpc(div) );
  ins_pipe( ialu_reg_reg_alu0 );
%}

// Remainder Register Long
instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI cx, eBXRegI bx ) %{
  match(Set dst (ModL src1 src2));
  effect( KILL cr, KILL cx, KILL bx );
  ins_cost(10000);
  format %{ "PUSH   $src1.hi\n\t"
            "PUSH   $src1.lo\n\t"
            "PUSH   $src2.hi\n\t"
            "PUSH   $src2.lo\n\t"
            "CALL   SharedRuntime::lrem\n\t"
            "ADD    ESP,16" %}
  ins_encode( long_mod(src1,src2) );
  ins_pipe( pipe_slow );
%}

// Divide Register Long (no special case since divisor != -1)
instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
  match(Set dst (DivL dst imm));
  effect( TEMP tmp, TEMP tmp2, KILL cr );
  ins_cost(1000);
  format %{ "MOV    $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t"
            "XOR    $tmp2,$tmp2\n\t"
            "CMP    $tmp,EDX\n\t"
            "JA,s   fast\n\t"
            "MOV    $tmp2,EAX\n\t"
            "MOV    EAX,EDX\n\t"
            "MOV    EDX,0\n\t"
            "JLE,s  pos\n\t"
            "LNEG   EAX : $tmp2\n\t"
            "DIV    $tmp # unsigned division\n\t"
            "XCHG   EAX,$tmp2\n\t"
            "DIV    $tmp\n\t"
            "LNEG   $tmp2 : EAX\n\t"
            "JMP,s  done\n"
    "pos:\n\t"
            "DIV    $tmp\n\t"
            "XCHG   EAX,$tmp2\n"
    "fast:\n\t"
            "DIV    $tmp\n"
    "done:\n\t"
            "MOV    EDX,$tmp2\n\t"
            "NEG    EDX:EAX # if $imm < 0" %}
  ins_encode %{
    int con = (int)$imm$$constant;
    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
    int pcon = (con > 0) ? con : -con;
    Label Lfast, Lpos, Ldone;

    __ movl($tmp$$Register, pcon);
    __ xorl($tmp2$$Register,$tmp2$$Register);
    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
    __ jccb(Assembler::above, Lfast); // result fits into 32 bit

    __ movl($tmp2$$Register, $dst$$Register); // save
    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
    __ movl(HIGH_FROM_LOW($dst$$Register),0); // preserve flags
    __ jccb(Assembler::lessEqual, Lpos); // result is positive

    // Negative dividend.
    // convert value to positive to use unsigned division
    __ lneg($dst$$Register, $tmp2$$Register);
    __ divl($tmp$$Register);
    __ xchgl($dst$$Register, $tmp2$$Register);
    __ divl($tmp$$Register);
    // revert result back to negative
    __ lneg($tmp2$$Register, $dst$$Register);
    __ jmpb(Ldone);

    __ bind(Lpos);
    __ divl($tmp$$Register); // Use unsigned division
    __ xchgl($dst$$Register, $tmp2$$Register);
    // Fallthrow for final divide, tmp2 has 32 bit hi result

    __ bind(Lfast);
    // fast path: src is positive
    __ divl($tmp$$Register); // Use unsigned division

    __ bind(Ldone);
    __ movl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register);
    if (con < 0) {
      __ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

// Remainder Register Long (remainder fit into 32 bits)
instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, rRegI tmp, rRegI tmp2, eFlagsReg cr ) %{
  match(Set dst (ModL dst imm));
  effect( TEMP tmp, TEMP tmp2, KILL cr );
  ins_cost(1000);
  format %{ "MOV    $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t"
            "CMP    $tmp,EDX\n\t"
            "JA,s   fast\n\t"
            "MOV    $tmp2,EAX\n\t"
            "MOV    EAX,EDX\n\t"
            "MOV    EDX,0\n\t"
            "JLE,s  pos\n\t"
            "LNEG   EAX : $tmp2\n\t"
            "DIV    $tmp # unsigned division\n\t"
            "MOV    EAX,$tmp2\n\t"
            "DIV    $tmp\n\t"
            "NEG    EDX\n\t"
            "JMP,s  done\n"
    "pos:\n\t"
            "DIV    $tmp\n\t"
            "MOV    EAX,$tmp2\n"
    "fast:\n\t"
            "DIV    $tmp\n"
    "done:\n\t"
            "MOV    EAX,EDX\n\t"
            "SAR    EDX,31\n\t" %}
  ins_encode %{
    int con = (int)$imm$$constant;
    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
    int pcon = (con > 0) ? con : -con;
    Label  Lfast, Lpos, Ldone;

    __ movl($tmp$$Register, pcon);
    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
    __ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit

    __ movl($tmp2$$Register, $dst$$Register); // save
    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
    __ movl(HIGH_FROM_LOW($dst$$Register),0); // preserve flags
    __ jccb(Assembler::lessEqual, Lpos); // result is positive

    // Negative dividend.
    // convert value to positive to use unsigned division
    __ lneg($dst$$Register, $tmp2$$Register);
    __ divl($tmp$$Register);
    __ movl($dst$$Register, $tmp2$$Register);
    __ divl($tmp$$Register);
    // revert remainder back to negative
    __ negl(HIGH_FROM_LOW($dst$$Register));
    __ jmpb(Ldone);

    __ bind(Lpos);
    __ divl($tmp$$Register);
    __ movl($dst$$Register, $tmp2$$Register);

    __ bind(Lfast);
    // fast path: src is positive
    __ divl($tmp$$Register);

    __ bind(Ldone);
    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign

  %}
  ins_pipe( pipe_slow );
%}

// Integer Shift Instructions
// Shift Left by one
instruct shlI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD1, 0x4);  /* D1 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Shift Left by 8-bit immediate
instruct salI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}

// Shift Left by variable
instruct salI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHL    $dst,$shift" %}
  opcode(0xD3, 0x4);  /* D3 /4 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}

// Arithmetic shift right by one
instruct sarI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Arithmetic shift right by one
instruct sarI_mem_1(memory dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD1, 0x7);  /* D1 /7 */
  ins_encode( OpcP, RMopc_Mem(secondary,dst) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate
instruct sarI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( RegOpcImm( dst, shift ) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by 8-bit immediate
instruct sarI_mem_imm(memory dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (StoreI dst (RShiftI (LoadI dst) shift)));
  effect(KILL cr);

  format %{ "SAR    $dst,$shift" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( OpcP, RMopc_Mem(secondary, dst ), Con8or32( shift ) );
  ins_pipe( ialu_mem_imm );
%}

// Arithmetic Shift Right by variable
instruct sarI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SAR    $dst,$shift" %}
  opcode(0xD3, 0x7);  /* D3 /7 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}

// Logical shift right by one
instruct shrI_eReg_1(rRegI dst, immI1 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD1, 0x5);  /* D1 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

// Logical Shift Right by 8-bit immediate
instruct shrI_eReg_imm(rRegI dst, immI8 shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(3);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( RegOpcImm( dst, shift) );
  ins_pipe( ialu_reg );
%}


// Logical Shift Right by 24, followed by Arithmetic Shift Left by 24.
// This idiom is used by the compiler for the i2b bytecode.
instruct i2b(rRegI dst, xRegI src, immI_24 twentyfour) %{
  match(Set dst (RShiftI (LShiftI src twentyfour) twentyfour));

  size(3);
  format %{ "MOVSX  $dst,$src :8" %}
  ins_encode %{
    __ movsbl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}

// Logical Shift Right by 16, followed by Arithmetic Shift Left by 16.
// This idiom is used by the compiler the i2s bytecode.
instruct i2s(rRegI dst, xRegI src, immI_16 sixteen) %{
  match(Set dst (RShiftI (LShiftI src sixteen) sixteen));

  size(3);
  format %{ "MOVSX  $dst,$src :16" %}
  ins_encode %{
    __ movswl($dst$$Register, $src$$Register);
  %}
  ins_pipe(ialu_reg_reg);
%}


// Logical Shift Right by variable
instruct shrI_eReg_CL(rRegI dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftI dst shift));
  effect(KILL cr);

  size(2);
  format %{ "SHR    $dst,$shift" %}
  opcode(0xD3, 0x5);  /* D3 /5 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg_reg );
%}


//----------Logical Instructions-----------------------------------------------
//----------Integer Logical Instructions---------------------------------------
// And Instructions
// And Register with Register
instruct andI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  size(2);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// And Register with Immediate
instruct andI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (AndI dst src));
  effect(KILL cr);

  format %{ "AND    $dst,$src" %}
  opcode(0x81,0x04);  /* Opcode 81 /4 */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// And Register with Memory
instruct andI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (AndI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x23);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// And Memory with Register
instruct andI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "AND    $dst,$src" %}
  opcode(0x21);  /* Opcode 21 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// And Memory with Immediate
instruct andI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (AndI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "AND    $dst,$src" %}
  opcode(0x81, 0x4);  /* Opcode 81 /4 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

// Or Instructions
// Or Register with Register
instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  size(2);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct orI_eReg_castP2X(rRegI dst, eRegP src, eFlagsReg cr) %{
  match(Set dst (OrI dst (CastP2X src)));
  effect(KILL cr);

  size(2);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}


// Or Register with Immediate
instruct orI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (OrI dst src));
  effect(KILL cr);

  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x01);  /* Opcode 81 /1 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Or Register with Memory
instruct orI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (OrI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x0B);
  ins_encode( OpcP, RegMem( dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Or Memory with Register
instruct orI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "OR     $dst,$src" %}
  opcode(0x09);  /* Opcode 09 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Or Memory with Immediate
instruct orI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (OrI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "OR     $dst,$src" %}
  opcode(0x81,0x1);  /* Opcode 81 /1 id */
  // ins_encode( MemImm( dst, src) );
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

// ROL/ROR
// ROL expand
instruct rolI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xD1, 0x0); /* Opcode D1 /0 */
  ins_encode( OpcP, RegOpc( dst ));
  ins_pipe( ialu_reg );
%}

instruct rolI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xC1, 0x0); /*Opcode /C1  /0  */
  ins_encode( RegOpcImm(dst, shift) );
  ins_pipe(ialu_reg);
%}

instruct rolI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROL    $dst, $shift" %}
  opcode(0xD3, 0x0);    /* Opcode D3 /0 */
  ins_encode(OpcP, RegOpc(dst));
  ins_pipe( ialu_reg_reg );
%}
// end of ROL expand

// ROL 32bit by one once
instruct rolI_eReg_i1(rRegI dst, immI1 lshift, immI_M1 rshift, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));

  expand %{
    rolI_eReg_imm1(dst, lshift, cr);
  %}
%}

// ROL 32bit var by imm8 once
instruct rolI_eReg_i8(rRegI dst, immI8 lshift, immI8 rshift, eFlagsReg cr) %{
  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  match(Set dst ( OrI (LShiftI dst lshift) (URShiftI dst rshift)));

  expand %{
    rolI_eReg_imm8(dst, lshift, cr);
  %}
%}

// ROL 32bit var by var once
instruct rolI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI zero shift))));

  expand %{
    rolI_eReg_CL(dst, shift, cr);
  %}
%}

// ROL 32bit var by var once
instruct rolI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  match(Set dst ( OrI (LShiftI dst shift) (URShiftI dst (SubI c32 shift))));

  expand %{
    rolI_eReg_CL(dst, shift, cr);
  %}
%}

// ROR expand
instruct rorI_eReg_imm1(rRegI dst, immI1 shift, eFlagsReg cr) %{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xD1,0x1);  /* Opcode D1 /1 */
  ins_encode( OpcP, RegOpc( dst ) );
  ins_pipe( ialu_reg );
%}

instruct rorI_eReg_imm8(rRegI dst, immI8 shift, eFlagsReg cr) %{
  effect (USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xC1, 0x1); /* Opcode /C1 /1 ib */
  ins_encode( RegOpcImm(dst, shift) );
  ins_pipe( ialu_reg );
%}

instruct rorI_eReg_CL(ncxRegI dst, eCXRegI shift, eFlagsReg cr)%{
  effect(USE_DEF dst, USE shift, KILL cr);

  format %{ "ROR    $dst, $shift" %}
  opcode(0xD3, 0x1);    /* Opcode D3 /1 */
  ins_encode(OpcP, RegOpc(dst));
  ins_pipe( ialu_reg_reg );
%}
// end of ROR expand

// ROR right once
instruct rorI_eReg_i1(rRegI dst, immI1 rshift, immI_M1 lshift, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));

  expand %{
    rorI_eReg_imm1(dst, rshift, cr);
  %}
%}

// ROR 32bit by immI8 once
instruct rorI_eReg_i8(rRegI dst, immI8 rshift, immI8 lshift, eFlagsReg cr) %{
  predicate(  0 == ((n->in(1)->in(2)->get_int() + n->in(2)->in(2)->get_int()) & 0x1f));
  match(Set dst ( OrI (URShiftI dst rshift) (LShiftI dst lshift)));

  expand %{
    rorI_eReg_imm8(dst, rshift, cr);
  %}
%}

// ROR 32bit var by var once
instruct rorI_eReg_Var_C0(ncxRegI dst, eCXRegI shift, immI0 zero, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI zero shift))));

  expand %{
    rorI_eReg_CL(dst, shift, cr);
  %}
%}

// ROR 32bit var by var once
instruct rorI_eReg_Var_C32(ncxRegI dst, eCXRegI shift, immI_32 c32, eFlagsReg cr) %{
  match(Set dst ( OrI (URShiftI dst shift) (LShiftI dst (SubI c32 shift))));

  expand %{
    rorI_eReg_CL(dst, shift, cr);
  %}
%}

// Xor Instructions
// Xor Register with Register
instruct xorI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  size(2);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegReg( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

// Xor Register with Immediate -1
instruct xorI_eReg_im1(rRegI dst, immI_M1 imm) %{
  match(Set dst (XorI dst imm));  

  size(2);
  format %{ "NOT    $dst" %}  
  ins_encode %{
     __ notl($dst$$Register);
  %}
  ins_pipe( ialu_reg );
%}

// Xor Register with Immediate
instruct xorI_eReg_imm(rRegI dst, immI src, eFlagsReg cr) %{
  match(Set dst (XorI dst src));
  effect(KILL cr);

  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x06);  /* Opcode 81 /6 id */
  // ins_encode( RegImm( dst, src) );
  ins_encode( OpcSErm( dst, src ), Con8or32( src ) );
  ins_pipe( ialu_reg );
%}

// Xor Register with Memory
instruct xorI_eReg_mem(rRegI dst, memory src, eFlagsReg cr) %{
  match(Set dst (XorI dst (LoadI src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x33);
  ins_encode( OpcP, RegMem(dst, src) );
  ins_pipe( ialu_reg_mem );
%}

// Xor Memory with Register
instruct xorI_mem_eReg(memory dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(150);
  format %{ "XOR    $dst,$src" %}
  opcode(0x31);  /* Opcode 31 /r */
  ins_encode( OpcP, RegMem( src, dst ) );
  ins_pipe( ialu_mem_reg );
%}

// Xor Memory with Immediate
instruct xorI_mem_imm(memory dst, immI src, eFlagsReg cr) %{
  match(Set dst (StoreI dst (XorI (LoadI dst) src)));
  effect(KILL cr);

  ins_cost(125);
  format %{ "XOR    $dst,$src" %}
  opcode(0x81,0x6);  /* Opcode 81 /6 id */
  ins_encode( OpcSE( src ), RMopc_Mem(secondary, dst ), Con8or32( src ) );
  ins_pipe( ialu_mem_imm );
%}

//----------Convert Int to Boolean---------------------------------------------

instruct movI_nocopy(rRegI dst, rRegI src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct ci2b( rRegI dst, rRegI src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );

  size(4);
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convI2B( rRegI dst, rRegI src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movI_nocopy(dst,src);
    ci2b(dst,src,cr);
  %}
%}

instruct movP_nocopy(rRegI dst, eRegP src) %{
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src" %}
  ins_encode( enc_Copy( dst, src) );
  ins_pipe( ialu_reg_reg );
%}

instruct cp2b( rRegI dst, eRegP src, eFlagsReg cr ) %{
  effect( USE_DEF dst, USE src, KILL cr );
  format %{ "NEG    $dst\n\t"
            "ADC    $dst,$src" %}
  ins_encode( neg_reg(dst),
              OpcRegReg(0x13,dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convP2B( rRegI dst, eRegP src, eFlagsReg cr ) %{
  match(Set dst (Conv2B src));

  expand %{
    movP_nocopy(dst,src);
    cp2b(dst,src,cr);
  %}
%}

instruct cmpLTMask(eCXRegI dst, ncxRegI p, ncxRegI q, eFlagsReg cr) %{
  match(Set dst (CmpLTMask p q));
  effect(KILL cr);
  ins_cost(400);

  // SETlt can only use low byte of EAX,EBX, ECX, or EDX as destination
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $p,$q\n\t"
            "SETlt  $dst\n\t"
            "NEG    $dst" %}
  ins_encode %{
    Register Rp = $p$$Register;
    Register Rq = $q$$Register;
    Register Rd = $dst$$Register;
    Label done;
    __ xorl(Rd, Rd);
    __ cmpl(Rp, Rq);
    __ setb(Assembler::less, Rd);
    __ negl(Rd);
  %}

  ins_pipe(pipe_slow);
%}

instruct cmpLTMask0(rRegI dst, immI0 zero, eFlagsReg cr) %{
  match(Set dst (CmpLTMask dst zero));
  effect(DEF dst, KILL cr);
  ins_cost(100);

  format %{ "SAR    $dst,31\t# cmpLTMask0" %}
  ins_encode %{
  __ sarl($dst$$Register, 31);
  %}
  ins_pipe(ialu_reg);
%}

/* better to save a register than avoid a branch */
instruct cadd_cmpLTMask(rRegI p, rRegI q, rRegI y, eFlagsReg cr) %{
  match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q)));
  effect(KILL cr);
  ins_cost(400);
  format %{ "SUB    $p,$q\t# cadd_cmpLTMask\n\t"
            "JGE    done\n\t"
            "ADD    $p,$y\n"
            "done:  " %}
  ins_encode %{
    Register Rp = $p$$Register;
    Register Rq = $q$$Register;
    Register Ry = $y$$Register;
    Label done;
    __ subl(Rp, Rq);
    __ jccb(Assembler::greaterEqual, done);
    __ addl(Rp, Ry);
    __ bind(done);
  %}

  ins_pipe(pipe_cmplt);
%}

/* better to save a register than avoid a branch */
instruct and_cmpLTMask(rRegI p, rRegI q, rRegI y, eFlagsReg cr) %{
  match(Set y (AndI (CmpLTMask p q) y));
  effect(KILL cr);

  ins_cost(300);

  format %{ "CMPL     $p, $q\t# and_cmpLTMask\n\t"
            "JLT      done\n\t"
            "XORL     $y, $y\n"
            "done:  " %}
  ins_encode %{
    Register Rp = $p$$Register;
    Register Rq = $q$$Register;
    Register Ry = $y$$Register;
    Label done;
    __ cmpl(Rp, Rq);
    __ jccb(Assembler::less, done);
    __ xorl(Ry, Ry);
    __ bind(done);
  %}

  ins_pipe(pipe_cmplt);
%}

/* If I enable this, I encourage spilling in the inner loop of compress.
instruct cadd_cmpLTMask_mem(ncxRegI p, ncxRegI q, memory y, eCXRegI tmp, eFlagsReg cr) %{
  match(Set p (AddI (AndI (CmpLTMask p q) (LoadI y)) (SubI p q)));
*/

//----------Long Instructions------------------------------------------------
// Add Long Register with Register
instruct addL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x03, 0x13);
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Add Long Register with Immediate
instruct addL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AddL dst src));
  effect(KILL cr);
  format %{ "ADD    $dst.lo,$src.lo\n\t"
            "ADC    $dst.hi,$src.hi" %}
  opcode(0x81,0x00,0x02);  /* Opcode 81 /0, 81 /2 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Add Long Register with Memory
instruct addL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AddL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "ADD    $dst.lo,$mem\n\t"
            "ADC    $dst.hi,$mem+4" %}
  opcode(0x03, 0x13);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Subtract Long Register with Register.
instruct subL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x2B, 0x1B);
  ins_encode( RegReg_Lo(dst, src), RegReg_Hi(dst,src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Subtract Long Register with Immediate
instruct subL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (SubL dst src));
  effect(KILL cr);
  format %{ "SUB    $dst.lo,$src.lo\n\t"
            "SBB    $dst.hi,$src.hi" %}
  opcode(0x81,0x05,0x03);  /* Opcode 81 /5, 81 /3 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Subtract Long Register with Memory
instruct subL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (SubL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "SUB    $dst.lo,$mem\n\t"
            "SBB    $dst.hi,$mem+4" %}
  opcode(0x2B, 0x1B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

instruct negL_eReg(eRegL dst, immL0 zero, eFlagsReg cr) %{
  match(Set dst (SubL zero dst));
  effect(KILL cr);
  ins_cost(300);
  format %{ "NEG    $dst.hi\n\tNEG    $dst.lo\n\tSBB    $dst.hi,0" %}
  ins_encode( neg_long(dst) );
  ins_pipe( ialu_reg_reg_long );
%}

// And Long Register with Register
instruct andL_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x23,0x23);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// And Long Register with Immediate
instruct andL_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (AndL dst src));
  effect(KILL cr);
  format %{ "AND    $dst.lo,$src.lo\n\t"
            "AND    $dst.hi,$src.hi" %}
  opcode(0x81,0x04,0x04);  /* Opcode 81 /4, 81 /4 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// And Long Register with Memory
instruct andL_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (AndL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "AND    $dst.lo,$mem\n\t"
            "AND    $dst.hi,$mem+4" %}
  opcode(0x23, 0x23);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Or Long Register with Register
instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x0B,0x0B);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Or Long Register with Immediate
instruct orl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (OrL dst src));
  effect(KILL cr);
  format %{ "OR     $dst.lo,$src.lo\n\t"
            "OR     $dst.hi,$src.hi" %}
  opcode(0x81,0x01,0x01);  /* Opcode 81 /1, 81 /1 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Or Long Register with Memory
instruct orl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (OrL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "OR     $dst.lo,$mem\n\t"
            "OR     $dst.hi,$mem+4" %}
  opcode(0x0B,0x0B);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Xor Long Register with Register
instruct xorl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x33,0x33);
  ins_encode( RegReg_Lo( dst, src), RegReg_Hi( dst, src) );
  ins_pipe( ialu_reg_reg_long );
%}

// Xor Long Register with Immediate -1
instruct xorl_eReg_im1(eRegL dst, immL_M1 imm) %{
  match(Set dst (XorL dst imm));  
  format %{ "NOT    $dst.lo\n\t"
            "NOT    $dst.hi" %}
  ins_encode %{
     __ notl($dst$$Register);
     __ notl(HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Xor Long Register with Immediate
instruct xorl_eReg_imm(eRegL dst, immL src, eFlagsReg cr) %{
  match(Set dst (XorL dst src));
  effect(KILL cr);
  format %{ "XOR    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$src.hi" %}
  opcode(0x81,0x06,0x06);  /* Opcode 81 /6, 81 /6 */
  ins_encode( Long_OpcSErm_Lo( dst, src ), Long_OpcSErm_Hi( dst, src ) );
  ins_pipe( ialu_reg_long );
%}

// Xor Long Register with Memory
instruct xorl_eReg_mem(eRegL dst, load_long_memory mem, eFlagsReg cr) %{
  match(Set dst (XorL dst (LoadL mem)));
  effect(KILL cr);
  ins_cost(125);
  format %{ "XOR    $dst.lo,$mem\n\t"
            "XOR    $dst.hi,$mem+4" %}
  opcode(0x33,0x33);
  ins_encode( OpcP, RegMem( dst, mem), OpcS, RegMem_Hi(dst,mem) );
  ins_pipe( ialu_reg_long_mem );
%}

// Shift Left Long by 1
instruct shlL_eReg_1(eRegL dst, immI_1 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 2
instruct shlL_eReg_2(eRegL dst, immI_2 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 3
instruct shlL_eReg_3(eRegL dst, immI_3 cnt, eFlagsReg cr) %{
  predicate(UseNewLongLShift);
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(100);
  format %{ "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi\n\t" 
            "ADD    $dst.lo,$dst.lo\n\t"
            "ADC    $dst.hi,$dst.hi" %}
  ins_encode %{
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
    __ addl($dst$$Register,$dst$$Register);
    __ adcl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
  %}
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 1-31
instruct shlL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHLD   $dst.hi,$dst.lo,$cnt\n\t"
            "SHL    $dst.lo,$cnt" %}
  opcode(0xC1, 0x4, 0xA4);  /* 0F/A4, then C1 /4 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by 32-63
instruct shlL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (LShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.hi,$dst.lo\n"
          "\tSHL    $dst.hi,$cnt-32\n"
          "\tXOR    $dst.lo,$dst.lo" %}
  opcode(0xC1, 0x4);  /* C1 /4 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Left Long by variable
instruct salL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (LShiftL dst shift));
  effect(KILL cr);
  ins_cost(500+200);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.hi,$dst.lo\n\t"
            "XOR    $dst.lo,$dst.lo\n"
    "small:\tSHLD   $dst.hi,$dst.lo,$shift\n\t"
            "SHL    $dst.lo,$shift" %}
  ins_encode( shift_left_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}

// Shift Right Long by 1-31
instruct shrL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SHR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x5, 0xAC);  /* 0F/AC, then C1 /5 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right Long by 32-63
instruct shrL_eReg_32_63(eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (URShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSHR    $dst.lo,$cnt-32\n"
          "\tXOR    $dst.hi,$dst.hi" %}
  opcode(0xC1, 0x5);  /* C1 /5 ib */
  ins_encode( move_long_big_shift_clr(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right Long by variable
instruct shrL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (URShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(17);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "XOR    $dst.hi,$dst.hi\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SHR    $dst.hi,$shift" %}
  ins_encode( shift_right_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}

// Shift Right Long by 1-31
instruct sarL_eReg_1_31(eRegL dst, immI_1_31 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(200);
  format %{ "SHRD   $dst.lo,$dst.hi,$cnt\n\t"
            "SAR    $dst.hi,$cnt" %}
  opcode(0xC1, 0x7, 0xAC);  /* 0F/AC, then C1 /7 ib */
  ins_encode( move_long_small_shift(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right Long by 32-63
instruct sarL_eReg_32_63( eRegL dst, immI_32_63 cnt, eFlagsReg cr) %{
  match(Set dst (RShiftL dst cnt));
  effect(KILL cr);
  ins_cost(300);
  format %{ "MOV    $dst.lo,$dst.hi\n"
          "\tSAR    $dst.lo,$cnt-32\n"
          "\tSAR    $dst.hi,31" %}
  opcode(0xC1, 0x7);  /* C1 /7 ib */
  ins_encode( move_long_big_shift_sign(dst,cnt) );
  ins_pipe( ialu_reg_long );
%}

// Shift Right arithmetic Long by variable
instruct sarL_eReg_CL(eRegL dst, eCXRegI shift, eFlagsReg cr) %{
  match(Set dst (RShiftL dst shift));
  effect(KILL cr);
  ins_cost(600);
  size(18);
  format %{ "TEST   $shift,32\n\t"
            "JEQ,s  small\n\t"
            "MOV    $dst.lo,$dst.hi\n\t"
            "SAR    $dst.hi,31\n"
    "small:\tSHRD   $dst.lo,$dst.hi,$shift\n\t"
            "SAR    $dst.hi,$shift" %}
  ins_encode( shift_right_arith_long( dst, shift ) );
  ins_pipe( pipe_slow );
%}


//----------Double Instructions------------------------------------------------
// Double Math

// Compare & branch

// P6 version of float compare, sets condition codes in EFLAGS
instruct cmpDPR_cc_P6(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL rax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );
  ins_pipe( pipe_slow );
%}

instruct cmpDPR_cc_P6CF(eFlagsRegUCF cr, regDPR src1, regDPR src2) %{
  predicate(VM_Version::supports_cmov() && UseSSE <=1);
  match(Set cr (CmpD src1 src2));
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2));
  ins_pipe( pipe_slow );
%}

// Compare & branch
instruct cmpDPR_cc(eFlagsRegU cr, regDPR src1, regDPR src2, eAXRegI rax) %{
  predicate(UseSSE<=1);
  match(Set cr (CmpD src1 src2));
  effect(KILL rax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
instruct cmpDPR_0(rRegI dst, regDPR src1, immDPR0 zero, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 zero));
  effect(KILL cr, KILL rax);
  ins_cost(280);
  format %{ "FTSTD  $dst,$src1" %}
  opcode(0xE4, 0xD9);
  ins_encode( Push_Reg_DPR(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpDPR_reg(rRegI dst, regDPR src1, regDPR src2, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr, KILL rax);
  ins_cost(300);
  format %{ "FCMPD  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpD_cc(eFlagsRegU cr, regD src1, regD src2) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD src1 src2));
  ins_cost(145);
  format %{ "UCOMISD $src1,$src2\n\t"
            "JNP,s   exit\n\t"
            "PUSHF\t# saw NaN, set CF\n\t"
            "AND     [rsp], #0xffffff2b\n\t"
            "POPF\n"
    "exit:" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
    emit_cmpfp_fixup(_masm);
  %}
  ins_pipe( pipe_slow );
%}

instruct cmpD_ccCF(eFlagsRegUCF cr, regD src1, regD src2) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD src1 src2));
  ins_cost(100);
  format %{ "UCOMISD $src1,$src2" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpD_ccmem(eFlagsRegU cr, regD src1, memory src2) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD src1 (LoadD src2)));
  ins_cost(145);
  format %{ "UCOMISD $src1,$src2\n\t"
            "JNP,s   exit\n\t"
            "PUSHF\t# saw NaN, set CF\n\t"
            "AND     [rsp], #0xffffff2b\n\t"
            "POPF\n"
    "exit:" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$Address);
    emit_cmpfp_fixup(_masm);
  %}
  ins_pipe( pipe_slow );
%}

instruct cmpD_ccmemCF(eFlagsRegUCF cr, regD src1, memory src2) %{
  predicate(UseSSE>=2);
  match(Set cr (CmpD src1 (LoadD src2)));
  ins_cost(100);
  format %{ "UCOMISD $src1,$src2" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM
instruct cmpD_reg(xRegI dst, regD src1, regD src2, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (CmpD3 src1 src2));
  effect(KILL cr);
  ins_cost(255);
  format %{ "UCOMISD $src1, $src2\n\t"
            "MOV     $dst, #-1\n\t"
            "JP,s    done\n\t"
            "JB,s    done\n\t"
            "SETNE   $dst\n\t"
            "MOVZB   $dst, $dst\n"
    "done:" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$XMMRegister);
    emit_cmpfp3(_masm, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
instruct cmpD_regmem(xRegI dst, regD src1, memory src2, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (CmpD3 src1 (LoadD src2)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "UCOMISD $src1, $src2\n\t"
            "MOV     $dst, #-1\n\t"
            "JP,s    done\n\t"
            "JB,s    done\n\t"
            "SETNE   $dst\n\t"
            "MOVZB   $dst, $dst\n"
    "done:" %}
  ins_encode %{
    __ ucomisd($src1$$XMMRegister, $src2$$Address);
    emit_cmpfp3(_masm, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}


instruct subDPR_reg(regDPR dst, regDPR src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst src));

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_cost(150);
  ins_encode( Push_Reg_DPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct subDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
  predicate (UseSSE <=1);
  match(Set dst (RoundDouble (SubD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DSUB   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x5);
  ins_encode( Push_Reg_DPR(src2),
              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct subDPR_reg_mem(regDPR dst, memory src) %{
  predicate (UseSSE <=1);
  match(Set dst (SubD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DSUBp  $dst,ST" %}
  opcode(0xDE, 0x5, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

instruct absDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (AbsD src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct negDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate(UseSSE<=1);
  match(Set dst (NegD src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct addDPR_reg(regDPR dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst src));
  format %{ "FLD    $src\n\t"
            "DADD   $dst,ST" %}
  size(4);
  ins_cost(150);
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_DPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}


instruct addDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble (AddD src1 src2)));
  ins_cost(250);

  format %{ "FLD    $src2\n\t"
            "DADD   ST,$src1\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x0); /* D8 C0+i or D8 /0*/
  ins_encode( Push_Reg_DPR(src2),
              OpcP, RegOpc(src1), Pop_Mem_DPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct addDPR_reg_mem(regDPR dst, memory src) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst (LoadD src)));
  ins_cost(150);

  format %{ "FLD    $src\n\t"
            "DADDp  $dst,ST" %}
  opcode(0xDE, 0x0, 0xDD); /* DE C0+i */  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// add-to-memory
instruct addDPR_mem_reg(memory dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (StoreD dst (RoundDouble (AddD (LoadD dst) src))));
  ins_cost(150);

  format %{ "FLD_D  $dst\n\t"
            "DADD   ST,$src\n\t"
            "FST_D  $dst" %}
  opcode(0xDD, 0x0);
  ins_encode( Opcode(0xDD), RMopc_Mem(0x00,dst),
              Opcode(0xD8), RegOpc(src),
              set_instruction_start,
              Opcode(0xDD), RMopc_Mem(0x03,dst) );
  ins_pipe( fpu_reg_mem );
%}

instruct addDPR_reg_imm1(regDPR dst, immDPR1 con) %{
  predicate(UseSSE<=1);
  match(Set dst (AddD dst con));
  ins_cost(125);
  format %{ "FLD1\n\t"
            "DADDp  $dst,ST" %}
  ins_encode %{
    __ fld1();
    __ faddp($dst$$reg);
  %}
  ins_pipe(fpu_reg);
%}

instruct addDPR_reg_imm(regDPR dst, immDPR con) %{
  predicate(UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (AddD dst con));
  ins_cost(200);
  format %{ "FLD_D  [$constantaddress]\t# load from constant table: double=$con\n\t"
            "DADDp  $dst,ST" %}
  ins_encode %{
    __ fld_d($constantaddress($con));
    __ faddp($dst$$reg);
  %}
  ins_pipe(fpu_reg_mem);
%}

instruct addDPR_reg_imm_round(stackSlotD dst, regDPR src, immDPR con) %{
  predicate(UseSSE<=1 && _kids[0]->_kids[1]->_leaf->getd() != 0.0 && _kids[0]->_kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (RoundDouble (AddD src con)));
  ins_cost(200);
  format %{ "FLD_D  [$constantaddress]\t# load from constant table: double=$con\n\t"
            "DADD   ST,$src\n\t"
            "FSTP_D $dst\t# D-round" %}
  ins_encode %{
    __ fld_d($constantaddress($con));
    __ fadd($src$$reg);
    __ fstp_d(Address(rsp, $dst$$disp));
  %}
  ins_pipe(fpu_mem_reg_con);
%}

instruct mulDPR_reg(regDPR dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (MulD dst src));
  format %{ "FLD    $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_cost(150);
  ins_encode( Push_Reg_DPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Strict FP instruction biases argument before multiply then
// biases result to avoid double rounding of subnormals.
//
// scale arg1 by multiplying arg1 by 2^(-15360)
// load arg2
// multiply scaled arg1 by arg2
// rescale product by 2^(15360)
//
instruct strictfp_mulDPR_reg(regDPR1 dst, regnotDPR1 src) %{
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  match(Set dst (MulD dst src));
  ins_cost(1);   // Select this instruction for all strict FP double multiplies

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x1); /* DE C8+i or DE /1*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_DPR(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct mulDPR_reg_imm(regDPR dst, immDPR con) %{
  predicate( UseSSE<=1 && _kids[1]->_leaf->getd() != 0.0 && _kids[1]->_leaf->getd() != 1.0 );
  match(Set dst (MulD dst con));
  ins_cost(200);
  format %{ "FLD_D  [$constantaddress]\t# load from constant table: double=$con\n\t"
            "DMULp  $dst,ST" %}
  ins_encode %{
    __ fld_d($constantaddress($con));
    __ fmulp($dst$$reg);
  %}
  ins_pipe(fpu_reg_mem);
%}


instruct mulDPR_reg_mem(regDPR dst, memory src) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD dst (LoadD src)));
  ins_cost(200);
  format %{ "FLD_D  $src\n\t"
            "DMULp  $dst,ST" %}
  opcode(0xDE, 0x1, 0xDD); /* DE C8+i or DE /1*/  /* LoadD  DD /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

//
// Cisc-alternate to reg-reg multiply
instruct mulDPR_reg_mem_cisc(regDPR dst, regDPR src, memory mem) %{
  predicate( UseSSE<=1 );
  match(Set dst (MulD src (LoadD mem)));
  ins_cost(250);
  format %{ "FLD_D  $mem\n\t"
            "DMUL   ST,$src\n\t"
            "FSTP_D $dst" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadD D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem),
              OpcReg_FPR(src),
              Pop_Reg_DPR(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}


// MACRO3 -- addDPR a mulDPR
// This instruction is a '2-address' instruction in that the result goes
// back to src2.  This eliminates a move from the macro; possibly the
// register allocator will have to add it back (and maybe not).
instruct addDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (AddD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DADDp  $src2,ST" %}
  ins_cost(250);
  opcode(0xDD); /* LoadD DD /0 */
  ins_encode( Push_Reg_FPR(src0),
              FMul_ST_reg(src1),
              FAddP_reg_ST(src2) );
  ins_pipe( fpu_reg_reg_reg );
%}


// MACRO3 -- subDPR a mulDPR
instruct subDPR_mulDPR_reg(regDPR src2, regDPR src1, regDPR src0) %{
  predicate( UseSSE<=1 );
  match(Set src2 (SubD (MulD src0 src1) src2));
  format %{ "FLD    $src0\t# ===MACRO3d===\n\t"
            "DMUL   ST,$src1\n\t"
            "DSUBRp $src2,ST" %}
  ins_cost(250);
  ins_encode( Push_Reg_FPR(src0),
              FMul_ST_reg(src1),
              Opcode(0xDE), Opc_plus(0xE0,src2));
  ins_pipe( fpu_reg_reg_reg );
%}


instruct divDPR_reg(regDPR dst, regDPR src) %{
  predicate( UseSSE<=1 );
  match(Set dst (DivD dst src));

  format %{ "FLD    $src\n\t"
            "FDIVp  $dst,ST" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_cost(150);
  ins_encode( Push_Reg_DPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Strict FP instruction biases argument before division then
// biases result, to avoid double rounding of subnormals.
//
// scale dividend by multiplying dividend by 2^(-15360)
// load divisor
// divide scaled dividend by divisor
// rescale quotient by 2^(15360)
//
instruct strictfp_divDPR_reg(regDPR1 dst, regnotDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (DivD dst src));
  predicate( UseSSE<=1 && Compile::current()->has_method() && Compile::current()->method()->is_strict() );
  ins_cost(01);

  format %{ "FLD    StubRoutines::_fpu_subnormal_bias1\n\t"
            "DMULp  $dst,ST\n\t"
            "FLD    $src\n\t"
            "FDIVp  $dst,ST\n\t"
            "FLD    StubRoutines::_fpu_subnormal_bias2\n\t"
            "DMULp  $dst,ST\n\t" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( strictfp_bias1(dst),
              Push_Reg_DPR(src),
              OpcP, RegOpc(dst),
              strictfp_bias2(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct divDPR_reg_round(stackSlotD dst, regDPR src1, regDPR src2) %{
  predicate( UseSSE<=1 && !(Compile::current()->has_method() && Compile::current()->method()->is_strict()) );
  match(Set dst (RoundDouble (DivD src1 src2)));

  format %{ "FLD    $src1\n\t"
            "FDIV   ST,$src2\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xD8, 0x6); /* D8 F0+i or D8 /6 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2), Pop_Mem_DPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}


instruct modDPR_reg(regDPR dst, regDPR src, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE<=1);
  match(Set dst (ModD dst src));
  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS

  format %{ "DMOD   $dst,$src" %}
  ins_cost(250);
  ins_encode(Push_Reg_Mod_DPR(dst, src),
              emitModDPR(),
              Push_Result_Mod_DPR(src),
              Pop_Reg_DPR(dst));
  ins_pipe( pipe_slow );
%}

instruct modD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE>=2);
  match(Set dst (ModD src0 src1));
  effect(KILL rax, KILL cr);

  format %{ "SUB    ESP,8\t # DMOD\n"
          "\tMOVSD  [ESP+0],$src1\n"
          "\tFLD_D  [ESP+0]\n"
          "\tMOVSD  [ESP+0],$src0\n"
          "\tFLD_D  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_D [ESP+0]\n"
          "\tMOVSD  $dst,[ESP+0]\n"
          "\tADD    ESP,8\n"
          "\tFSTP   ST0\t # Restore FPU Stack"
    %}
  ins_cost(250);
  ins_encode( Push_ModD_encoding(src0, src1), emitModDPR(), Push_ResultD(dst), PopFPU);
  ins_pipe( pipe_slow );
%}

instruct sinDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (SinD src));
  ins_cost(1800);
  format %{ "DSIN   $dst" %}
  opcode(0xD9, 0xFE);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

instruct sinD_reg(regD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (SinD dst));
  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DSIN   $dst" %}
  opcode(0xD9, 0xFE);
  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
  ins_pipe( pipe_slow );
%}

instruct cosDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst (CosD src));
  ins_cost(1800);
  format %{ "DCOS   $dst" %}
  opcode(0xD9, 0xFF);
  ins_encode( OpcP, OpcS );
  ins_pipe( pipe_slow );
%}

instruct cosD_reg(regD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (CosD dst));
  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
  ins_cost(1800);
  format %{ "DCOS   $dst" %}
  opcode(0xD9, 0xFF);
  ins_encode( Push_SrcD(dst), OpcP, OpcS, Push_ResultD(dst) );
  ins_pipe( pipe_slow );
%}

instruct tanDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  match(Set dst(TanD src));
  format %{ "DTAN   $dst" %}
  ins_encode( Opcode(0xD9), Opcode(0xF2),    // fptan
              Opcode(0xDD), Opcode(0xD8));   // fstp st
  ins_pipe( pipe_slow );
%}

instruct tanD_reg(regD dst, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst(TanD dst));
  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
  format %{ "DTAN   $dst" %}
  ins_encode( Push_SrcD(dst),
              Opcode(0xD9), Opcode(0xF2),    // fptan
              Opcode(0xDD), Opcode(0xD8),   // fstp st
              Push_ResultD(dst) );
  ins_pipe( pipe_slow );
%}

instruct atanDPR_reg(regDPR dst, regDPR src) %{
  predicate (UseSSE<=1);
  match(Set dst(AtanD dst src));
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_Reg_DPR(src),
              OpcP, OpcS, RegOpc(dst) );
  ins_pipe( pipe_slow );
%}

instruct atanD_reg(regD dst, regD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst(AtanD dst src));
  effect(KILL cr); // Push_{Src|Result}D() uses "{SUB|ADD} ESP,8"
  format %{ "DATA   $dst,$src" %}
  opcode(0xD9, 0xF3);
  ins_encode( Push_SrcD(src),
              OpcP, OpcS, Push_ResultD(dst) );
  ins_pipe( pipe_slow );
%}

instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
  predicate (UseSSE<=1);
  match(Set dst (SqrtD src));
  format %{ "DSQRT  $dst,$src" %}
  opcode(0xFA, 0xD9);
  ins_encode( Push_Reg_DPR(src),
              OpcS, OpcP, Pop_Reg_DPR(dst) );
  ins_pipe( pipe_slow );
%}

instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
  predicate (UseSSE<=1);
  match(Set Y (PowD X Y));  // Raise X to the Yth power
  effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
  format %{ "fast_pow $X $Y -> $Y  // KILL $rax, $rcx, $rdx" %}
  ins_encode %{
    __ subptr(rsp, 8);
    __ fld_s($X$$reg - 1);
    __ fast_pow();
    __ addptr(rsp, 8);
  %}
  ins_pipe( pipe_slow );
%}

instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
  effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
  format %{ "fast_pow $src0 $src1 -> $dst  // KILL $rax, $rcx, $rdx" %}
  ins_encode %{
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ fast_pow();
    __ fstp_d(Address(rsp, 0));
    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
    __ addptr(rsp, 8);
  %}
  ins_pipe( pipe_slow );
%}


instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
  predicate (UseSSE<=1);
  match(Set dpr1 (ExpD dpr1));
  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
  format %{ "fast_exp $dpr1 -> $dpr1  // KILL $rax, $rcx, $rdx" %}
  ins_encode %{
    __ fast_exp();
  %}
  ins_pipe( pipe_slow );
%}

instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (ExpD src));
  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
  ins_encode %{
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ fast_exp();
    __ fstp_d(Address(rsp, 0));
    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
    __ addptr(rsp, 8);
  %}
  ins_pipe( pipe_slow );
%}

instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  // The source Double operand on FPU stack
  match(Set dst (Log10D src));
  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  // fxch         ; swap ST(0) with ST(1)
  // fyl2x        ; compute log_10(2) * log_2(x)
  format %{ "FLDLG2 \t\t\t#Log10\n\t"
            "FXCH   \n\t"
            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
              Opcode(0xD9), Opcode(0xC9),   // fxch
              Opcode(0xD9), Opcode(0xF1));  // fyl2x

  ins_pipe( pipe_slow );
%}

instruct log10D_reg(regD dst, regD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  effect(KILL cr);
  match(Set dst (Log10D src));
  // fldlg2       ; push log_10(2) on the FPU stack; full 80-bit number
  // fyl2x        ; compute log_10(2) * log_2(x)
  format %{ "FLDLG2 \t\t\t#Log10\n\t"
            "FYL2X  \t\t\t# Q=Log10*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xEC),   // fldlg2
              Push_SrcD(src),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              Push_ResultD(dst));

  ins_pipe( pipe_slow );
%}

instruct logDPR_reg(regDPR1 dst, regDPR1 src) %{
  predicate (UseSSE<=1);
  // The source Double operand on FPU stack
  match(Set dst (LogD src));
  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  // fxch         ; swap ST(0) with ST(1)
  // fyl2x        ; compute log_e(2) * log_2(x)
  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
            "FXCH   \n\t"
            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
              Opcode(0xD9), Opcode(0xC9),   // fxch
              Opcode(0xD9), Opcode(0xF1));  // fyl2x

  ins_pipe( pipe_slow );
%}

instruct logD_reg(regD dst, regD src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  effect(KILL cr);
  // The source and result Double operands in XMM registers
  match(Set dst (LogD src));
  // fldln2       ; push log_e(2) on the FPU stack; full 80-bit number
  // fyl2x        ; compute log_e(2) * log_2(x)
  format %{ "FLDLN2 \t\t\t#Log_e\n\t"
            "FYL2X  \t\t\t# Q=Log_e*Log_2(x)"
         %}
  ins_encode( Opcode(0xD9), Opcode(0xED),   // fldln2
              Push_SrcD(src),
              Opcode(0xD9), Opcode(0xF1),   // fyl2x
              Push_ResultD(dst));
  ins_pipe( pipe_slow );
%}

//-------------Float Instructions-------------------------------
// Float Math

// Code for float compare:
//     fcompp();
//     fwait(); fnstsw_ax();
//     sahf();
//     movl(dst, unordered_result);
//     jcc(Assembler::parity, exit);
//     movl(dst, less_result);
//     jcc(Assembler::below, exit);
//     movl(dst, equal_result);
//     jcc(Assembler::equal, exit);
//     movl(dst, greater_result);
//   exit:

// P6 version of float compare, sets condition codes in EFLAGS
instruct cmpFPR_cc_P6(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL rax);
  ins_cost(150);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction\n\t"
            "JNP    exit\n\t"
            "MOV    ah,1       // saw a NaN, set CF (treat as LT)\n\t"
            "SAHF\n"
     "exit:\tNOP               // avoid branch to branch" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              cmpF_P6_fixup );
  ins_pipe( pipe_slow );
%}

instruct cmpFPR_cc_P6CF(eFlagsRegUCF cr, regFPR src1, regFPR src2) %{
  predicate(VM_Version::supports_cmov() && UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  ins_cost(100);
  format %{ "FLD    $src1\n\t"
            "FUCOMIP ST,$src2  // P6 instruction" %}
  opcode(0xDF, 0x05); /* DF E8+i or DF /5 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2));
  ins_pipe( pipe_slow );
%}


// Compare & branch
instruct cmpFPR_cc(eFlagsRegU cr, regFPR src1, regFPR src2, eAXRegI rax) %{
  predicate(UseSSE == 0);
  match(Set cr (CmpF src1 src2));
  effect(KILL rax);
  ins_cost(200);
  format %{ "FLD    $src1\n\t"
            "FCOMp  $src2\n\t"
            "FNSTSW AX\n\t"
            "TEST   AX,0x400\n\t"
            "JZ,s   flags\n\t"
            "MOV    AH,1\t# unordered treat as LT\n"
    "flags:\tSAHF" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              fpu_flags);
  ins_pipe( pipe_slow );
%}

// Compare vs zero into -1,0,1
instruct cmpFPR_0(rRegI dst, regFPR src1, immFPR0 zero, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 zero));
  effect(KILL cr, KILL rax);
  ins_cost(280);
  format %{ "FTSTF  $dst,$src1" %}
  opcode(0xE4, 0xD9);
  ins_encode( Push_Reg_DPR(src1),
              OpcS, OpcP, PopFPU,
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1
instruct cmpFPR_reg(rRegI dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE == 0);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr, KILL rax);
  ins_cost(300);
  format %{ "FCMPF  $dst,$src1,$src2" %}
  opcode(0xD8, 0x3); /* D8 D8+i or D8 /3 */
  ins_encode( Push_Reg_DPR(src1),
              OpcP, RegOpc(src2),
              CmpF_Result(dst));
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpF_cc(eFlagsRegU cr, regF src1, regF src2) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF src1 src2));
  ins_cost(145);
  format %{ "UCOMISS $src1,$src2\n\t"
            "JNP,s   exit\n\t"
            "PUSHF\t# saw NaN, set CF\n\t"
            "AND     [rsp], #0xffffff2b\n\t"
            "POPF\n"
    "exit:" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
    emit_cmpfp_fixup(_masm);
  %}
  ins_pipe( pipe_slow );
%}

instruct cmpF_ccCF(eFlagsRegUCF cr, regF src1, regF src2) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF src1 src2));
  ins_cost(100);
  format %{ "UCOMISS $src1,$src2" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// float compare and set condition codes in EFLAGS by XMM regs
instruct cmpF_ccmem(eFlagsRegU cr, regF src1, memory src2) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF src1 (LoadF src2)));
  ins_cost(165);
  format %{ "UCOMISS $src1,$src2\n\t"
            "JNP,s   exit\n\t"
            "PUSHF\t# saw NaN, set CF\n\t"
            "AND     [rsp], #0xffffff2b\n\t"
            "POPF\n"
    "exit:" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$Address);
    emit_cmpfp_fixup(_masm);
  %}
  ins_pipe( pipe_slow );
%}

instruct cmpF_ccmemCF(eFlagsRegUCF cr, regF src1, memory src2) %{
  predicate(UseSSE>=1);
  match(Set cr (CmpF src1 (LoadF src2)));
  ins_cost(100);
  format %{ "UCOMISS $src1,$src2" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$Address);
  %}
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM
instruct cmpF_reg(xRegI dst, regF src1, regF src2, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 src2));
  effect(KILL cr);
  ins_cost(255);
  format %{ "UCOMISS $src1, $src2\n\t"
            "MOV     $dst, #-1\n\t"
            "JP,s    done\n\t"
            "JB,s    done\n\t"
            "SETNE   $dst\n\t"
            "MOVZB   $dst, $dst\n"
    "done:" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$XMMRegister);
    emit_cmpfp3(_masm, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Compare into -1,0,1 in XMM and memory
instruct cmpF_regmem(xRegI dst, regF src1, memory src2, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (CmpF3 src1 (LoadF src2)));
  effect(KILL cr);
  ins_cost(275);
  format %{ "UCOMISS $src1, $src2\n\t"
            "MOV     $dst, #-1\n\t"
            "JP,s    done\n\t"
            "JB,s    done\n\t"
            "SETNE   $dst\n\t"
            "MOVZB   $dst, $dst\n"
    "done:" %}
  ins_encode %{
    __ ucomiss($src1$$XMMRegister, $src2$$Address);
    emit_cmpfp3(_masm, $dst$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Spill to obtain 24-bit precision
instruct subFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (SubF src1 src2));

  format %{ "FSUB   $dst,$src1 - $src2" %}
  opcode(0xD8, 0x4); /* D8 E0+i or D8 /4 mod==0x3 ;; result in TOS */
  ins_encode( Push_Reg_FPR(src1),
              OpcReg_FPR(src2),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct subFPR_reg(regFPR dst, regFPR src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (SubF dst src));

  format %{ "FSUB   $dst,$src" %}
  opcode(0xDE, 0x5); /* DE E8+i  or DE /5 */
  ins_encode( Push_Reg_FPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

// Spill to obtain 24-bit precision
instruct addFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0); /* D8 C0+i */
  ins_encode( Push_Reg_FPR(src2),
              OpcReg_FPR(src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct addFPR_reg(regFPR dst, regFPR src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst src));

  format %{ "FLD    $src\n\t"
            "FADDp  $dst,ST" %}
  opcode(0xDE, 0x0); /* DE C0+i or DE /0*/
  ins_encode( Push_Reg_FPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}

instruct absFPR_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (AbsF src));
  ins_cost(100);
  format %{ "FABS" %}
  opcode(0xE1, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

instruct negFPR_reg(regFPR1 dst, regFPR1 src) %{
  predicate(UseSSE==0);
  match(Set dst (NegF src));
  ins_cost(100);
  format %{ "FCHS" %}
  opcode(0xE0, 0xD9);
  ins_encode( OpcS, OpcP );
  ins_pipe( fpu_reg_reg );
%}

// Cisc-alternate to addFPR_reg
// Spill to obtain 24-bit precision
instruct addFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FLD    $src2\n\t"
            "FADD   ST,$src1\n\t"
            "FSTP_S $dst" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_FPR(src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
//
// Cisc-alternate to addFPR_reg
// This instruction does not round to 24-bits
instruct addFPR_reg_mem(regFPR dst, memory src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF dst (LoadF src)));

  format %{ "FADD   $dst,$src" %}
  opcode(0xDE, 0x0, 0xD9); /* DE C0+i or DE /0*/  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_mem );
%}

// // Following two instructions for _222_mpegaudio
// Spill to obtain 24-bit precision
instruct addFPR24_mem_reg(stackSlotF dst, regFPR src2, memory src1 ) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src1),
              OpcReg_FPR(src2),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}

// Cisc-spill variant
// Spill to obtain 24-bit precision
instruct addFPR24_mem_cisc(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 (LoadF src2)));

  format %{ "FADD   $dst,$src1,$src2 cisc" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 C0+i */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
instruct addFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src1 src2));

  format %{ "FADD   $dst,$src1,$src2" %}
  opcode(0xD8, 0x0, 0xD9); /* D8 /0 */  /* LoadF  D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}


// Spill to obtain 24-bit precision
instruct addFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src con));
  format %{ "FLD    $src\n\t"
            "FADD_S [$constantaddress]\t# load from constant table: float=$con\n\t"
            "FSTP_S $dst"  %}
  ins_encode %{
    __ fld_s($src$$reg - 1);  // FLD ST(i-1)
    __ fadd_s($constantaddress($con));
    __ fstp_s(Address(rsp, $dst$$disp));
  %}
  ins_pipe(fpu_mem_reg_con);
%}
//
// This instruction does not round to 24-bits
instruct addFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF src con));
  format %{ "FLD    $src\n\t"
            "FADD_S [$constantaddress]\t# load from constant table: float=$con\n\t"
            "FSTP   $dst"  %}
  ins_encode %{
    __ fld_s($src$$reg - 1);  // FLD ST(i-1)
    __ fadd_s($constantaddress($con));
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_reg_con);
%}

// Spill to obtain 24-bit precision
instruct mulFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i or D8 /1 ;; result in TOS */
  ins_encode( Push_Reg_FPR(src1),
              OpcReg_FPR(src2),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct mulFPR_reg(regFPR dst, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FLD    $src1\n\t"
            "FMUL   $src2\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1); /* D8 C8+i */
  ins_encode( Push_Reg_FPR(src2),
              OpcReg_FPR(src1),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_reg_reg );
%}


// Spill to obtain 24-bit precision
// Cisc-alternate to reg-reg multiply
instruct mulFPR24_reg_mem(stackSlotF dst, regFPR src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FLD_S  $src2\n\t"
            "FMUL   $src1\n\t"
            "FSTP_S $dst"  %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or DE /1*/  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_FPR(src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_mem );
%}
//
// This instruction does not round to 24-bits
// Cisc-alternate to reg-reg multiply
instruct mulFPR_reg_mem(regFPR dst, regFPR src1, memory src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 (LoadF src2)));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              OpcReg_FPR(src1),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}

// Spill to obtain 24-bit precision
instruct mulFPR24_mem_mem(stackSlotF dst, memory src1, memory src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src1 src2));

  format %{ "FMUL   $dst,$src1,$src2" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,src2),
              set_instruction_start,
              OpcP, RMopc_Mem(secondary,src1),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_mem_mem );
%}

// Spill to obtain 24-bit precision
instruct mulFPR24_reg_imm(stackSlotF dst, regFPR src, immFPR con) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src con));

  format %{ "FLD    $src\n\t"
            "FMUL_S [$constantaddress]\t# load from constant table: float=$con\n\t"
            "FSTP_S $dst"  %}
  ins_encode %{
    __ fld_s($src$$reg - 1);  // FLD ST(i-1)
    __ fmul_s($constantaddress($con));
    __ fstp_s(Address(rsp, $dst$$disp));
  %}
  ins_pipe(fpu_mem_reg_con);
%}
//
// This instruction does not round to 24-bits
instruct mulFPR_reg_imm(regFPR dst, regFPR src, immFPR con) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF src con));

  format %{ "FLD    $src\n\t"
            "FMUL_S [$constantaddress]\t# load from constant table: float=$con\n\t"
            "FSTP   $dst"  %}
  ins_encode %{
    __ fld_s($src$$reg - 1);  // FLD ST(i-1)
    __ fmul_s($constantaddress($con));
    __ fstp_d($dst$$reg);
  %}
  ins_pipe(fpu_reg_reg_con);
%}


//
// MACRO1 -- subsume unshared load into mulFPR
// This instruction does not round to 24-bits
instruct mulFPR_reg_load1(regFPR dst, regFPR src, memory mem1 ) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (MulF (LoadF mem1) src));

  format %{ "FLD    $mem1    ===MACRO1===\n\t"
            "FMUL   ST,$src\n\t"
            "FSTP   $dst" %}
  opcode(0xD8, 0x1, 0xD9); /* D8 C8+i or D8 /1 */  /* LoadF D9 /0 */
  ins_encode( Opcode(tertiary), RMopc_Mem(0x00,mem1),
              OpcReg_FPR(src),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_reg_mem );
%}
//
// MACRO2 -- addFPR a mulFPR which subsumed an unshared load
// This instruction does not round to 24-bits
instruct addFPR_mulFPR_reg_load1(regFPR dst, memory mem1, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (AddF (MulF (LoadF mem1) src1) src2));
  ins_cost(95);

  format %{ "FLD    $mem1     ===MACRO2===\n\t"
            "FMUL   ST,$src1  subsume mulFPR left load\n\t"
            "FADD   ST,$src2\n\t"
            "FSTP   $dst" %}
  opcode(0xD9); /* LoadF D9 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem1),
              FMul_ST_reg(src1),
              FAdd_ST_reg(src2),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_mem_reg_reg );
%}

// MACRO3 -- addFPR a mulFPR
// This instruction does not round to 24-bits.  It is a '2-address'
// instruction in that the result goes back to src2.  This eliminates
// a move from the macro; possibly the register allocator will have
// to add it back (and maybe not).
instruct addFPR_mulFPR_reg(regFPR src2, regFPR src1, regFPR src0) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set src2 (AddF (MulF src0 src1) src2));

  format %{ "FLD    $src0     ===MACRO3===\n\t"
            "FMUL   ST,$src1\n\t"
            "FADDP  $src2,ST" %}
  opcode(0xD9); /* LoadF D9 /0 */
  ins_encode( Push_Reg_FPR(src0),
              FMul_ST_reg(src1),
              FAddP_reg_ST(src2) );
  ins_pipe( fpu_reg_reg_reg );
%}

// MACRO4 -- divFPR subFPR
// This instruction does not round to 24-bits
instruct subFPR_divFPR_reg(regFPR dst, regFPR src1, regFPR src2, regFPR src3) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (DivF (SubF src2 src1) src3));

  format %{ "FLD    $src2   ===MACRO4===\n\t"
            "FSUB   ST,$src1\n\t"
            "FDIV   ST,$src3\n\t"
            "FSTP  $dst" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( Push_Reg_FPR(src2),
              subFPR_divFPR_encode(src1,src3),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_reg_reg_reg );
%}

// Spill to obtain 24-bit precision
instruct divFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2) %{
  predicate(UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (DivF src1 src2));

  format %{ "FDIV   $dst,$src1,$src2" %}
  opcode(0xD8, 0x6); /* D8 F0+i or DE /6*/
  ins_encode( Push_Reg_FPR(src1),
              OpcReg_FPR(src2),
              Pop_Mem_FPR(dst) );
  ins_pipe( fpu_mem_reg_reg );
%}
//
// This instruction does not round to 24-bits
instruct divFPR_reg(regFPR dst, regFPR src) %{
  predicate(UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (DivF dst src));

  format %{ "FDIV   $dst,$src" %}
  opcode(0xDE, 0x7); /* DE F8+i or DE /7*/
  ins_encode( Push_Reg_FPR(src),
              OpcP, RegOpc(dst) );
  ins_pipe( fpu_reg_reg );
%}


// Spill to obtain 24-bit precision
instruct modFPR24_reg(stackSlotF dst, regFPR src1, regFPR src2, eAXRegI rax, eFlagsReg cr) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ModF src1 src2));
  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS

  format %{ "FMOD   $dst,$src1,$src2" %}
  ins_encode( Push_Reg_Mod_DPR(src1, src2),
              emitModDPR(),
              Push_Result_Mod_DPR(src2),
              Pop_Mem_FPR(dst));
  ins_pipe( pipe_slow );
%}
//
// This instruction does not round to 24-bits
instruct modFPR_reg(regFPR dst, regFPR src, eAXRegI rax, eFlagsReg cr) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ModF dst src));
  effect(KILL rax, KILL cr); // emitModDPR() uses EAX and EFLAGS

  format %{ "FMOD   $dst,$src" %}
  ins_encode(Push_Reg_Mod_DPR(dst, src),
              emitModDPR(),
              Push_Result_Mod_DPR(src),
              Pop_Reg_FPR(dst));
  ins_pipe( pipe_slow );
%}

instruct modF_reg(regF dst, regF src0, regF src1, eAXRegI rax, eFlagsReg cr) %{
  predicate(UseSSE>=1);
  match(Set dst (ModF src0 src1));
  effect(KILL rax, KILL cr);
  format %{ "SUB    ESP,4\t # FMOD\n"
          "\tMOVSS  [ESP+0],$src1\n"
          "\tFLD_S  [ESP+0]\n"
          "\tMOVSS  [ESP+0],$src0\n"
          "\tFLD_S  [ESP+0]\n"
     "loop:\tFPREM\n"
          "\tFWAIT\n"
          "\tFNSTSW AX\n"
          "\tSAHF\n"
          "\tJP     loop\n"
          "\tFSTP_S [ESP+0]\n"
          "\tMOVSS  $dst,[ESP+0]\n"
          "\tADD    ESP,4\n"
          "\tFSTP   ST0\t # Restore FPU Stack"
    %}
  ins_cost(250);
  ins_encode( Push_ModF_encoding(src0, src1), emitModDPR(), Push_ResultF(dst,0x4), PopFPU);
  ins_pipe( pipe_slow );
%}


//----------Arithmetic Conversion Instructions---------------------------------
// The conversions operations are all Alpha sorted.  Please keep it that way!

instruct roundFloat_mem_reg(stackSlotF dst, regFPR src) %{
  predicate(UseSSE==0);
  match(Set dst (RoundFloat src));
  ins_cost(125);
  format %{ "FST_S  $dst,$src\t# F-round" %}
  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

instruct roundDouble_mem_reg(stackSlotD dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (RoundDouble src));
  ins_cost(125);
  format %{ "FST_D  $dst,$src\t# D-round" %}
  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

// Force rounding to 24-bit precision and 6-bit exponent
instruct convDPR2FPR_reg(stackSlotF dst, regDPR src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvD2F src));
  format %{ "FST_S  $dst,$src\t# F-round" %}
  expand %{
    roundFloat_mem_reg(dst,src);
  %}
%}

// Force rounding to 24-bit precision and 6-bit exponent
instruct convDPR2F_reg(regF dst, regDPR src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvD2F src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "FST_S  [ESP],$src\t# F-round\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD ESP,4" %}
  ins_encode %{
    __ subptr(rsp, 4);
    if ($src$$reg != FPR1L_enc) {
      __ fld_s($src$$reg-1);
      __ fstp_s(Address(rsp, 0));
    } else {
      __ fst_s(Address(rsp, 0));
    }
    __ movflt($dst$$XMMRegister, Address(rsp, 0));
    __ addptr(rsp, 4);
  %}
  ins_pipe( pipe_slow );
%}

// Force rounding double precision to single precision
instruct convD2F_reg(regF dst, regD src) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvD2F src));
  format %{ "CVTSD2SS $dst,$src\t# F-round" %}
  ins_encode %{
    __ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct convFPR2DPR_reg_reg(regDPR dst, regFPR src) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2D src));
  format %{ "FST_S  $dst,$src\t# D-round" %}
  ins_encode( Pop_Reg_Reg_DPR(dst, src));
  ins_pipe( fpu_reg_reg );
%}

instruct convFPR2D_reg(stackSlotD dst, regFPR src) %{
  predicate(UseSSE==1);
  match(Set dst (ConvF2D src));
  format %{ "FST_D  $dst,$src\t# D-round" %}
  expand %{
    roundDouble_mem_reg(dst,src);
  %}
%}

instruct convF2DPR_reg(regDPR dst, regF src, eFlagsReg cr) %{
  predicate(UseSSE==1);
  match(Set dst (ConvF2D src));
  effect( KILL cr );
  format %{ "SUB    ESP,4\n\t"
            "MOVSS  [ESP] $src\n\t"
            "FLD_S  [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "FSTP   $dst\t# D-round" %}
  ins_encode %{
    __ subptr(rsp, 4);
    __ movflt(Address(rsp, 0), $src$$XMMRegister);
    __ fld_s(Address(rsp, 0));
    __ addptr(rsp, 4);
    __ fstp_d($dst$$reg);
  %}
  ins_pipe( pipe_slow );
%}

instruct convF2D_reg(regD dst, regF src) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvF2D src));
  format %{ "CVTSS2SD $dst,$src\t# D-round" %}
  ins_encode %{
    __ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
instruct convDPR2I_reg_reg( eAXRegI dst, eDXRegI tmp, regDPR src, eFlagsReg cr ) %{
  predicate(UseSSE<=1);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert double to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "FLD_D  $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_DPR(src), DPR2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  If the double is a NAN, stuff a zero in instead.
instruct convD2I_reg_reg( eAXRegI dst, eDXRegI tmp, regD src, eFlagsReg cr ) %{
  predicate(UseSSE>=2);
  match(Set dst (ConvD2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSD2SI $dst, $src\n\t"
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 8\n\t"
            "MOVSD  [ESP], $src\n\t"
            "FLD_D  [ESP]\n\t"
            "ADD    ESP, 8\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  ins_encode %{
    Label fast;
    __ cvttsd2sil($dst$$Register, $src$$XMMRegister);
    __ cmpl($dst$$Register, 0x80000000);
    __ jccb(Assembler::notEqual, fast);
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ addptr(rsp, 8);
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
    __ bind(fast);
  %}
  ins_pipe( pipe_slow );
%}

instruct convDPR2L_reg_reg( eADXRegL dst, regDPR src, eFlagsReg cr ) %{
  predicate(UseSSE<=1);
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert double to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode( Push_Reg_DPR(src),  DPR2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
instruct convD2L_reg_reg( eADXRegL dst, regD src, eFlagsReg cr ) %{
  predicate (UseSSE>=2);
  match(Set dst (ConvD2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert double to long\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,8\n\t"
            "MOVSD  [ESP],$src\n\t"
            "FLD_D  [ESP]\n\t"
            "ADD    ESP,8\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode %{
    Label fast;
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
    __ fistp_d(Address(rsp, 0));
    // Restore the rounding mode, mask the exception
    if (Compile::current()->in_24_bit_fp_mode()) {
      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
    } else {
      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
    }
    // Load the converted long, adjust CPU stack
    __ pop(rax);
    __ pop(rdx);
    __ cmpl(rdx, 0x80000000);
    __ jccb(Assembler::notEqual, fast);
    __ testl(rax, rax);
    __ jccb(Assembler::notEqual, fast);
    __ subptr(rsp, 8);
    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
    __ fld_d(Address(rsp, 0));
    __ addptr(rsp, 8);
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
    __ bind(fast);
  %}
  ins_pipe( pipe_slow );
%}

// Convert a double to an int.  Java semantics require we do complex
// manglations in the corner cases.  So we set the rounding mode to
// 'zero', store the darned double down as an int, and reset the
// rounding mode to 'nearest'.  The hardware stores a flag value down
// if we would overflow or converted a NAN; we check for this and
// and go the slow path if needed.
instruct convFPR2I_reg_reg(eAXRegI dst, eDXRegI tmp, regFPR src, eFlagsReg cr ) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "FLD    $src\t# Convert float to int \n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,4\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "CMP    EAX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  // DPR2I_encoding works for FPR2I
  ins_encode( Push_Reg_FPR(src), DPR2I_encoding(src) );
  ins_pipe( pipe_slow );
%}

// Convert a float in xmm to an int reg.
instruct convF2I_reg(eAXRegI dst, eDXRegI tmp, regF src, eFlagsReg cr ) %{
  predicate(UseSSE>=1);
  match(Set dst (ConvF2I src));
  effect( KILL tmp, KILL cr );
  format %{ "CVTTSS2SI $dst, $src\n\t"
            "CMP    $dst,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP, 4\n\t"
            "MOVSS  [ESP], $src\n\t"
            "FLD    [ESP]\n\t"
            "ADD    ESP, 4\n\t"
            "CALL   d2i_wrapper\n"
      "fast:" %}
  ins_encode %{
    Label fast;
    __ cvttss2sil($dst$$Register, $src$$XMMRegister);
    __ cmpl($dst$$Register, 0x80000000);
    __ jccb(Assembler::notEqual, fast);
    __ subptr(rsp, 4);
    __ movflt(Address(rsp, 0), $src$$XMMRegister);
    __ fld_s(Address(rsp, 0));
    __ addptr(rsp, 4);
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2i_wrapper())));
    __ bind(fast);
  %}
  ins_pipe( pipe_slow );
%}

instruct convFPR2L_reg_reg( eADXRegL dst, regFPR src, eFlagsReg cr ) %{
  predicate(UseSSE==0);
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "FLD    $src\t# Convert float to long\n\t"
            "FLDCW  trunc mode\n\t"
            "SUB    ESP,8\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "FLD    $src\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  // DPR2L_encoding works for FPR2L
  ins_encode( Push_Reg_FPR(src), DPR2L_encoding(src) );
  ins_pipe( pipe_slow );
%}

// XMM lacks a float/double->long conversion, so use the old FPU stack.
instruct convF2L_reg_reg( eADXRegL dst, regF src, eFlagsReg cr ) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvF2L src));
  effect( KILL cr );
  format %{ "SUB    ESP,8\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "FLDCW  trunc mode\n\t"
            "FISTp  [ESP + #0]\n\t"
            "FLDCW  std/24-bit mode\n\t"
            "POP    EAX\n\t"
            "POP    EDX\n\t"
            "CMP    EDX,0x80000000\n\t"
            "JNE,s  fast\n\t"
            "TEST   EAX,EAX\n\t"
            "JNE,s  fast\n\t"
            "SUB    ESP,4\t# Convert float to long\n\t"
            "MOVSS  [ESP],$src\n\t"
            "FLD_S  [ESP]\n\t"
            "ADD    ESP,4\n\t"
            "CALL   d2l_wrapper\n"
      "fast:" %}
  ins_encode %{
    Label fast;
    __ subptr(rsp, 8);
    __ movflt(Address(rsp, 0), $src$$XMMRegister);
    __ fld_s(Address(rsp, 0));
    __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_trunc()));
    __ fistp_d(Address(rsp, 0));
    // Restore the rounding mode, mask the exception
    if (Compile::current()->in_24_bit_fp_mode()) {
      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
    } else {
      __ fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_std()));
    }
    // Load the converted long, adjust CPU stack
    __ pop(rax);
    __ pop(rdx);
    __ cmpl(rdx, 0x80000000);
    __ jccb(Assembler::notEqual, fast);
    __ testl(rax, rax);
    __ jccb(Assembler::notEqual, fast);
    __ subptr(rsp, 4);
    __ movflt(Address(rsp, 0), $src$$XMMRegister);
    __ fld_s(Address(rsp, 0));
    __ addptr(rsp, 4);
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::d2l_wrapper())));
    __ bind(fast);
  %}
  ins_pipe( pipe_slow );
%}

instruct convI2DPR_reg(regDPR dst, stackSlotI src) %{
  predicate( UseSSE<=1 );
  match(Set dst (ConvI2D src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_DPR(dst));
  ins_pipe( fpu_reg_mem );
%}

instruct convI2D_reg(regD dst, rRegI src) %{
  predicate( UseSSE>=2 && !UseXmmI2D );
  match(Set dst (ConvI2D src));
  format %{ "CVTSI2SD $dst,$src" %}
  ins_encode %{
    __ cvtsi2sdl ($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct convI2D_mem(regD dst, memory mem) %{
  predicate( UseSSE>=2 );
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "CVTSI2SD $dst,$mem" %}
  ins_encode %{
    __ cvtsi2sdl ($dst$$XMMRegister, $mem$$Address);
  %}
  ins_pipe( pipe_slow );
%}

instruct convXI2D_reg(regD dst, rRegI src)
%{
  predicate( UseSSE>=2 && UseXmmI2D );
  match(Set dst (ConvI2D src));

  format %{ "MOVD  $dst,$src\n\t"
            "CVTDQ2PD $dst,$dst\t# i2d" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ cvtdq2pd($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow); // XXX
%}

instruct convI2DPR_mem(regDPR dst, memory mem) %{
  predicate( UseSSE<=1 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2D (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_DPR(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert a byte to a float; no rounding step needed.
instruct conv24I2FPR_reg(regFPR dst, stackSlotI src) %{
  predicate( UseSSE==0 && n->in(1)->Opcode() == Op_AndI && n->in(1)->in(2)->is_Con() && n->in(1)->in(2)->get_int() == 255 );
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}

  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode(Push_Mem_I(src), Pop_Reg_FPR(dst));
  ins_pipe( fpu_reg_mem );
%}

// In 24-bit mode, force exponent rounding by storing back out
instruct convI2FPR_SSF(stackSlotF dst, stackSlotI src) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  ins_cost(200);
  format %{ "FILD   $src\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Mem_FPR(dst));
  ins_pipe( fpu_mem_mem );
%}

// In 24-bit mode, force exponent rounding by storing back out
instruct convI2FPR_SSF_mem(stackSlotF dst, memory mem) %{
  predicate( UseSSE==0 && Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  ins_cost(200);
  format %{ "FILD   $mem\n\t"
            "FSTP_S $dst" %}
  opcode(0xDB);  /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Mem_FPR(dst));
  ins_pipe( fpu_mem_mem );
%}

// This instruction does not round to 24-bits
instruct convI2FPR_reg(regFPR dst, stackSlotI src) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F src));
  format %{ "FILD   $src\n\t"
            "FSTP   $dst" %}
  opcode(0xDB, 0x0);  /* DB /0 */
  ins_encode( Push_Mem_I(src),
              Pop_Reg_FPR(dst));
  ins_pipe( fpu_reg_mem );
%}

// This instruction does not round to 24-bits
instruct convI2FPR_mem(regFPR dst, memory mem) %{
  predicate( UseSSE==0 && !Compile::current()->select_24_bit_instr());
  match(Set dst (ConvI2F (LoadI mem)));
  format %{ "FILD   $mem\n\t"
            "FSTP   $dst" %}
  opcode(0xDB);      /* DB /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,mem),
              Pop_Reg_FPR(dst));
  ins_pipe( fpu_reg_mem );
%}

// Convert an int to a float in xmm; no rounding step needed.
instruct convI2F_reg(regF dst, rRegI src) %{
  predicate( UseSSE==1 || UseSSE>=2 && !UseXmmI2F );
  match(Set dst (ConvI2F src));
  format %{ "CVTSI2SS $dst, $src" %}
  ins_encode %{
    __ cvtsi2ssl ($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

 instruct convXI2F_reg(regF dst, rRegI src)
%{
  predicate( UseSSE>=2 && UseXmmI2F );
  match(Set dst (ConvI2F src));

  format %{ "MOVD  $dst,$src\n\t"
            "CVTDQ2PS $dst,$dst\t# i2f" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ cvtdq2ps($dst$$XMMRegister, $dst$$XMMRegister);
  %}
  ins_pipe(pipe_slow); // XXX
%}

instruct convI2L_reg( eRegL dst, rRegI src, eFlagsReg cr) %{
  match(Set dst (ConvI2L src));
  effect(KILL cr);
  ins_cost(375);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src\n\t"
            "SAR    $dst.hi,31" %}
  ins_encode(convert_int_long(dst,src));
  ins_pipe( ialu_reg_reg_long );
%}

// Zero-extend convert int to long
instruct convI2L_reg_zex(eRegL dst, rRegI src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL (ConvI2L src) mask) );
  effect( KILL flags );
  ins_cost(250);
  format %{ "MOV    $dst.lo,$src\n\t"
            "XOR    $dst.hi,$dst.hi" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}

// Zero-extend long
instruct zerox_long(eRegL dst, eRegL src, immL_32bits mask, eFlagsReg flags ) %{
  match(Set dst (AndL src mask) );
  effect( KILL flags );
  ins_cost(250);
  format %{ "MOV    $dst.lo,$src.lo\n\t"
            "XOR    $dst.hi,$dst.hi\n\t" %}
  opcode(0x33); // XOR
  ins_encode(enc_Copy(dst,src), OpcP, RegReg_Hi2(dst,dst) );
  ins_pipe( ialu_reg_reg_long );
%}

instruct convL2DPR_reg( stackSlotD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE<=1);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_D $dst\t# D-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_DPR(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2D_reg( regD dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE>=2);
  match(Set dst (ConvL2D src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to double\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_D [ESP]\n\t"
            "MOVSD  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultD(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2F_reg( regF dst, eRegL src, eFlagsReg cr) %{
  predicate (UseSSE>=1);
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
            "PUSH   $src.lo\n\t"
            "FILD_D [ESP]\n\t"
            "FSTP_S [ESP]\n\t"
            "MOVSS  $dst,[ESP]\n\t"
            "ADD    ESP,8" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double2(src), Push_ResultF(dst,0x8));
  ins_pipe( pipe_slow );
%}

instruct convL2FPR_reg( stackSlotF dst, eRegL src, eFlagsReg cr) %{
  match(Set dst (ConvL2F src));
  effect( KILL cr );
  format %{ "PUSH   $src.hi\t# Convert long to single float\n\t"
            "PUSH   $src.lo\n\t"
            "FILD   ST,[ESP + #0]\n\t"
            "ADD    ESP,8\n\t"
            "FSTP_S $dst\t# F-round" %}
  opcode(0xDF, 0x5);  /* DF /5 */
  ins_encode(convert_long_double(src), Pop_Mem_FPR(dst));
  ins_pipe( pipe_slow );
%}

instruct convL2I_reg( rRegI dst, eRegL src ) %{
  match(Set dst (ConvL2I src));
  effect( DEF dst, USE src );
  format %{ "MOV    $dst,$src.lo" %}
  ins_encode(enc_CopyL_Lo(dst,src));
  ins_pipe( ialu_reg_reg );
%}


instruct MoveF2I_stack_reg(rRegI dst, stackSlotF src) %{
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );
  ins_cost(100);
  format %{ "MOV    $dst,$src\t# MoveF2I_stack_reg" %}
  ins_encode %{
    __ movl($dst$$Register, Address(rsp, $src$$disp));
  %}
  ins_pipe( ialu_reg_mem );
%}

instruct MoveFPR2I_reg_stack(stackSlotI dst, regFPR src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(125);
  format %{ "FST_S  $dst,$src\t# MoveF2I_reg_stack" %}
  ins_encode( Pop_Mem_Reg_FPR(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

instruct MoveF2I_reg_stack_sse(stackSlotI dst, regF src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );

  ins_cost(95);
  format %{ "MOVSS  $dst,$src\t# MoveF2I_reg_stack_sse" %}
  ins_encode %{
    __ movflt(Address(rsp, $dst$$disp), $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveF2I_reg_reg_sse(rRegI dst, regF src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveF2I src));
  effect( DEF dst, USE src );
  ins_cost(85);
  format %{ "MOVD   $dst,$src\t# MoveF2I_reg_reg_sse" %}
  ins_encode %{
    __ movdl($dst$$Register, $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveI2F_reg_stack(stackSlotF dst, rRegI src) %{
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(100);
  format %{ "MOV    $dst,$src\t# MoveI2F_reg_stack" %}
  ins_encode %{
    __ movl(Address(rsp, $dst$$disp), $src$$Register);
  %}
  ins_pipe( ialu_mem_reg );
%}


instruct MoveI2FPR_stack_reg(regFPR dst, stackSlotI src) %{
  predicate(UseSSE==0);
  match(Set dst (MoveI2F src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FLD_S  $src\n\t"
            "FSTP   $dst\t# MoveI2F_stack_reg" %}
  opcode(0xD9);               /* D9 /0, FLD m32real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_FPR(dst) );
  ins_pipe( fpu_reg_mem );
%}

instruct MoveI2F_stack_reg_sse(regF dst, stackSlotI src) %{
  predicate(UseSSE>=1);
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(95);
  format %{ "MOVSS  $dst,$src\t# MoveI2F_stack_reg_sse" %}
  ins_encode %{
    __ movflt($dst$$XMMRegister, Address(rsp, $src$$disp));
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveI2F_reg_reg_sse(regF dst, rRegI src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveI2F src));
  effect( DEF dst, USE src );

  ins_cost(85);
  format %{ "MOVD   $dst,$src\t# MoveI2F_reg_reg_sse" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveD2L_stack_reg(eRegL dst, stackSlotD src) %{
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(250);
  format %{ "MOV    $dst.lo,$src\n\t"
            "MOV    $dst.hi,$src+4\t# MoveD2L_stack_reg" %}
  opcode(0x8B, 0x8B);
  ins_encode( OpcP, RegMem(dst,src), OpcS, RegMem_Hi(dst,src));
  ins_pipe( ialu_mem_long_reg );
%}

instruct MoveDPR2L_reg_stack(stackSlotL dst, regDPR src) %{
  predicate(UseSSE<=1);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);

  ins_cost(125);
  format %{ "FST_D  $dst,$src\t# MoveD2L_reg_stack" %}
  ins_encode( Pop_Mem_Reg_DPR(dst, src) );
  ins_pipe( fpu_mem_reg );
%}

instruct MoveD2L_reg_stack_sse(stackSlotL dst, regD src) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src);
  ins_cost(95);
  format %{ "MOVSD  $dst,$src\t# MoveD2L_reg_stack_sse" %}
  ins_encode %{
    __ movdbl(Address(rsp, $dst$$disp), $src$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveD2L_reg_reg_sse(eRegL dst, regD src, regD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveD2L src));
  effect(DEF dst, USE src, TEMP tmp);
  ins_cost(85);
  format %{ "MOVD   $dst.lo,$src\n\t"
            "PSHUFLW $tmp,$src,0x4E\n\t"
            "MOVD   $dst.hi,$tmp\t# MoveD2L_reg_reg_sse" %}
  ins_encode %{
    __ movdl($dst$$Register, $src$$XMMRegister);
    __ pshuflw($tmp$$XMMRegister, $src$$XMMRegister, 0x4e);
    __ movdl(HIGH_FROM_LOW($dst$$Register), $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveL2D_reg_stack(stackSlotD dst, eRegL src) %{
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(200);
  format %{ "MOV    $dst,$src.lo\n\t"
            "MOV    $dst+4,$src.hi\t# MoveL2D_reg_stack" %}
  opcode(0x89, 0x89);
  ins_encode( OpcP, RegMem( src, dst ), OpcS, RegMem_Hi( src, dst ) );
  ins_pipe( ialu_mem_long_reg );
%}


instruct MoveL2DPR_stack_reg(regDPR dst, stackSlotL src) %{
  predicate(UseSSE<=1);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);
  ins_cost(125);

  format %{ "FLD_D  $src\n\t"
            "FSTP   $dst\t# MoveL2D_stack_reg" %}
  opcode(0xDD);               /* DD /0, FLD m64real */
  ins_encode( OpcP, RMopc_Mem_no_oop(0x00,src),
              Pop_Reg_DPR(dst) );
  ins_pipe( fpu_reg_mem );
%}


instruct MoveL2D_stack_reg_sse(regD dst, stackSlotL src) %{
  predicate(UseSSE>=2 && UseXmmLoadAndClearUpper);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(95);
  format %{ "MOVSD  $dst,$src\t# MoveL2D_stack_reg_sse" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveL2D_stack_reg_sse_partial(regD dst, stackSlotL src) %{
  predicate(UseSSE>=2 && !UseXmmLoadAndClearUpper);
  match(Set dst (MoveL2D src));
  effect(DEF dst, USE src);

  ins_cost(95);
  format %{ "MOVLPD $dst,$src\t# MoveL2D_stack_reg_sse" %}
  ins_encode %{
    __ movdbl($dst$$XMMRegister, Address(rsp, $src$$disp));
  %}
  ins_pipe( pipe_slow );
%}

instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{
  predicate(UseSSE>=2);
  match(Set dst (MoveL2D src));
  effect(TEMP dst, USE src, TEMP tmp);
  ins_cost(85);
  format %{ "MOVD   $dst,$src.lo\n\t"
            "MOVD   $tmp,$src.hi\n\t"
            "PUNPCKLDQ $dst,$tmp\t# MoveL2D_reg_reg_sse" %}
  ins_encode %{
    __ movdl($dst$$XMMRegister, $src$$Register);
    __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
    __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}


// =======================================================================
// fast clearing of an array
instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
  predicate(!UseFastStosb);
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
            "SHL    ECX,1\t# Convert doublewords to words\n\t"
            "REP STOS\t# store EAX into [EDI++] while ECX--" %}
  ins_encode %{ 
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
  predicate(UseFastStosb);
  match(Set dummy (ClearArray cnt base));
  effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
  format %{ "XOR    EAX,EAX\t# ClearArray:\n\t"
            "SHL    ECX,3\t# Convert doublewords to bytes\n\t"
            "REP STOSB\t# store EAX into [EDI++] while ECX--" %}
  ins_encode %{ 
    __ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
  %}
  ins_pipe( pipe_slow );
%}

instruct string_compare(eDIRegP str1, eCXRegI cnt1, eSIRegP str2, eDXRegI cnt2,
                        eAXRegI result, regD tmp1, eFlagsReg cr) %{
  match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP tmp1, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL cr);

  format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result   // KILL $tmp1" %}
  ins_encode %{
    __ string_compare($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register, $result$$Register,
                      $tmp1$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// fast string equals
instruct string_equals(eDIRegP str1, eSIRegP str2, eCXRegI cnt, eAXRegI result,
                       regD tmp1, regD tmp2, eBXRegI tmp3, eFlagsReg cr) %{
  match(Set result (StrEquals (Binary str1 str2) cnt));
  effect(TEMP tmp1, TEMP tmp2, USE_KILL str1, USE_KILL str2, USE_KILL cnt, KILL tmp3, KILL cr);

  format %{ "String Equals $str1,$str2,$cnt -> $result    // KILL $tmp1, $tmp2, $tmp3" %}
  ins_encode %{
    __ char_arrays_equals(false, $str1$$Register, $str2$$Register,
                          $cnt$$Register, $result$$Register, $tmp3$$Register,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// fast search of substring with known size.
instruct string_indexof_con(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, immI int_cnt2,
                            eBXRegI result, regD vec, eAXRegI cnt2, eCXRegI tmp, eFlagsReg cr) %{
  predicate(UseSSE42Intrinsics);
  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2)));
  effect(TEMP vec, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, KILL cnt2, KILL tmp, KILL cr);

  format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result   // KILL $vec, $cnt1, $cnt2, $tmp" %}
  ins_encode %{
    int icnt2 = (int)$int_cnt2$$constant;
    if (icnt2 >= 8) {
      // IndexOf for constant substrings with size >= 8 elements
      // which don't need to be loaded through stack.
      __ string_indexofC8($str1$$Register, $str2$$Register,
                          $cnt1$$Register, $cnt2$$Register,
                          icnt2, $result$$Register,
                          $vec$$XMMRegister, $tmp$$Register);
    } else {
      // Small strings are loaded through stack if they cross page boundary.
      __ string_indexof($str1$$Register, $str2$$Register,
                        $cnt1$$Register, $cnt2$$Register,
                        icnt2, $result$$Register,
                        $vec$$XMMRegister, $tmp$$Register);
    }
  %}
  ins_pipe( pipe_slow );
%}

instruct string_indexof(eDIRegP str1, eDXRegI cnt1, eSIRegP str2, eAXRegI cnt2,
                        eBXRegI result, regD vec, eCXRegI tmp, eFlagsReg cr) %{
  predicate(UseSSE42Intrinsics);
  match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
  effect(TEMP vec, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, KILL tmp, KILL cr);

  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result   // KILL all" %}
  ins_encode %{
    __ string_indexof($str1$$Register, $str2$$Register,
                      $cnt1$$Register, $cnt2$$Register,
                      (-1), $result$$Register,
                      $vec$$XMMRegister, $tmp$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// fast array equals
instruct array_equals(eDIRegP ary1, eSIRegP ary2, eAXRegI result,
                      regD tmp1, regD tmp2, eCXRegI tmp3, eBXRegI tmp4, eFlagsReg cr)
%{
  match(Set result (AryEq ary1 ary2));
  effect(TEMP tmp1, TEMP tmp2, USE_KILL ary1, USE_KILL ary2, KILL tmp3, KILL tmp4, KILL cr);
  //ins_cost(300);

  format %{ "Array Equals $ary1,$ary2 -> $result   // KILL $tmp1, $tmp2, $tmp3, $tmp4" %}
  ins_encode %{
    __ char_arrays_equals(true, $ary1$$Register, $ary2$$Register,
                          $tmp3$$Register, $result$$Register, $tmp4$$Register,
                          $tmp1$$XMMRegister, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// encode char[] to byte[] in ISO_8859_1
instruct encode_iso_array(eSIRegP src, eDIRegP dst, eDXRegI len,
                          regD tmp1, regD tmp2, regD tmp3, regD tmp4,
                          eCXRegI tmp5, eAXRegI result, eFlagsReg cr) %{
  match(Set result (EncodeISOArray src (Binary dst len)));
  effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL tmp5, KILL cr);

  format %{ "Encode array $src,$dst,$len -> $result    // KILL ECX, EDX, $tmp1, $tmp2, $tmp3, $tmp4, ESI, EDI " %}
  ins_encode %{
    __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register,
                        $tmp1$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister,
                        $tmp4$$XMMRegister, $tmp5$$Register, $result$$Register);
  %}
  ins_pipe( pipe_slow );
%}


//----------Control Flow Instructions------------------------------------------
// Signed compare Instructions
instruct compI_eReg(eFlagsReg cr, rRegI op1, rRegI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1, USE op2 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compI_eReg_imm(eFlagsReg cr, rRegI op1, immI op2) %{
  match(Set cr (CmpI op1 op2));
  effect( DEF cr, USE op1 );
  format %{ "CMP    $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  // ins_encode( RegImm( op1, op2) );  /* Was CmpImm */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of cmpI_eReg
instruct compI_eReg_mem(eFlagsReg cr, rRegI op1, memory op2) %{
  match(Set cr (CmpI op1 (LoadI op2)));

  format %{ "CMP    $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

instruct testI_reg( eFlagsReg cr, rRegI src, immI0 zero ) %{
  match(Set cr (CmpI src zero));
  effect( DEF cr, USE src );

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_imm( eFlagsReg cr, rRegI src, immI con, immI0 zero ) %{
  match(Set cr (CmpI (AndI src con) zero));

  format %{ "TEST   $src,$con" %}
  opcode(0xF7,0x00);
  ins_encode( OpcP, RegOpc(src), Con32(con) );
  ins_pipe( ialu_cr_reg_imm );
%}

instruct testI_reg_mem( eFlagsReg cr, rRegI src, memory mem, immI0 zero ) %{
  match(Set cr (CmpI (AndI src mem) zero));

  format %{ "TEST   $src,$mem" %}
  opcode(0x85);
  ins_encode( OpcP, RegMem( src, mem ) );
  ins_pipe( ialu_cr_reg_mem );
%}

// Unsigned compare Instructions; really, same as signed except they
// produce an eFlagsRegU instead of eFlagsReg.
instruct compU_eReg(eFlagsRegU cr, rRegI op1, rRegI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compU_eReg_imm(eFlagsRegU cr, rRegI op1, immI op2) %{
  match(Set cr (CmpU op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpU_eReg
instruct compU_eReg_mem(eFlagsRegU cr, rRegI op1, memory op2) %{
  match(Set cr (CmpU op1 (LoadI op2)));

  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpU_eReg
//instruct compU_mem_eReg(eFlagsRegU cr, memory op1, rRegI op2) %{
//  match(Set cr (CmpU (LoadI op1) op2));
//
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

instruct testU_reg( eFlagsRegU cr, rRegI src, immI0 zero ) %{
  match(Set cr (CmpU src zero));

  format %{ "TESTu  $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Unsigned pointer compare Instructions
instruct compP_eReg(eFlagsRegU cr, eRegP op1, eRegP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegReg( op1, op2) );
  ins_pipe( ialu_cr_reg_reg );
%}

instruct compP_eReg_imm(eFlagsRegU cr, eRegP op1, immP op2) %{
  match(Set cr (CmpP op1 op2));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x81,0x07);  /* Opcode 81 /7 */
  ins_encode( OpcSErm( op1, op2 ), Con8or32( op2 ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// // Cisc-spilled version of cmpP_eReg
instruct compP_eReg_mem(eFlagsRegU cr, eRegP op1, memory op2) %{
  match(Set cr (CmpP op1 (LoadP op2)));

  format %{ "CMPu   $op1,$op2" %}
  ins_cost(500);
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

// // Cisc-spilled version of cmpP_eReg
//instruct compP_mem_eReg(eFlagsRegU cr, memory op1, eRegP op2) %{
//  match(Set cr (CmpP (LoadP op1) op2));
//
//  format %{ "CMPu   $op1,$op2" %}
//  ins_cost(500);
//  opcode(0x39);  /* Opcode 39 /r */
//  ins_encode( OpcP, RegMem( op1, op2) );
//%}

// Compare raw pointer (used in out-of-heap check).
// Only works because non-oop pointers must be raw pointers
// and raw pointers have no anti-dependencies.
instruct compP_mem_eReg( eFlagsRegU cr, eRegP op1, memory op2 ) %{
  predicate( n->in(2)->in(2)->bottom_type()->reloc() == relocInfo::none );
  match(Set cr (CmpP op1 (LoadP op2)));

  format %{ "CMPu   $op1,$op2" %}
  opcode(0x3B);  /* Opcode 3B /r */
  ins_encode( OpcP, RegMem( op1, op2) );
  ins_pipe( ialu_cr_reg_mem );
%}

//
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_reg( eFlagsReg cr, eRegP src, immP0 zero ) %{
  match(Set cr (CmpP src zero));

  format %{ "TEST   $src,$src" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg( src, src ) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Cisc-spilled version of testP_reg
// This will generate a signed flags result. This should be ok
// since any compare to a zero should be eq/neq.
instruct testP_Reg_mem( eFlagsReg cr, memory op, immI0 zero ) %{
  match(Set cr (CmpP (LoadP op) zero));

  format %{ "TEST   $op,0xFFFFFFFF" %}
  ins_cost(500);
  opcode(0xF7);               /* Opcode F7 /0 */
  ins_encode( OpcP, RMopc_Mem(0x00,op), Con_d32(0xFFFFFFFF) );
  ins_pipe( ialu_cr_reg_imm );
%}

// Yanked all unsigned pointer compare operations.
// Pointer compares are done with CmpP which is already unsigned.

//----------Max and Min--------------------------------------------------------
// Min Instructions
////
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for min
//instruct cmovI_reg_lt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVlt $op2,$op1\t! min" %}
//  opcode(0x4C,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
//// Min Register with Register (P6 version)
//instruct minI_eReg_p6( rRegI op1, rRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MinI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_lt(op2,op1,cr);
//  %}
//%}

// Min Register with Register (generic version)
instruct minI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
  match(Set dst (MinI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MIN    $dst,$src" %}
  opcode(0xCC);
  ins_encode( min_enc(dst,src) );
  ins_pipe( pipe_slow );
%}

// Max Register with Register
//   *** Min and Max using the conditional move are slower than the
//   *** branch version on a Pentium III.
// // Conditional move for max
//instruct cmovI_reg_gt( rRegI op2, rRegI op1, eFlagsReg cr ) %{
//  effect( USE_DEF op2, USE op1, USE cr );
//  format %{ "CMOVgt $op2,$op1\t! max" %}
//  opcode(0x4F,0x0F);
//  ins_encode( OpcS, OpcP, RegReg( op2, op1 ) );
//  ins_pipe( pipe_cmov_reg );
//%}
//
// // Max Register with Register (P6 version)
//instruct maxI_eReg_p6( rRegI op1, rRegI op2 ) %{
//  predicate(VM_Version::supports_cmov() );
//  match(Set op2 (MaxI op1 op2));
//  ins_cost(200);
//  expand %{
//    eFlagsReg cr;
//    compI_eReg(cr,op1,op2);
//    cmovI_reg_gt(op2,op1,cr);
//  %}
//%}

// Max Register with Register (generic version)
instruct maxI_eReg(rRegI dst, rRegI src, eFlagsReg flags) %{
  match(Set dst (MaxI dst src));
  effect(KILL flags);
  ins_cost(300);

  format %{ "MAX    $dst,$src" %}
  opcode(0xCC);
  ins_encode( max_enc(dst,src) );
  ins_pipe( pipe_slow );
%}

// ============================================================================
// Counted Loop limit node which represents exact final iterator value.
// Note: the resulting value should fit into integer range since
// counted loops have limit check on overflow.
instruct loopLimit_eReg(eAXRegI limit, nadxRegI init, immI stride, eDXRegI limit_hi, nadxRegI tmp, eFlagsReg flags) %{
  match(Set limit (LoopLimit (Binary init limit) stride));
  effect(TEMP limit_hi, TEMP tmp, KILL flags);
  ins_cost(300);

  format %{ "loopLimit $init,$limit,$stride  # $limit = $init + $stride *( $limit - $init + $stride -1)/ $stride, kills $limit_hi" %}
  ins_encode %{
    int strd = (int)$stride$$constant;
    assert(strd != 1 && strd != -1, "sanity");
    int m1 = (strd > 0) ? 1 : -1;
    // Convert limit to long (EAX:EDX)
    __ cdql();
    // Convert init to long (init:tmp)
    __ movl($tmp$$Register, $init$$Register);
    __ sarl($tmp$$Register, 31);
    // $limit - $init
    __ subl($limit$$Register, $init$$Register);
    __ sbbl($limit_hi$$Register, $tmp$$Register);
    // + ($stride - 1)
    if (strd > 0) {
      __ addl($limit$$Register, (strd - 1));
      __ adcl($limit_hi$$Register, 0);
      __ movl($tmp$$Register, strd);
    } else {
      __ addl($limit$$Register, (strd + 1));
      __ adcl($limit_hi$$Register, -1);
      __ lneg($limit_hi$$Register, $limit$$Register);
      __ movl($tmp$$Register, -strd);
    }
    // signed devision: (EAX:EDX) / pos_stride
    __ idivl($tmp$$Register);
    if (strd < 0) {
      // restore sign
      __ negl($tmp$$Register);
    }
    // (EAX) * stride
    __ mull($tmp$$Register);
    // + init (ignore upper bits)
    __ addl($limit$$Register, $init$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// ============================================================================
// Branch Instructions
// Jump Table
instruct jumpXtnd(rRegI switch_val) %{
  match(Jump switch_val);
  ins_cost(350);
  format %{  "JMP    [$constantaddress](,$switch_val,1)\n\t" %}
  ins_encode %{
    // Jump to Address(table_base + switch_reg)
    Address index(noreg, $switch_val$$Register, Address::times_1);
    __ jump(ArrayAddress($constantaddress, index));
  %}
  ins_pipe(pipe_jmp);
%}

// Jump Direct - Label defines a relative address from JMP+1
instruct jmpDir(label labl) %{
  match(Goto);
  effect(USE labl);

  ins_cost(300);
  format %{ "JMP    $labl" %}
  size(5);
  ins_encode %{
    Label* L = $labl$$label;
    __ jmp(*L, false); // Always long jump
  %}
  ins_pipe( pipe_jmp );
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpCon(cmpOp cop, eFlagsReg cr, label labl) %{
  match(If cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop    $labl" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe( pipe_jcc );
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEnd(cmpOp cop, eFlagsReg cr, label labl) %{
  match(CountedLoopEnd cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop    $labl\t# Loop end" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe( pipe_jcc );
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEndU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,u  $labl\t# Loop end" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe( pipe_jcc );
%}

instruct jmpLoopEndUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(200);
  format %{ "J$cop,u  $labl\t# Loop end" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe( pipe_jcc );
%}

// Jump Direct Conditional - using unsigned comparison
instruct jmpConU(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,u  $labl" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe(pipe_jcc);
%}

instruct jmpConUCF(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(200);
  format %{ "J$cop,u  $labl" %}
  size(6);
  ins_encode %{
    Label* L = $labl$$label;
    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
  %}
  ins_pipe(pipe_jcc);
%}

instruct jmpConUCF2(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(200);
  format %{ $$template
    if ($cop$$cmpcode == Assembler::notEqual) {
      $$emit$$"JP,u   $labl\n\t"
      $$emit$$"J$cop,u   $labl"
    } else {
      $$emit$$"JP,u   done\n\t"
      $$emit$$"J$cop,u   $labl\n\t"
      $$emit$$"done:"
    }
  %}
  ins_encode %{
    Label* l = $labl$$label;
    if ($cop$$cmpcode == Assembler::notEqual) {
      __ jcc(Assembler::parity, *l, false);
      __ jcc(Assembler::notEqual, *l, false);
    } else if ($cop$$cmpcode == Assembler::equal) {
      Label done;
      __ jccb(Assembler::parity, done);
      __ jcc(Assembler::equal, *l, false);
      __ bind(done);
    } else {
       ShouldNotReachHere();
    }
  %}
  ins_pipe(pipe_jcc);
%}

// ============================================================================
// The 2nd slow-half of a subtype check.  Scan the subklass's 2ndary superklass
// array for an instance of the superklass.  Set a hidden internal cache on a
// hit (cache is checked with exposed code in gen_subtype_check()).  Return
// NZ for a miss or zero for a hit.  The encoding ALSO sets flags.
instruct partialSubtypeCheck( eDIRegP result, eSIRegP sub, eAXRegP super, eCXRegI rcx, eFlagsReg cr ) %{
  match(Set result (PartialSubtypeCheck sub super));
  effect( KILL rcx, KILL cr );

  ins_cost(1100);  // slightly larger than the next version
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+ArrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,ArrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: EDI not-zero\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache\n\t"
            "XOR    $result,$result\t\t Hit: EDI zero\n\t"
     "miss:\t" %}

  opcode(0x1); // Force a XOR of EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}

instruct partialSubtypeCheck_vs_Zero( eFlagsReg cr, eSIRegP sub, eAXRegP super, eCXRegI rcx, eDIRegP result, immP0 zero ) %{
  match(Set cr (CmpP (PartialSubtypeCheck sub super) zero));
  effect( KILL rcx, KILL result );

  ins_cost(1000);
  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
            "MOV    ECX,[EDI+ArrayKlass::length]\t# length to scan\n\t"
            "ADD    EDI,ArrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
            "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
            "JNE,s  miss\t\t# Missed: flags NZ\n\t"
            "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache, flags Z\n\t"
     "miss:\t" %}

  opcode(0x0);  // No need to XOR EDI
  ins_encode( enc_PartialSubtypeCheck() );
  ins_pipe( pipe_slow );
%}

// ============================================================================
// Branch Instructions -- short offset versions
//
// These instructions are used to replace jumps of a long offset (the default
// match) with jumps of a shorter offset.  These instructions are all tagged
// with the ins_short_branch attribute, which causes the ADLC to suppress the
// match rules in general matching.  Instead, the ADLC generates a conversion
// method in the MachNode which can be used to do in-place replacement of the
// long variant with the shorter variant.  The compiler will determine if a
// branch can be taken by the is_short_branch_offset() predicate in the machine
// specific code section of the file.

// Jump Direct - Label defines a relative address from JMP+1
instruct jmpDir_short(label labl) %{
  match(Goto);
  effect(USE labl);

  ins_cost(300);
  format %{ "JMP,s  $labl" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jmpb(*L);
  %}
  ins_pipe( pipe_jmp );
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpCon_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(If cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEnd_short(cmpOp cop, eFlagsReg cr, label labl) %{
  match(CountedLoopEnd cop cr);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,s  $labl\t# Loop end" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

// Jump Direct Conditional - Label defines a relative address from Jcc+1
instruct jmpLoopEndU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl\t# Loop end" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

instruct jmpLoopEndUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(CountedLoopEnd cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl\t# Loop end" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

// Jump Direct Conditional - using unsigned comparison
instruct jmpConU_short(cmpOpU cop, eFlagsRegU cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

instruct jmpConUCF_short(cmpOpUCF cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ "J$cop,us $labl" %}
  size(2);
  ins_encode %{
    Label* L = $labl$$label;
    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
  %}
  ins_pipe( pipe_jcc );
  ins_short_branch(1);
%}

instruct jmpConUCF2_short(cmpOpUCF2 cop, eFlagsRegUCF cmp, label labl) %{
  match(If cop cmp);
  effect(USE labl);

  ins_cost(300);
  format %{ $$template
    if ($cop$$cmpcode == Assembler::notEqual) {
      $$emit$$"JP,u,s   $labl\n\t"
      $$emit$$"J$cop,u,s   $labl"
    } else {
      $$emit$$"JP,u,s   done\n\t"
      $$emit$$"J$cop,u,s  $labl\n\t"
      $$emit$$"done:"
    }
  %}
  size(4);
  ins_encode %{
    Label* l = $labl$$label;
    if ($cop$$cmpcode == Assembler::notEqual) {
      __ jccb(Assembler::parity, *l);
      __ jccb(Assembler::notEqual, *l);
    } else if ($cop$$cmpcode == Assembler::equal) {
      Label done;
      __ jccb(Assembler::parity, done);
      __ jccb(Assembler::equal, *l);
      __ bind(done);
    } else {
       ShouldNotReachHere();
    }
  %}
  ins_pipe(pipe_jcc);
  ins_short_branch(1);
%}

// ============================================================================
// Long Compare
//
// Currently we hold longs in 2 registers.  Comparing such values efficiently
// is tricky.  The flavor of compare used depends on whether we are testing
// for LT, LE, or EQ.  For a simple LT test we can check just the sign bit.
// The GE test is the negated LT test.  The LE test can be had by commuting
// the operands (yielding a GE test) and then negating; negate again for the
// GT test.  The EQ test is done by ORcc'ing the high and low halves, and the
// NE test is negated from that.

// Due to a shortcoming in the ADLC, it mixes up expressions like:
// (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)).  Note the
// difference between 'Y' and '0L'.  The tree-matches for the CmpI sections
// are collapsed internally in the ADLC's dfa-gen code.  The match for
// (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the
// foo match ends up with the wrong leaf.  One fix is to not match both
// reg-reg and reg-zero forms of long-compare.  This is unfortunate because
// both forms beat the trinary form of long-compare and both are very useful
// on Intel which has so few registers.

// Manifest a CmpL result in an integer register.  Very painful.
// This is the test to avoid.
instruct cmpL3_reg_reg(eSIRegI dst, eRegL src1, eRegL src2, eFlagsReg flags ) %{
  match(Set dst (CmpL3 src1 src2));
  effect( KILL flags );
  ins_cost(1000);
  format %{ "XOR    $dst,$dst\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
            "JLT,s  m_one\n\t"
            "JGT,s  p_one\n\t"
            "CMP    $src1.lo,$src2.lo\n\t"
            "JB,s   m_one\n\t"
            "JEQ,s  done\n"
    "p_one:\tINC    $dst\n\t"
            "JMP,s  done\n"
    "m_one:\tDEC    $dst\n"
     "done:" %}
  ins_encode %{
    Label p_one, m_one, done;
    __ xorptr($dst$$Register, $dst$$Register);
    __ cmpl(HIGH_FROM_LOW($src1$$Register), HIGH_FROM_LOW($src2$$Register));
    __ jccb(Assembler::less,    m_one);
    __ jccb(Assembler::greater, p_one);
    __ cmpl($src1$$Register, $src2$$Register);
    __ jccb(Assembler::below,   m_one);
    __ jccb(Assembler::equal,   done);
    __ bind(p_one);
    __ incrementl($dst$$Register);
    __ jmpb(done);
    __ bind(m_one);
    __ decrementl($dst$$Register);
    __ bind(done);
  %}
  ins_pipe( pipe_slow );
%}

//======
// Manifest a CmpL result in the normal flags.  Only good for LT or GE
// compares.  Can be used for LE or GT compares by reversing arguments.
// NOT GOOD FOR EQ/NE tests.
instruct cmpL_zero_flags_LTGE( flagsReg_long_LTGE flags, eRegL src, immL0 zero ) %{
  match( Set flags (CmpL src zero ));
  ins_cost(100);
  format %{ "TEST   $src.hi,$src.hi" %}
  opcode(0x85);
  ins_encode( OpcP, RegReg_Hi2( src, src ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Manifest a CmpL result in the normal flags.  Only good for LT or GE
// compares.  Can be used for LE or GT compares by reversing arguments.
// NOT GOOD FOR EQ/NE tests.
instruct cmpL_reg_flags_LTGE( flagsReg_long_LTGE flags, eRegL src1, eRegL src2, rRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "MOV    $tmp,$src1.hi\n\t"
            "SBB    $tmp,$src2.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src1, src2, tmp ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Long compares reg < zero/req OR reg >= zero/req.
// Just a wrapper for a normal branch, plus the predicate test.
instruct cmpL_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  expand %{
    jmpCon(cmp,flags,labl);    // JLT or JGE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, rRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovPP_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDDPR_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regDPR dst, regDPR src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovDPR_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regD dst, regD src) %{
  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFFPR_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regFPR dst, regFPR src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovFPR_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFF_reg_LTGE(cmpOp cmp, flagsReg_long_LTGE flags, regF dst, regF src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}

//======
// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
instruct cmpL_zero_flags_EQNE( flagsReg_long_EQNE flags, eRegL src, immL0 zero, rRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect(TEMP tmp);
  ins_cost(200);
  format %{ "MOV    $tmp,$src.lo\n\t"
            "OR     $tmp,$src.hi\t! Long is EQ/NE 0?" %}
  ins_encode( long_cmp_flags0( src, tmp ) );
  ins_pipe( ialu_reg_reg_long );
%}

// Manifest a CmpL result in the normal flags.  Only good for EQ/NE compares.
instruct cmpL_reg_flags_EQNE( flagsReg_long_EQNE flags, eRegL src1, eRegL src2 ) %{
  match( Set flags (CmpL src1 src2 ));
  ins_cost(200+300);
  format %{ "CMP    $src1.lo,$src2.lo\t! Long compare; set flags for low bits\n\t"
            "JNE,s  skip\n\t"
            "CMP    $src1.hi,$src2.hi\n\t"
     "skip:\t" %}
  ins_encode( long_cmp_flags1( src1, src2 ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Long compare reg == zero/reg OR reg != zero/reg
// Just a wrapper for a normal branch, plus the predicate test.
instruct cmpL_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  expand %{
    jmpCon(cmp,flags,labl);    // JEQ or JNE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, rRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovPP_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDDPR_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regDPR dst, regDPR src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovDPR_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regD dst, regD src) %{
  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFFPR_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regFPR dst, regFPR src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovFPR_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFF_reg_EQNE(cmpOp cmp, flagsReg_long_EQNE flags, regF dst, regF src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}

//======
// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
// Same as cmpL_reg_flags_LEGT except must negate src
instruct cmpL_zero_flags_LEGT( flagsReg_long_LEGT flags, eRegL src, immL0 zero, rRegI tmp ) %{
  match( Set flags (CmpL src zero ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "XOR    $tmp,$tmp\t# Long compare for -$src < 0, use commuted test\n\t"
            "CMP    $tmp,$src.lo\n\t"
            "SBB    $tmp,$src.hi\n\t" %}
  ins_encode( long_cmp_flags3(src, tmp) );
  ins_pipe( ialu_reg_reg_long );
%}

// Manifest a CmpL result in the normal flags.  Only good for LE or GT compares.
// Same as cmpL_reg_flags_LTGE except operands swapped.  Swapping operands
// requires a commuted test to get the same result.
instruct cmpL_reg_flags_LEGT( flagsReg_long_LEGT flags, eRegL src1, eRegL src2, rRegI tmp ) %{
  match( Set flags (CmpL src1 src2 ));
  effect( TEMP tmp );
  ins_cost(300);
  format %{ "CMP    $src2.lo,$src1.lo\t! Long compare, swapped operands, use with commuted test\n\t"
            "MOV    $tmp,$src2.hi\n\t"
            "SBB    $tmp,$src1.hi\t! Compute flags for long compare" %}
  ins_encode( long_cmp_flags2( src2, src1, tmp ) );
  ins_pipe( ialu_cr_reg_reg );
%}

// Long compares reg < zero/req OR reg >= zero/req.
// Just a wrapper for a normal branch, plus the predicate test
instruct cmpL_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, label labl) %{
  match(If cmp flags);
  effect(USE labl);
  predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le );
  ins_cost(300);
  expand %{
    jmpCon(cmp,flags,labl);    // JGT or JLE...
  %}
%}

// Compare 2 longs and CMOVE longs.
instruct cmovLL_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, eRegL src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst src)));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  ins_cost(400);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg_Lo2( dst, src ), enc_cmov(cmp), RegReg_Hi2( dst, src ) );
  ins_pipe( pipe_cmov_reg_long );
%}

instruct cmovLL_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegL dst, load_long_memory src) %{
  match(Set dst (CMoveL (Binary cmp flags) (Binary dst (LoadL src))));
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  ins_cost(500);
  format %{ "CMOV$cmp $dst.lo,$src.lo\n\t"
            "CMOV$cmp $dst.hi,$src.hi+4" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem(dst, src), enc_cmov(cmp), RegMem_Hi(dst, src) );
  ins_pipe( pipe_cmov_reg_long );
%}

// Compare 2 longs and CMOVE ints.
instruct cmovII_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, rRegI src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

instruct cmovII_mem_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, rRegI dst, memory src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveI (Binary cmp flags) (Binary dst (LoadI src))));
  ins_cost(250);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegMem( dst, src ) );
  ins_pipe( pipe_cmov_mem );
%}

// Compare 2 longs and CMOVE ptrs.
instruct cmovPP_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, eRegP dst, eRegP src) %{
  predicate(VM_Version::supports_cmov() && ( _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ));
  match(Set dst (CMoveP (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  format %{ "CMOV$cmp $dst,$src" %}
  opcode(0x0F,0x40);
  ins_encode( enc_cmov(cmp), RegReg( dst, src ) );
  ins_pipe( pipe_cmov_reg );
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDDPR_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regDPR dst, regDPR src) %{
  predicate( UseSSE<=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovDPR_regS(cmp,flags,dst,src);
  %}
%}

// Compare 2 longs and CMOVE doubles
instruct cmovDD_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regD dst, regD src) %{
  predicate( UseSSE>=2 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveD (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovD_regS(cmp,flags,dst,src);
  %}
%}

instruct cmovFFPR_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regFPR dst, regFPR src) %{
  predicate( UseSSE==0 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovFPR_regS(cmp,flags,dst,src);
  %}
%}


instruct cmovFF_reg_LEGT(cmpOp_commute cmp, flagsReg_long_LEGT flags, regF dst, regF src) %{
  predicate( UseSSE>=1 && _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt );
  match(Set dst (CMoveF (Binary cmp flags) (Binary dst src)));
  ins_cost(200);
  expand %{
    fcmovF_regS(cmp,flags,dst,src);
  %}
%}


// ============================================================================
// Procedure Call/Return Instructions
// Call Java Static Instruction
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallStaticJavaDirect(method meth) %{
  match(CallStaticJava);
  predicate(! ((CallStaticJavaNode*)n)->is_method_handle_invoke());
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL,static " %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_resets,
              Java_Static_Call( meth ),
              call_epilog,
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_alignment(4);
%}

// Call Java Static Instruction (method handle version)
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallStaticJavaHandle(method meth, eBPRegP ebp_mh_SP_save) %{
  match(CallStaticJava);
  predicate(((CallStaticJavaNode*)n)->is_method_handle_invoke());
  effect(USE meth);
  // EBP is saved by all callees (for interpreter stack correction).
  // We use it here for a similar purpose, in {preserve,restore}_SP.

  ins_cost(300);
  format %{ "CALL,static/MethodHandle " %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_resets,
              preserve_SP,
              Java_Static_Call( meth ),
              restore_SP,
              call_epilog,
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_alignment(4);
%}

// Call Java Dynamic Instruction
// Note: If this code changes, the corresponding ret_addr_offset() and
//       compute_padding() functions will have to be adjusted.
instruct CallDynamicJavaDirect(method meth) %{
  match(CallDynamicJava);
  effect(USE meth);

  ins_cost(300);
  format %{ "MOV    EAX,(oop)-1\n\t"
            "CALL,dynamic" %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_resets,
              Java_Dynamic_Call( meth ),
              call_epilog,
              post_call_FPU );
  ins_pipe( pipe_slow );
  ins_alignment(4);
%}

// Call Runtime Instruction
instruct CallRuntimeDirect(method meth) %{
  match(CallRuntime );
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL,runtime " %}
  opcode(0xE8); /* E8 cd */
  // Use FFREEs to clear entries in float stack
  ins_encode( pre_call_resets,
              FFree_Float_Stack_All,
              Java_To_Runtime( meth ),
              post_call_FPU );
  ins_pipe( pipe_slow );
%}

// Call runtime without safepoint
instruct CallLeafDirect(method meth) %{
  match(CallLeaf);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL_LEAF,runtime " %}
  opcode(0xE8); /* E8 cd */
  ins_encode( pre_call_resets,
              FFree_Float_Stack_All,
              Java_To_Runtime( meth ),
              Verify_FPU_For_Leaf, post_call_FPU );
  ins_pipe( pipe_slow );
%}

instruct CallLeafNoFPDirect(method meth) %{
  match(CallLeafNoFP);
  effect(USE meth);

  ins_cost(300);
  format %{ "CALL_LEAF_NOFP,runtime " %}
  opcode(0xE8); /* E8 cd */
  ins_encode(Java_To_Runtime(meth));
  ins_pipe( pipe_slow );
%}


// Return Instruction
// Remove the return address & jump to it.
instruct Ret() %{
  match(Return);
  format %{ "RET" %}
  opcode(0xC3);
  ins_encode(OpcP);
  ins_pipe( pipe_jmp );
%}

// Tail Call; Jump from runtime stub to Java code.
// Also known as an 'interprocedural jump'.
// Target of jump will eventually return to caller.
// TailJump below removes the return address.
instruct TailCalljmpInd(eRegP_no_EBP jump_target, eBXRegP method_oop) %{
  match(TailCall jump_target method_oop );
  ins_cost(300);
  format %{ "JMP    $jump_target \t# EBX holds method oop" %}
  opcode(0xFF, 0x4);  /* Opcode FF /4 */
  ins_encode( OpcP, RegOpc(jump_target) );
  ins_pipe( pipe_jmp );
%}


// Tail Jump; remove the return address; jump to target.
// TailCall above leaves the return address around.
instruct tailjmpInd(eRegP_no_EBP jump_target, eAXRegP ex_oop) %{
  match( TailJump jump_target ex_oop );
  ins_cost(300);
  format %{ "POP    EDX\t# pop return address into dummy\n\t"
            "JMP    $jump_target " %}
  opcode(0xFF, 0x4);  /* Opcode FF /4 */
  ins_encode( enc_pop_rdx,
              OpcP, RegOpc(jump_target) );
  ins_pipe( pipe_jmp );
%}

// Create exception oop: created by stack-crawling runtime code.
// Created exception is now available to this handler, and is setup
// just prior to jumping to this handler.  No code emitted.
instruct CreateException( eAXRegP ex_oop )
%{
  match(Set ex_oop (CreateEx));

  size(0);
  // use the following format syntax
  format %{ "# exception oop is in EAX; no code emitted" %}
  ins_encode();
  ins_pipe( empty );
%}


// Rethrow exception:
// The exception oop will come in the first argument position.
// Then JUMP (not call) to the rethrow stub code.
instruct RethrowException()
%{
  match(Rethrow);

  // use the following format syntax
  format %{ "JMP    rethrow_stub" %}
  ins_encode(enc_rethrow);
  ins_pipe( pipe_jmp );
%}

// inlined locking and unlocking


instruct cmpFastLock( eFlagsReg cr, eRegP object, eBXRegP box, eAXRegI tmp, eRegP scr) %{
  match( Set cr (FastLock object box) );
  effect( TEMP tmp, TEMP scr, USE_KILL box );
  ins_cost(300);
  format %{ "FASTLOCK $object,$box\t! kills $box,$tmp,$scr" %}
  ins_encode( Fast_Lock(object,box,tmp,scr) );
  ins_pipe( pipe_slow );
%}

instruct cmpFastUnlock( eFlagsReg cr, eRegP object, eAXRegP box, eRegP tmp ) %{
  match( Set cr (FastUnlock object box) );
  effect( TEMP tmp, USE_KILL box );
  ins_cost(300);
  format %{ "FASTUNLOCK $object,$box\t! kills $box,$tmp" %}
  ins_encode( Fast_Unlock(object,box,tmp) );
  ins_pipe( pipe_slow );
%}



// ============================================================================
// Safepoint Instruction
instruct safePoint_poll(eFlagsReg cr) %{
  match(SafePoint);
  effect(KILL cr);

  // TODO-FIXME: we currently poll at offset 0 of the safepoint polling page.
  // On SPARC that might be acceptable as we can generate the address with
  // just a sethi, saving an or.  By polling at offset 0 we can end up
  // putting additional pressure on the index-0 in the D$.  Because of
  // alignment (just like the situation at hand) the lower indices tend
  // to see more traffic.  It'd be better to change the polling address
  // to offset 0 of the last $line in the polling page.

  format %{ "TSTL   #polladdr,EAX\t! Safepoint: poll for GC" %}
  ins_cost(125);
  size(6) ;
  ins_encode( Safepoint_Poll() );
  ins_pipe( ialu_reg_mem );
%}


// ============================================================================
// This name is KNOWN by the ADLC and cannot be changed.
// The ADLC forces a 'TypeRawPtr::BOTTOM' output type
// for this guy.
instruct tlsLoadP(eRegP dst, eFlagsReg cr) %{
  match(Set dst (ThreadLocal));
  effect(DEF dst, KILL cr);

  format %{ "MOV    $dst, Thread::current()" %}
  ins_encode %{
    Register dstReg = as_Register($dst$$reg);
    __ get_thread(dstReg);
  %}
  ins_pipe( ialu_reg_fat );
%}



//----------PEEPHOLE RULES-----------------------------------------------------
// These must follow all instruction definitions as they use the names
// defined in the instructions definitions.
//
// peepmatch ( root_instr_name [preceding_instruction]* );
//
// peepconstraint %{
// (instruction_number.operand_name relational_op instruction_number.operand_name
//  [, ...] );
// // instruction numbers are zero-based using left to right order in peepmatch
//
// peepreplace ( instr_name  ( [instruction_number.operand_name]* ) );
// // provide an instruction_number.operand_name for each operand that appears
// // in the replacement instruction's match rule
//
// ---------VM FLAGS---------------------------------------------------------
//
// All peephole optimizations can be turned off using -XX:-OptoPeephole
//
// Each peephole rule is given an identifying number starting with zero and
// increasing by one in the order seen by the parser.  An individual peephole
// can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=#
// on the command-line.
//
// ---------CURRENT LIMITATIONS----------------------------------------------
//
// Only match adjacent instructions in same basic block
// Only equality constraints
// Only constraints between operands, not (0.dest_reg == EAX_enc)
// Only one replacement instruction
//
// ---------EXAMPLE----------------------------------------------------------
//
// // pertinent parts of existing instructions in architecture description
// instruct movI(rRegI dst, rRegI src) %{
//   match(Set dst (CopyI src));
// %}
//
// instruct incI_eReg(rRegI dst, immI1 src, eFlagsReg cr) %{
//   match(Set dst (AddI dst src));
//   effect(KILL cr);
// %}
//
// // Change (inc mov) to lea
// peephole %{
//   // increment preceeded by register-register move
//   peepmatch ( incI_eReg movI );
//   // require that the destination register of the increment
//   // match the destination register of the move
//   peepconstraint ( 0.dst == 1.dst );
//   // construct a replacement instruction that sets
//   // the destination to ( move's source register + one )
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
//
// Implementation no longer uses movX instructions since
// machine-independent system no longer uses CopyX nodes.
//
// peephole %{
//   peepmatch ( incI_eReg movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
//
// peephole %{
//   peepmatch ( decI_eReg movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
//
// peephole %{
//   peepmatch ( addI_eReg_imm movI );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaI_eReg_immI( 0.dst 1.src 0.src ) );
// %}
//
// peephole %{
//   peepmatch ( addP_eReg_imm movP );
//   peepconstraint ( 0.dst == 1.dst );
//   peepreplace ( leaP_eReg_immI( 0.dst 1.src 0.src ) );
// %}

// // Change load of spilled value to only a spill
// instruct storeI(memory mem, rRegI src) %{
//   match(Set mem (StoreI mem src));
// %}
//
// instruct loadI(rRegI dst, memory mem) %{
//   match(Set dst (LoadI mem));
// %}
//
peephole %{
  peepmatch ( loadI storeI );
  peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem );
  peepreplace ( storeI( 1.mem 1.mem 1.src ) );
%}

//----------SMARTSPILL RULES---------------------------------------------------
// These must follow all instruction definitions as they use the names
// defined in the instructions definitions.

Other Java examples (source code examples)

Here is a short list of links related to this Java x86_32.ad source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2024 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.