; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck -check-prefix=SM20 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck -check-prefix=SM35 %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

; Kernel copying one float from a noalias readonly pointer parameter to %to.
; Expected output: the load is ld.global.nc.b32 on sm_35 and a plain
; ld.global.b32 on sm_20; the store is st.global.b32 on both.
define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo1(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo1_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo1_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b32 %r1, [%rd2];
; SM20-NEXT:    st.global.b32 [%rd4], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo1(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo1_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo1_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT:    st.global.b32 [%rd4], %r1;
; SM35-NEXT:    ret;
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; Kernel copying one double from a noalias readonly pointer parameter to %to.
; Expected output: the load is ld.global.nc.b64 on sm_35 and a plain
; ld.global.b64 on sm_20.
define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo2(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo2_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo2_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b64 %rd5, [%rd2];
; SM20-NEXT:    st.global.b64 [%rd4], %rd5;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo2(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo2_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo2_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT:    st.global.b64 [%rd4], %rd5;
; SM35-NEXT:    ret;
  %1 = load double, ptr %from
  store double %1, ptr %to
  ret void
}

; Kernel copying one i16 from a noalias readonly pointer parameter to %to.
; Expected output: the load is ld.global.nc.b16 on sm_35 and a plain
; ld.global.b16 on sm_20.
define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo3(
; SM20:       {
; SM20-NEXT:    .reg .b16 %rs<2>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo3_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo3_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b16 %rs1, [%rd2];
; SM20-NEXT:    st.global.b16 [%rd4], %rs1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo3(
; SM35:       {
; SM35-NEXT:    .reg .b16 %rs<2>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo3_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo3_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b16 %rs1, [%rd2];
; SM35-NEXT:    st.global.b16 [%rd4], %rs1;
; SM35-NEXT:    ret;
  %1 = load i16, ptr %from
  store i16 %1, ptr %to
  ret void
}

; Kernel copying one i32 from a noalias readonly pointer parameter to %to.
; Expected output: the load is ld.global.nc.b32 on sm_35 and a plain
; ld.global.b32 on sm_20.
define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo4(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo4_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo4_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b32 %r1, [%rd2];
; SM20-NEXT:    st.global.b32 [%rd4], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo4(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo4_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo4_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT:    st.global.b32 [%rd4], %r1;
; SM35-NEXT:    ret;
  %1 = load i32, ptr %from
  store i32 %1, ptr %to
  ret void
}

; Kernel copying one i64 from a noalias readonly pointer parameter to %to.
; Expected output: the load is ld.global.nc.b64 on sm_35 and a plain
; ld.global.b64 on sm_20.
define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo5(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo5_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo5_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b64 %rd5, [%rd2];
; SM20-NEXT:    st.global.b64 [%rd4], %rd5;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo5(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo5_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo5_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT:    st.global.b64 [%rd4], %rd5;
; SM35-NEXT:    ret;
  %1 = load i64, ptr %from
  store i64 %1, ptr %to
  ret void
}

; i128 is a non-standard integer type on nvptx64. The checks below expect the
; i128 copy to be emitted as a single v2.b64 load/store pair, with the load
; being ld.global.nc.v2.b64 on sm_35 and a plain ld.global.v2.b64 on sm_20.
define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo6(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo6_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo6_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo6(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo6_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo6_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT:    ret;
  %1 = load i128, ptr %from
  store i128 %1, ptr %to
  ret void
}

; Kernel copying a <2 x i8> vector from a noalias readonly pointer parameter.
; Expected output: the load is ld.global.nc.v2.b8 on sm_35 and a plain
; ld.global.v2.b8 on sm_20; the elements are held in 16-bit (%rs) registers.
define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo7(
; SM20:       {
; SM20-NEXT:    .reg .b16 %rs<3>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo7_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo7_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b8 {%rs1, %rs2}, [%rd2];
; SM20-NEXT:    st.global.v2.b8 [%rd4], {%rs1, %rs2};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo7(
; SM35:       {
; SM35-NEXT:    .reg .b16 %rs<3>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo7_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo7_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b8 {%rs1, %rs2}, [%rd2];
; SM35-NEXT:    st.global.v2.b8 [%rd4], {%rs1, %rs2};
; SM35-NEXT:    ret;
  %1 = load <2 x i8>, ptr %from
  store <2 x i8> %1, ptr %to
  ret void
}

; Kernel copying a <2 x i16> vector from a noalias readonly pointer parameter.
; Expected output: the vector is moved as one 32-bit value, loaded with
; ld.global.nc.b32 on sm_35 and a plain ld.global.b32 on sm_20.
define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo8(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo8_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo8_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b32 %r1, [%rd2];
; SM20-NEXT:    st.global.b32 [%rd4], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo8(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo8_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo8_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT:    st.global.b32 [%rd4], %r1;
; SM35-NEXT:    ret;
  %1 = load <2 x i16>, ptr %from
  store <2 x i16> %1, ptr %to
  ret void
}

; Kernel copying a <2 x i32> vector from a noalias readonly pointer parameter.
; Expected output: the load is ld.global.nc.v2.b32 on sm_35 and a plain
; ld.global.v2.b32 on sm_20.
define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo9(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<3>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo9_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo9_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd2];
; SM20-NEXT:    st.global.v2.b32 [%rd4], {%r1, %r2};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo9(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<3>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo9_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo9_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
; SM35-NEXT:    st.global.v2.b32 [%rd4], {%r1, %r2};
; SM35-NEXT:    ret;
  %1 = load <2 x i32>, ptr %from
  store <2 x i32> %1, ptr %to
  ret void
}

; Kernel copying a <2 x i64> vector from a noalias readonly pointer parameter.
; Expected output: the load is ld.global.nc.v2.b64 on sm_35 and a plain
; ld.global.v2.b64 on sm_20.
define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo10(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo10_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo10_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo10(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo10_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo10_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT:    ret;
  %1 = load <2 x i64>, ptr %from
  store <2 x i64> %1, ptr %to
  ret void
}

; Kernel copying a <2 x float> vector from a noalias readonly pointer parameter.
; Expected output: the vector is moved as one 64-bit value, loaded with
; ld.global.nc.b64 on sm_35 and a plain ld.global.b64 on sm_20.
define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo11(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo11_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo11_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b64 %rd5, [%rd2];
; SM20-NEXT:    st.global.b64 [%rd4], %rd5;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo11(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo11_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo11_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT:    st.global.b64 [%rd4], %rd5;
; SM35-NEXT:    ret;
  %1 = load <2 x float>, ptr %from
  store <2 x float> %1, ptr %to
  ret void
}

; Kernel copying a <2 x double> vector from a noalias readonly pointer
; parameter. Expected output: the load is ld.global.nc.v2.b64 on sm_35 and a
; plain ld.global.v2.b64 on sm_20.
define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo12(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo12_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo12_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo12(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo12_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo12_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT:    ret;
  %1 = load <2 x double>, ptr %from
  store <2 x double> %1, ptr %to
  ret void
}

; Kernel copying a <4 x i8> vector from a noalias readonly pointer parameter.
; Expected output: the vector is moved as one 32-bit value, loaded with
; ld.global.nc.b32 on sm_35 and a plain ld.global.b32 on sm_20.
define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo13(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo13_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo13_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b32 %r1, [%rd2];
; SM20-NEXT:    st.global.b32 [%rd4], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo13(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo13_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo13_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b32 %r1, [%rd2];
; SM35-NEXT:    st.global.b32 [%rd4], %r1;
; SM35-NEXT:    ret;
  %1 = load <4 x i8>, ptr %from
  store <4 x i8> %1, ptr %to
  ret void
}

; Kernel copying a <4 x i16> vector from a noalias readonly pointer parameter.
; Expected output: the vector is moved as two 32-bit values, loaded with
; ld.global.nc.v2.b32 on sm_35 and a plain ld.global.v2.b32 on sm_20.
define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo14(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<3>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo14_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo14_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd2];
; SM20-NEXT:    st.global.v2.b32 [%rd4], {%r1, %r2};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo14(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<3>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo14_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo14_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b32 {%r1, %r2}, [%rd2];
; SM35-NEXT:    st.global.v2.b32 [%rd4], {%r1, %r2};
; SM35-NEXT:    ret;
  %1 = load <4 x i16>, ptr %from
  store <4 x i16> %1, ptr %to
  ret void
}

; Kernel copying a <4 x i32> vector from a noalias readonly pointer parameter.
; Expected output: the load is ld.global.nc.v4.b32 on sm_35 and a plain
; ld.global.v4.b32 on sm_20.
define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo15(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<5>;
; SM20-NEXT:    .reg .b64 %rd<5>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo15_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo15_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
; SM20-NEXT:    st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo15(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<5>;
; SM35-NEXT:    .reg .b64 %rd<5>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo15_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo15_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd2];
; SM35-NEXT:    st.global.v4.b32 [%rd4], {%r1, %r2, %r3, %r4};
; SM35-NEXT:    ret;
  %1 = load <4 x i32>, ptr %from
  store <4 x i32> %1, ptr %to
  ret void
}

; Kernel copying a <4 x float> vector from a noalias readonly pointer parameter.
; Expected output: the vector is moved as two 64-bit values, loaded with
; ld.global.nc.v2.b64 on sm_35 and a plain ld.global.v2.b64 on sm_20.
define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo16(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<7>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo16_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo16_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo16(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<7>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo16_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo16_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT:    ret;
  %1 = load <4 x float>, ptr %from
  store <4 x float> %1, ptr %to
  ret void
}

; Kernel copying a <4 x double> vector from a noalias readonly pointer
; parameter. Expected output: the 32-byte copy is split into two v2.b64
; loads (offsets 0 and 16) and two v2.b64 stores; both loads are
; ld.global.nc.v2.b64 on sm_35 and plain ld.global.v2.b64 on sm_20.
define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo17(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<9>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo17_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo17_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.v2.b64 {%rd5, %rd6}, [%rd2];
; SM20-NEXT:    ld.global.v2.b64 {%rd7, %rd8}, [%rd2+16];
; SM20-NEXT:    st.global.v2.b64 [%rd4+16], {%rd7, %rd8};
; SM20-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo17(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<9>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo17_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo17_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd5, %rd6}, [%rd2];
; SM35-NEXT:    ld.global.nc.v2.b64 {%rd7, %rd8}, [%rd2+16];
; SM35-NEXT:    st.global.v2.b64 [%rd4+16], {%rd7, %rd8};
; SM35-NEXT:    st.global.v2.b64 [%rd4], {%rd5, %rd6};
; SM35-NEXT:    ret;
  %1 = load <4 x double>, ptr %from
  store <4 x double> %1, ptr %to
  ret void
}

; Kernel copying one pointer value from a noalias readonly pointer parameter.
; Expected output: the load is ld.global.nc.b64 on sm_35 and a plain
; ld.global.b64 on sm_20.
define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: foo18(
; SM20:       {
; SM20-NEXT:    .reg .b64 %rd<6>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [foo18_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM20-NEXT:    ld.param.b64 %rd3, [foo18_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM20-NEXT:    ld.global.b64 %rd5, [%rd2];
; SM20-NEXT:    st.global.b64 [%rd4], %rd5;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo18(
; SM35:       {
; SM35-NEXT:    .reg .b64 %rd<6>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [foo18_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd1;
; SM35-NEXT:    ld.param.b64 %rd3, [foo18_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd4, %rd3;
; SM35-NEXT:    ld.global.nc.b64 %rd5, [%rd2];
; SM35-NEXT:    st.global.b64 [%rd4], %rd5;
; SM35-NEXT:    ret;
  %1 = load ptr, ptr %from
  store ptr %1, ptr %to
  ret void
}

; Test that we can infer a cached load for a pointer induction variable.
; The kernel sums %n floats starting at %from and stores the total to %to.
; In the expected output the address becomes a pointer that advances by 4
; bytes per iteration, and the load through it is ld.global.nc.b32 on sm_35
; and a plain ld.global.b32 on sm_20.
define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) {
; SM20-LABEL: foo19(
; SM20:       {
; SM20-NEXT:    .reg .pred %p<2>;
; SM20-NEXT:    .reg .b32 %r<10>;
; SM20-NEXT:    .reg .b64 %rd<8>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0: // %entry
; SM20-NEXT:    ld.param.b32 %r8, [foo19_param_2];
; SM20-NEXT:    ld.param.b64 %rd5, [foo19_param_0];
; SM20-NEXT:    cvta.to.global.u64 %rd7, %rd5;
; SM20-NEXT:    ld.param.b64 %rd6, [foo19_param_1];
; SM20-NEXT:    cvta.to.global.u64 %rd2, %rd6;
; SM20-NEXT:    mov.b32 %r9, 0f00000000;
; SM20-NEXT:  $L__BB18_1: // %loop
; SM20-NEXT:    // =>This Inner Loop Header: Depth=1
; SM20-NEXT:    ld.global.b32 %r7, [%rd7];
; SM20-NEXT:    add.rn.f32 %r9, %r7, %r9;
; SM20-NEXT:    add.s64 %rd7, %rd7, 4;
; SM20-NEXT:    add.s32 %r8, %r8, -1;
; SM20-NEXT:    setp.ne.b32 %p1, %r8, 0;
; SM20-NEXT:    @%p1 bra $L__BB18_1;
; SM20-NEXT:  // %bb.2: // %exit
; SM20-NEXT:    st.global.b32 [%rd2], %r9;
; SM20-NEXT:    ret;
;
; SM35-LABEL: foo19(
; SM35:       {
; SM35-NEXT:    .reg .pred %p<2>;
; SM35-NEXT:    .reg .b32 %r<10>;
; SM35-NEXT:    .reg .b64 %rd<8>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0: // %entry
; SM35-NEXT:    ld.param.b32 %r8, [foo19_param_2];
; SM35-NEXT:    ld.param.b64 %rd5, [foo19_param_0];
; SM35-NEXT:    cvta.to.global.u64 %rd7, %rd5;
; SM35-NEXT:    ld.param.b64 %rd6, [foo19_param_1];
; SM35-NEXT:    cvta.to.global.u64 %rd2, %rd6;
; SM35-NEXT:    mov.b32 %r9, 0f00000000;
; SM35-NEXT:  $L__BB18_1: // %loop
; SM35-NEXT:    // =>This Inner Loop Header: Depth=1
; SM35-NEXT:    ld.global.nc.b32 %r7, [%rd7];
; SM35-NEXT:    add.rn.f32 %r9, %r7, %r9;
; SM35-NEXT:    add.s64 %rd7, %rd7, 4;
; SM35-NEXT:    add.s32 %r8, %r8, -1;
; SM35-NEXT:    setp.ne.b32 %p1, %r8, 0;
; SM35-NEXT:    @%p1 bra $L__BB18_1;
; SM35-NEXT:  // %bb.2: // %exit
; SM35-NEXT:    st.global.b32 [%rd2], %r9;
; SM35-NEXT:    ret;
entry:
  br label %loop

; Loop body runs before the exit test, so it executes at least once.
loop:
  ; %i is the element index, %sum the running total (both start at zero).
  %i = phi i32 [ 0, %entry ], [ %nexti, %loop ]
  %sum = phi float [ 0.0, %entry ], [ %nextsum, %loop ]
  ; Address of element %i, derived from the readonly kernel parameter.
  %ptr = getelementptr inbounds float, ptr %from, i32 %i
  %value = load float, ptr %ptr, align 4
  %nextsum = fadd float %value, %sum
  %nexti = add nsw i32 %i, 1
  ; Leave the loop once the incremented index equals %n.
  %exitcond = icmp eq i32 %nexti, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  store float %nextsum, ptr %to
  ret void
}

; This test captures the case of a non-kernel function. In a
; non-kernel function, without interprocedural analysis, we do not
; know that the parameter is global. We also do not know that the
; pointed-to memory is never written to (for the duration of the
; kernel). For both reasons, we cannot use a cached load here.
; Expected output on both sm_20 and sm_35: a generic ld.b32 / st.b32 pair,
; with no .global or .nc qualifier on the load.
define void @notkernel(ptr noalias readonly %from, ptr %to) {
; SM20-LABEL: notkernel(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [notkernel_param_0];
; SM20-NEXT:    ld.b32 %r1, [%rd1];
; SM20-NEXT:    ld.param.b64 %rd2, [notkernel_param_1];
; SM20-NEXT:    st.b32 [%rd2], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: notkernel(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [notkernel_param_0];
; SM35-NEXT:    ld.b32 %r1, [%rd1];
; SM35-NEXT:    ld.param.b64 %rd2, [notkernel_param_1];
; SM35-NEXT:    st.b32 [%rd2], %r1;
; SM35-NEXT:    ret;
  %1 = load float, ptr %from
  store float %1, ptr %to
  ret void
}

; As @notkernel, but with the parameter explicitly marked as global. We still
; do not know that the parameter is never written to (for the duration of the
; kernel). This case does not currently come up normally since we do not infer
; that pointers are global interprocedurally as of 2015-08-05.
; Expected output on both sm_20 and sm_35: the load is ld.global.b32 (no .nc
; qualifier) and the store through the generic %to pointer is st.b32.
define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) {
; SM20-LABEL: notkernel2(
; SM20:       {
; SM20-NEXT:    .reg .b32 %r<2>;
; SM20-NEXT:    .reg .b64 %rd<3>;
; SM20-EMPTY:
; SM20-NEXT:  // %bb.0:
; SM20-NEXT:    ld.param.b64 %rd1, [notkernel2_param_0];
; SM20-NEXT:    ld.global.b32 %r1, [%rd1];
; SM20-NEXT:    ld.param.b64 %rd2, [notkernel2_param_1];
; SM20-NEXT:    st.b32 [%rd2], %r1;
; SM20-NEXT:    ret;
;
; SM35-LABEL: notkernel2(
; SM35:       {
; SM35-NEXT:    .reg .b32 %r<2>;
; SM35-NEXT:    .reg .b64 %rd<3>;
; SM35-EMPTY:
; SM35-NEXT:  // %bb.0:
; SM35-NEXT:    ld.param.b64 %rd1, [notkernel2_param_0];
; SM35-NEXT:    ld.global.b32 %r1, [%rd1];
; SM35-NEXT:    ld.param.b64 %rd2, [notkernel2_param_1];
; SM35-NEXT:    st.b32 [%rd2], %r1;
; SM35-NEXT:    ret;
  %1 = load float, ptr addrspace(1) %from
  store float %1, ptr %to
  ret void
}
