; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
target triple = "nvptx64-nvidia-cuda"


; Extract both lanes of a <2 x i8> that is produced by bitcasting an i16
; argument, sign-extend each lane to i16, and return their sum.
; The expected PTX works on the 16-bit register directly: lane 0 is
; sign-extended from the low byte with cvt.s16.s8 and lane 1 is obtained with
; a signed 16-bit shift right by 8.
define i16  @test_v2i8(i16 %a) {
; CHECK-LABEL: test_v2i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u16 %rs1, [test_v2i8_param_0];
; CHECK-NEXT:    cvt.s16.s8 %rs2, %rs1;
; CHECK-NEXT:    shr.s16 %rs3, %rs1, 8;
; CHECK-NEXT:    add.s16 %rs4, %rs2, %rs3;
; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  ; Reinterpret the scalar as a two-lane byte vector.
  %v = bitcast i16 %a to <2 x i8>
  %r0 = extractelement <2 x i8> %v, i64 0
  %r1 = extractelement <2 x i8> %v, i64 1
  ; Widen each lane (signed) before adding so the sum is computed in i16.
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r01 = add i16 %r0i, %r1i
  ret i16 %r01
}

; Load a <2 x i8> from memory and return true iff both lanes are zero.
; The expected PTX uses a single vector load (ld.v2.u8) and folds the two
; per-lane comparisons into one: the lanes are OR-ed together, masked to the
; low 8 bits, and compared against zero once.
define i1  @test_v2i8_load(ptr %a) {
; CHECK-LABEL: test_v2i8_load(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<2>;
; CHECK-NEXT:    .reg .b16 %rs<7>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_v2i8_load_param_0];
; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
; CHECK-NEXT:    or.b16 %rs5, %rs1, %rs2;
; CHECK-NEXT:    and.b16 %rs6, %rs5, 255;
; CHECK-NEXT:    setp.eq.s16 %p1, %rs6, 0;
; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p1;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
; CHECK-NEXT:    ret;
  ; The load is declared with 4-byte alignment, larger than the 2-byte vector.
  %v = load <2 x i8>, ptr %a, align 4
  %r0 = extractelement <2 x i8> %v, i64 0
  %r1 = extractelement <2 x i8> %v, i64 1
  %icmp = icmp eq i8 %r0, 0
  %icmp3 = icmp eq i8 %r1, 0
  ; select(c, x, false) is the logical AND of the two lane comparisons.
  %select = select i1 %icmp, i1 %icmp3, i1 false
  ret i1 %select
}
; Extract all four lanes of a <4 x i8> that is produced by bitcasting an i32
; argument, sign-extend each lane to i16, and return the i16 sum.
; The expected PTX extracts each byte with a signed bit-field extract
; (bfe.s32 at bit offsets 0, 8, 16 and 24, width 8), converts each result to a
; 16-bit register with cvt.s8.s32, and then performs the adds in 16 bits.
define i16  @test_v4i8(i32 %a) {
; CHECK-LABEL: test_v4i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<8>;
; CHECK-NEXT:    .reg .b32 %r<7>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_param_0];
; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs1, %r2;
; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs2, %r3;
; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs3, %r4;
; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs4, %r5;
; CHECK-NEXT:    add.s16 %rs5, %rs1, %rs2;
; CHECK-NEXT:    add.s16 %rs6, %rs3, %rs4;
; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
; CHECK-NEXT:    cvt.u32.u16 %r6, %rs7;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
; CHECK-NEXT:    ret;
  ; Reinterpret the scalar as a four-lane byte vector.
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r2i = sext i8 %r2 to i16
  %r3i = sext i8 %r3 to i16
  ; Pairwise sums first, then the final sum of the two pairs.
  %r01 = add i16 %r0i, %r1i
  %r23 = add i16 %r2i, %r3i
  %r = add i16 %r01, %r23
  ret i16 %r
}

; Same lane extraction as the i16 variant of the <4 x i8> test, but each lane
; is sign-extended to i32 and the sum is returned as i32.
; The expected PTX needs no separate conversion step: the signed bit-field
; extract (bfe.s32) results are fed straight into the 32-bit adds.
define i32  @test_v4i8_s32(i32 %a) {
; CHECK-LABEL: test_v4i8_s32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_s32_param_0];
; CHECK-NEXT:    bfe.s32 %r2, %r1, 0, 8;
; CHECK-NEXT:    bfe.s32 %r3, %r1, 8, 8;
; CHECK-NEXT:    bfe.s32 %r4, %r1, 16, 8;
; CHECK-NEXT:    bfe.s32 %r5, %r1, 24, 8;
; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  ; Reinterpret the scalar as a four-lane byte vector.
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  ; Signed widening to the full 32-bit result type.
  %r0i = sext i8 %r0 to i32
  %r1i = sext i8 %r1 to i32
  %r2i = sext i8 %r2 to i32
  %r3i = sext i8 %r3 to i32
  %r01 = add i32 %r0i, %r1i
  %r23 = add i32 %r2i, %r3i
  %r = add i32 %r01, %r23
  ret i32 %r
}

; Unsigned counterpart of the i32 <4 x i8> test: each extracted lane is
; zero-extended to i32 before the lanes are summed.
; The expected PTX therefore uses the unsigned bit-field extract (bfe.u32)
; for every lane; the adds themselves are unchanged.
define i32  @test_v4i8_u32(i32 %a) {
; CHECK-LABEL: test_v4i8_u32(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u32 %r1, [test_v4i8_u32_param_0];
; CHECK-NEXT:    bfe.u32 %r2, %r1, 0, 8;
; CHECK-NEXT:    bfe.u32 %r3, %r1, 8, 8;
; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
; CHECK-NEXT:    bfe.u32 %r5, %r1, 24, 8;
; CHECK-NEXT:    add.s32 %r6, %r2, %r3;
; CHECK-NEXT:    add.s32 %r7, %r4, %r5;
; CHECK-NEXT:    add.s32 %r8, %r6, %r7;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
; CHECK-NEXT:    ret;
  ; Reinterpret the scalar as a four-lane byte vector.
  %v = bitcast i32 %a to <4 x i8>
  %r0 = extractelement <4 x i8> %v, i64 0
  %r1 = extractelement <4 x i8> %v, i64 1
  %r2 = extractelement <4 x i8> %v, i64 2
  %r3 = extractelement <4 x i8> %v, i64 3
  ; Unsigned widening to the full 32-bit result type.
  %r0i = zext i8 %r0 to i32
  %r1i = zext i8 %r1 to i32
  %r2i = zext i8 %r2 to i32
  %r3i = zext i8 %r3 to i32
  %r01 = add i32 %r0i, %r1i
  %r23 = add i32 %r2i, %r3i
  %r = add i32 %r01, %r23
  ret i32 %r
}



; Extract all eight lanes of a <8 x i8> that is produced by bitcasting an i64
; argument, sign-extend each lane to i16, and return the i16 sum.
; The expected PTX first splits the 64-bit value into two 32-bit registers:
; %r1 receives the second element of a mov.b64 unpack (the upper half per the
; PTX mov semantics) and %r2 receives the cvt.u32.u64 truncation (the lower
; half). Lanes 0-3 are then extracted from %r2 and lanes 4-7 from %r1, each
; with bfe.s32 followed by cvt.s8.s32, and summed with 16-bit adds.
define i16  @test_v8i8(i64 %a) {
; CHECK-LABEL: test_v8i8(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<16>;
; CHECK-NEXT:    .reg .b32 %r<12>;
; CHECK-NEXT:    .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.u64 %rd1, [test_v8i8_param_0];
; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
; CHECK-NEXT:    bfe.s32 %r3, %r2, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs1, %r3;
; CHECK-NEXT:    bfe.s32 %r4, %r2, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs2, %r4;
; CHECK-NEXT:    bfe.s32 %r5, %r2, 16, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs3, %r5;
; CHECK-NEXT:    bfe.s32 %r6, %r2, 24, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs4, %r6;
; CHECK-NEXT:    bfe.s32 %r7, %r1, 0, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs5, %r7;
; CHECK-NEXT:    bfe.s32 %r8, %r1, 8, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs6, %r8;
; CHECK-NEXT:    bfe.s32 %r9, %r1, 16, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs7, %r9;
; CHECK-NEXT:    bfe.s32 %r10, %r1, 24, 8;
; CHECK-NEXT:    cvt.s8.s32 %rs8, %r10;
; CHECK-NEXT:    add.s16 %rs9, %rs1, %rs2;
; CHECK-NEXT:    add.s16 %rs10, %rs3, %rs4;
; CHECK-NEXT:    add.s16 %rs11, %rs5, %rs6;
; CHECK-NEXT:    add.s16 %rs12, %rs7, %rs8;
; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
; CHECK-NEXT:    cvt.u32.u16 %r11, %rs15;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r11;
; CHECK-NEXT:    ret;
  ; Reinterpret the scalar as an eight-lane byte vector.
  %v = bitcast i64 %a to <8 x i8>
  %r0 = extractelement <8 x i8> %v, i64 0
  %r1 = extractelement <8 x i8> %v, i64 1
  %r2 = extractelement <8 x i8> %v, i64 2
  %r3 = extractelement <8 x i8> %v, i64 3
  %r4 = extractelement <8 x i8> %v, i64 4
  %r5 = extractelement <8 x i8> %v, i64 5
  %r6 = extractelement <8 x i8> %v, i64 6
  %r7 = extractelement <8 x i8> %v, i64 7
  %r0i = sext i8 %r0 to i16
  %r1i = sext i8 %r1 to i16
  %r2i = sext i8 %r2 to i16
  %r3i = sext i8 %r3 to i16
  %r4i = sext i8 %r4 to i16
  %r5i = sext i8 %r5 to i16
  %r6i = sext i8 %r6 to i16
  %r7i = sext i8 %r7 to i16
  ; Balanced reduction tree: four pair sums, two quad sums, one final sum.
  %r01 = add i16 %r0i, %r1i
  %r23 = add i16 %r2i, %r3i
  %r45 = add i16 %r4i, %r5i
  %r67 = add i16 %r6i, %r7i
  %r0123 = add i16 %r01, %r23
  %r4567 = add i16 %r45, %r67
  %r = add i16 %r0123, %r4567
  ret i16 %r
}
