; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0

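; What this exercises (a reading of the reduced IR below, not a spec): each
; fshl(x, x, 15) / fshl(x, x, 13) call with identical first two operands is a
; per-lane rotate, and since rotl(x, 15) == rotr(x, 17) and
; rotl(x, 13) == rotr(x, 19) on i32, each xor/lshr-by-10 cluster computes the
; SHA-256 sigma1 pattern rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10). AVX2 has no
; vector rotate instruction (that arrives with AVX-512), so each rotate is
; expected to lower to the vpsrld/vpslld/vpor triple seen in the CHECK lines.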
define void @SHA256_Compress_Generic(ptr noundef %ctx) #1 {
; CHECK-LABEL: SHA256_Compress_Generic:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    movbel 0, %eax
; CHECK-NEXT:    movbel 12(%rdi), %ecx
; CHECK-NEXT:    vmovd %eax, %xmm0
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,0,1,2,3,128,128,128,128,128,128,128,128]
; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm0
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm3
; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %ecx, %xmm3
; CHECK-NEXT:    vpshufb %xmm1, %xmm3, %xmm1
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm0
; CHECK-NEXT:    vpslld $15, %xmm1, %xmm3
; CHECK-NEXT:    vpor %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm3
; CHECK-NEXT:    vpslld $13, %xmm1, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm0, %xmm0
; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm0
; CHECK-NEXT:    vpxor %xmm0, %xmm3, %xmm0
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3]
; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,0,2,3]
; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm3
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm1
; CHECK-NEXT:    vpsrld $17, %xmm1, %xmm2
; CHECK-NEXT:    vpslld $15, %xmm1, %xmm4
; CHECK-NEXT:    vpor %xmm2, %xmm4, %xmm2
; CHECK-NEXT:    vpsrld $19, %xmm1, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm1, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $10, %xmm1, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrld $17, %xmm2, %xmm3
; CHECK-NEXT:    vpslld $15, %xmm2, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpsrld $19, %xmm2, %xmm4
; CHECK-NEXT:    vpslld $13, %xmm2, %xmm5
; CHECK-NEXT:    vpor %xmm4, %xmm5, %xmm4
; CHECK-NEXT:    vpxor %xmm4, %xmm3, %xmm3
; CHECK-NEXT:    vpsrld $10, %xmm2, %xmm2
; CHECK-NEXT:    vpxor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpsrld $17, %xmm0, %xmm2
; CHECK-NEXT:    vpslld $15, %xmm0, %xmm3
; CHECK-NEXT:    vpor %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vpsrld $19, %xmm0, %xmm3
; CHECK-NEXT:    vpslld $13, %xmm0, %xmm4
; CHECK-NEXT:    vpor %xmm3, %xmm4, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT:    vpsrld $10, %xmm0, %xmm3
; CHECK-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; CHECK-NEXT:    vpsllq $32, %xmm1, %xmm3
; CHECK-NEXT:    vpaddd %xmm2, %xmm3, %xmm2
; CHECK-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT:    vmovdqu %ymm0, 132(%rdi)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %0 = load i32, ptr null, align 4
  %1 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %0) #3
  %arrayidx14 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 3
  %2 = load i32, ptr %arrayidx14, align 4
  %3 = tail call i32 asm "bswap $0", "=r,0,~{dirflag},~{fpsr},~{flags}"(i32 %2) #3
  %4 = insertelement <2 x i32> zeroinitializer, i32 %1, i64 1
  %5 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 15, i32 15>)
  %6 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %4, <2 x i32> %4, <2 x i32> <i32 13, i32 13>)
  %7 = xor <2 x i32> %5, %6
  %8 = lshr <2 x i32> %4, zeroinitializer
  %9 = xor <2 x i32> %7, %8
  %10 = insertelement <2 x i32> zeroinitializer, i32 %3, i64 0
  %11 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %10, <2 x i32> <i32 1, i32 2>
  %12 = add <2 x i32> %11, %9
  %13 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 15, i32 15>)
  %14 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %12, <2 x i32> %12, <2 x i32> <i32 13, i32 13>)
  %15 = xor <2 x i32> %13, %14
  %16 = lshr <2 x i32> %12, zeroinitializer
  %17 = xor <2 x i32> %15, %16
  %18 = add <2 x i32> %4, %17
  %19 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 15, i32 15>)
  %20 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %18, <2 x i32> %18, <2 x i32> <i32 13, i32 13>)
  %21 = xor <2 x i32> %19, %20
  %22 = lshr <2 x i32> %18, <i32 10, i32 10>
  %23 = xor <2 x i32> %21, %22
  %24 = add <2 x i32> %4, %23
  %25 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 15, i32 15>)
  %26 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %24, <2 x i32> %24, <2 x i32> <i32 13, i32 13>)
  %27 = xor <2 x i32> %25, %26
  %28 = lshr <2 x i32> %24, <i32 10, i32 10>
  %29 = xor <2 x i32> %27, %28
  %30 = shufflevector <2 x i32> %4, <2 x i32> %12, <2 x i32> <i32 1, i32 2>
  %31 = add <2 x i32> %30, %29
  %32 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 15, i32 15>)
  %33 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %31, <2 x i32> %31, <2 x i32> <i32 13, i32 13>)
  %34 = xor <2 x i32> %32, %33
  %35 = lshr <2 x i32> %31, <i32 10, i32 10>
  %36 = xor <2 x i32> %34, %35
  %37 = shufflevector <2 x i32> %12, <2 x i32> zeroinitializer, <2 x i32> <i32 1, i32 2>
  %38 = add <2 x i32> %37, %36
  %arrayidx918 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 33
  store <2 x i32> %38, ptr %arrayidx918, align 4
  %arrayidx1012 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 35
  %39 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 15, i32 15>)
  %40 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %38, <2 x i32> %38, <2 x i32> <i32 13, i32 13>)
  %41 = xor <2 x i32> %39, %40
  %42 = lshr <2 x i32> %38, <i32 10, i32 10>
  %43 = xor <2 x i32> %41, %42
  %44 = add <2 x i32> %37, %43
  store <2 x i32> zeroinitializer, ptr %arrayidx1012, align 4
  %arrayidx1106 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 37
  %45 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 15, i32 15>)
  %46 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %44, <2 x i32> %44, <2 x i32> <i32 13, i32 13>)
  %47 = xor <2 x i32> %45, %46
  %48 = lshr <2 x i32> %44, <i32 10, i32 10>
  %49 = xor <2 x i32> %47, %48
  %50 = lshr <2 x i32> %24, zeroinitializer
  %51 = add <2 x i32> %50, %49
  store <2 x i32> %51, ptr %arrayidx1106, align 4
  %arrayidx1200 = getelementptr inbounds [64 x i32], ptr %ctx, i64 0, i64 39
  %52 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 15, i32 15>)
  %53 = tail call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %51, <2 x i32> %51, <2 x i32> <i32 13, i32 13>)
  %54 = xor <2 x i32> %52, %53
  %55 = lshr <2 x i32> %51, <i32 10, i32 10>
  %56 = xor <2 x i32> %54, %55
  %57 = shufflevector <2 x i32> %38, <2 x i32> zeroinitializer, <2 x i32> <i32 poison, i32 0>
  %58 = insertelement <2 x i32> %57, i32 0, i64 0
  %59 = add <2 x i32> %58, %56
  store <2 x i32> %59, ptr %arrayidx1200, align 4
  ret void

; uselistorder directives
  uselistorder <2 x i32> %4, { 7, 0, 1, 6, 5, 4, 3, 2 }
  uselistorder <2 x i32> %38, { 6, 5, 4, 3, 2, 1, 0 }
}

declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) #2

; uselistorder directives
uselistorder ptr @llvm.fshl.v2i32, { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }
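; (The uselistorder directives above pin each value's use-list order when the
; IR is reparsed, keeping use-list-order-sensitive codegen reproducible.)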

attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { nounwind sspstrong memory(argmem: readwrite) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "probe-stack"="inline-asm" "stack-protector-buffer-size"="8" "target-cpu"="skylake" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+cmov,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #3 = { nounwind memory(none) }
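
; To refresh the CHECK lines after a codegen change, rerun the update script
; named in the NOTE above from an LLVM checkout (a sketch of the usual
; invocation; the test path is illustrative):
;   llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/<this-test>.ll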