Add AMD BFI_INT optimizations to SHA-2 and RIPEMD160 hash functions.
The full effect of this optimization will come later.
This commit is contained in:
@@ -726,12 +726,22 @@ sha2_256_init(uint *out)
|
||||
|
||||
/* The state variable remapping is really contorted */
|
||||
#define sha2_stvar(vals, i, v) vals[(64+v-i) % 8]
|
||||
#define sha2_s0(a) (rotate(a, 30U) ^ rotate(a, 19U) ^ rotate(a, 10U))
|
||||
#define sha2_s1(a) (rotate(a, 26U) ^ rotate(a, 21U) ^ rotate(a, 7U))
|
||||
#if defined(AMD_BFI_INT)
|
||||
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
|
||||
#define sha2_ch(a, b, c) amd_bytealign(a, b, c)
|
||||
#define sha2_ma(a, b, c) amd_bytealign((a^c), b, a)
|
||||
#else
|
||||
#define sha2_ch(a, b, c) (c ^ (a & (b ^ c)))
|
||||
#define sha2_ma(a, b, c) ((a & c) | (b & (a | c)))
|
||||
#endif
|
||||
|
||||
void
|
||||
sha2_256_block(uint *out, uint *in)
|
||||
{
|
||||
int i;
|
||||
uint state[8], s0, s1, t1, t2;
|
||||
uint state[8], t1, t2;
|
||||
#ifdef UNROLL_MAX
|
||||
#pragma unroll UNROLL_MAX
|
||||
#endif
|
||||
@@ -751,18 +761,17 @@ sha2_256_block(uint *out, uint *in)
|
||||
}
|
||||
|
||||
/* Compute the t1, t2 augmentations */
|
||||
t1 = sha2_stvar(state, i, 4);
|
||||
t2 = sha2_stvar(state, i, 0);
|
||||
s0 = (rotate(t2, 30U) ^ rotate(t2, 19U) ^ rotate(t2, 10U));
|
||||
s1 = (rotate(t1, 26U) ^ rotate(t1, 21U) ^ rotate(t1, 7U));
|
||||
|
||||
t1 = (sha2_stvar(state, i, 7) + s1 + sha2_k[i] + in[i % 16] +
|
||||
((t1 & sha2_stvar(state, i, 5)) ^
|
||||
(~t1 & sha2_stvar(state, i, 6))));
|
||||
t2 = s0 + ((t2 & sha2_stvar(state, i, 1)) ^
|
||||
(t2 & sha2_stvar(state, i, 2)) ^
|
||||
(sha2_stvar(state, i, 1) & sha2_stvar(state, i, 2)));
|
||||
|
||||
t1 = (sha2_stvar(state, i, 7) +
|
||||
sha2_s1(sha2_stvar(state, i, 4)) +
|
||||
sha2_ch(sha2_stvar(state, i, 4),
|
||||
sha2_stvar(state, i, 5),
|
||||
sha2_stvar(state, i, 6)) +
|
||||
sha2_k[i] +
|
||||
in[i % 16]);
|
||||
t2 = (sha2_s0(sha2_stvar(state, i, 0)) +
|
||||
sha2_ma(sha2_stvar(state, i, 0),
|
||||
sha2_stvar(state, i, 1),
|
||||
sha2_stvar(state, i, 2)));
|
||||
sha2_stvar(state, i, 3) += t1;
|
||||
sha2_stvar(state, i, 7) = t1 + t2;
|
||||
}
|
||||
@@ -817,11 +826,19 @@ __constant uchar ripemd160_rlp[] = {
|
||||
|
||||
#define ripemd160_val(v, i, n) (v)[(80+(n)-(i)) % 5]
|
||||
#define ripemd160_valp(v, i, n) (v)[5 + ((80+(n)-(i)) % 5)]
|
||||
#if defined(AMD_BFI_INT)
|
||||
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
|
||||
#define ripemd160_f1(x, y, z) amd_bytealign(x, y, z)
|
||||
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
|
||||
#define ripemd160_f3(x, y, z) amd_bytealign(z, x, y)
|
||||
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
|
||||
#else
|
||||
#define ripemd160_f0(x, y, z) (x ^ y ^ z)
|
||||
#define ripemd160_f1(x, y, z) ((x & y) | (~x & z))
|
||||
#define ripemd160_f2(x, y, z) ((x | ~y) ^ z)
|
||||
#define ripemd160_f2(x, y, z) (z ^ (x | ~y))
|
||||
#define ripemd160_f3(x, y, z) ((x & z) | (y & ~z))
|
||||
#define ripemd160_f4(x, y, z) (x ^ (y | ~z))
|
||||
#endif
|
||||
#define ripemd160_round(i, in, vals, f, fp, t) do { \
|
||||
ripemd160_val(vals, i, 0) = \
|
||||
rotate(ripemd160_val(vals, i, 0) + \
|
||||
|
||||
206
oclvanitygen.c
206
oclvanitygen.c
@@ -264,11 +264,11 @@ vg_ocl_device_getulong(cl_device_id did, cl_device_info param)
|
||||
return val;
|
||||
}
|
||||
|
||||
size_t
|
||||
cl_uint
|
||||
vg_ocl_device_getuint(cl_device_id did, cl_device_info param)
|
||||
{
|
||||
cl_int ret;
|
||||
size_t val;
|
||||
cl_uint val;
|
||||
size_t size_ret;
|
||||
ret = clGetDeviceInfo(did, param, sizeof(val), &val, &size_ret);
|
||||
if (ret != CL_SUCCESS) {
|
||||
@@ -288,8 +288,9 @@ vg_ocl_dump_info(vg_ocl_context_t *vocp)
|
||||
did = vocp->voc_ocldid;
|
||||
printf("Device: %s\n",
|
||||
vg_ocl_device_getstr(did, CL_DEVICE_NAME));
|
||||
printf("Vendor: %s\n",
|
||||
vg_ocl_device_getstr(did, CL_DEVICE_VENDOR));
|
||||
printf("Vendor: %s (%04x)\n",
|
||||
vg_ocl_device_getstr(did, CL_DEVICE_VENDOR),
|
||||
vg_ocl_device_getuint(did, CL_DEVICE_VENDOR_ID));
|
||||
printf("Driver: %s\n",
|
||||
vg_ocl_device_getstr(did, CL_DRIVER_VERSION));
|
||||
printf("Profile: %s\n",
|
||||
@@ -385,29 +386,32 @@ enum {
|
||||
VG_OCL_UNROLL_LOOPS = (1 << 0),
|
||||
VG_OCL_EXPENSIVE_BRANCHES = (1 << 1),
|
||||
VG_OCL_DEEP_VLIW = (1 << 2),
|
||||
VG_OCL_NV_VERBOSE = (1 << 3),
|
||||
VG_OCL_BROKEN = (1 << 4),
|
||||
VG_OCL_NO_BINARIES = (1 << 5),
|
||||
VG_OCL_AMD_BFI_INT = (1 << 3),
|
||||
VG_OCL_NV_VERBOSE = (1 << 4),
|
||||
VG_OCL_BROKEN = (1 << 5),
|
||||
VG_OCL_NO_BINARIES = (1 << 6),
|
||||
|
||||
VG_OCL_OPTIMIZATIONS = (VG_OCL_UNROLL_LOOPS |
|
||||
VG_OCL_EXPENSIVE_BRANCHES |
|
||||
VG_OCL_DEEP_VLIW),
|
||||
VG_OCL_DEEP_VLIW |
|
||||
VG_OCL_AMD_BFI_INT),
|
||||
|
||||
};
|
||||
|
||||
int
|
||||
vg_ocl_get_quirks(vg_ocl_context_t *vocp)
|
||||
{
|
||||
const char *vend;
|
||||
uint32_t vend;
|
||||
const char *dvn;
|
||||
unsigned int quirks = 0;
|
||||
|
||||
/* Loop unrolling for devices other than CPUs */
|
||||
if (!(vg_ocl_device_gettype(vocp->voc_ocldid) & CL_DEVICE_TYPE_CPU))
|
||||
quirks |= VG_OCL_UNROLL_LOOPS;
|
||||
|
||||
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_VENDOR);
|
||||
if (!strcmp(vend, "NVIDIA Corporation") ||
|
||||
!strcmp(vend, "NVIDIA")) {
|
||||
vend = vg_ocl_device_getuint(vocp->voc_ocldid, CL_DEVICE_VENDOR_ID);
|
||||
switch (vend) {
|
||||
case 0x10de: /* NVIDIA */
|
||||
quirks |= VG_OCL_NV_VERBOSE;
|
||||
#ifdef WIN32
|
||||
if (strcmp(vg_ocl_device_getstr(vocp->voc_ocldid,
|
||||
@@ -420,15 +424,22 @@ vg_ocl_get_quirks(vg_ocl_context_t *vocp)
|
||||
quirks |= VG_OCL_BROKEN;
|
||||
}
|
||||
#endif
|
||||
} else if (!strcmp(vend, "Advanced Micro Devices, Inc.") ||
|
||||
!strcmp(vend, "AMD")) {
|
||||
break;
|
||||
case 0x1002: /* AMD/ATI */
|
||||
quirks |= VG_OCL_EXPENSIVE_BRANCHES;
|
||||
quirks |= VG_OCL_DEEP_VLIW;
|
||||
vend = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
|
||||
if (!strcmp(vend, "ATI RV710")) {
|
||||
dvn = vg_ocl_device_getstr(vocp->voc_ocldid,
|
||||
CL_DEVICE_EXTENSIONS);
|
||||
if (dvn && strstr(dvn, "cl_amd_media_ops"))
|
||||
quirks |= VG_OCL_AMD_BFI_INT;
|
||||
dvn = vg_ocl_device_getstr(vocp->voc_ocldid, CL_DEVICE_NAME);
|
||||
if (!strcmp(dvn, "ATI RV710")) {
|
||||
quirks &= ~VG_OCL_OPTIMIZATIONS;
|
||||
quirks |= VG_OCL_NO_BINARIES;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return quirks;
|
||||
}
|
||||
@@ -481,19 +492,147 @@ vg_ocl_hash_program(vg_ocl_context_t *vocp, const char *opts,
|
||||
MD5_Final(hash_out, &ctx);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
unsigned char e_ident[16];
|
||||
uint16_t e_type;
|
||||
uint16_t e_machine;
|
||||
uint32_t e_version;
|
||||
uint32_t e_entry;
|
||||
uint32_t e_phoff;
|
||||
uint32_t e_shoff;
|
||||
uint32_t e_flags;
|
||||
uint16_t e_ehsize;
|
||||
uint16_t e_phentsize;
|
||||
uint16_t e_phnum;
|
||||
uint16_t e_shentsize;
|
||||
uint16_t e_shnum;
|
||||
uint16_t e_shstrndx;
|
||||
} vg_elf32_header_t;
|
||||
|
||||
typedef struct {
|
||||
uint32_t sh_name;
|
||||
uint32_t sh_type;
|
||||
uint32_t sh_flags;
|
||||
uint32_t sh_addr;
|
||||
uint32_t sh_offset;
|
||||
uint32_t sh_size;
|
||||
uint32_t sh_link;
|
||||
uint32_t sh_info;
|
||||
uint32_t sh_addralign;
|
||||
uint32_t sh_entsize;
|
||||
} vg_elf32_shdr_t;
|
||||
|
||||
int
|
||||
vg_ocl_amd_patch_inner(unsigned char *binary, size_t size)
|
||||
{
|
||||
vg_elf32_header_t *ehp;
|
||||
vg_elf32_shdr_t *shp, *nshp;
|
||||
uint32_t *instr;
|
||||
size_t off;
|
||||
int i, n, txt2idx, patched;
|
||||
|
||||
ehp = (vg_elf32_header_t *) binary;
|
||||
if ((size < sizeof(*ehp)) ||
|
||||
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
|
||||
!ehp->e_shoff)
|
||||
return 0;
|
||||
|
||||
off = ehp->e_shoff + (ehp->e_shstrndx * ehp->e_shentsize);
|
||||
nshp = (vg_elf32_shdr_t *) (binary + off);
|
||||
if ((off + sizeof(*nshp)) > size)
|
||||
return 0;
|
||||
|
||||
shp = (vg_elf32_shdr_t *) (binary + ehp->e_shoff);
|
||||
n = 0;
|
||||
txt2idx = 0;
|
||||
for (i = 0; i < ehp->e_shnum; i++) {
|
||||
off = nshp->sh_offset + shp[i].sh_name;
|
||||
if (((off + 6) >= size) ||
|
||||
memcmp(binary + off, ".text", 6))
|
||||
continue;
|
||||
n++;
|
||||
if (n == 2)
|
||||
txt2idx = i;
|
||||
}
|
||||
if (n != 2)
|
||||
return 0;
|
||||
|
||||
off = shp[txt2idx].sh_offset;
|
||||
instr = (uint32_t *) (binary + off);
|
||||
n = shp[txt2idx].sh_size / 4;
|
||||
patched = 0;
|
||||
for (i = 0; i < n; i += 2) {
|
||||
if (((instr[i] & 0x02001000) == 0) &&
|
||||
((instr[i+1] & 0x9003f000) == 0x0001a000)) {
|
||||
instr[i+1] ^= (0x0001a000 ^ 0x0000c000);
|
||||
patched++;
|
||||
}
|
||||
}
|
||||
|
||||
return patched;
|
||||
}
|
||||
|
||||
int
|
||||
vg_ocl_amd_patch(vg_ocl_context_t *vocp, unsigned char *binary, size_t size)
|
||||
{
|
||||
vg_context_t *vcp = vocp->base.vxc_vc;
|
||||
vg_elf32_header_t *ehp;
|
||||
unsigned char *ptr;
|
||||
size_t offset = 1;
|
||||
int ninner = 0, nrun, npatched = 0;
|
||||
|
||||
ehp = (vg_elf32_header_t *) binary;
|
||||
if ((size < sizeof(*ehp)) ||
|
||||
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\0", 8) ||
|
||||
!ehp->e_shoff)
|
||||
return 0;
|
||||
|
||||
offset = 1;
|
||||
while (offset < (size - 8)) {
|
||||
ptr = (unsigned char *) memchr(binary + offset,
|
||||
0x7f,
|
||||
size - offset);
|
||||
if (!ptr)
|
||||
return npatched;
|
||||
offset = ptr - binary;
|
||||
ehp = (vg_elf32_header_t *) ptr;
|
||||
if (((size - offset) < sizeof(*ehp)) ||
|
||||
memcmp(ehp->e_ident, "\x7f" "ELF\1\1\1\x64", 8) ||
|
||||
!ehp->e_shoff) {
|
||||
offset += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
ninner++;
|
||||
nrun = vg_ocl_amd_patch_inner(ptr, size - offset);
|
||||
npatched += nrun;
|
||||
if (vcp->vc_verbose > 1)
|
||||
printf("AMD BFI_INT: patched %d instructions "
|
||||
"in kernel %d\n",
|
||||
nrun, ninner);
|
||||
npatched++;
|
||||
offset += 1;
|
||||
}
|
||||
return npatched;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
|
||||
const char *filename, const char *opts)
|
||||
{
|
||||
FILE *kfp;
|
||||
char *buf, *tbuf;
|
||||
int len, fromsource = 0;
|
||||
int len, fromsource = 0, patched = 0;
|
||||
size_t sz, szr;
|
||||
cl_program prog;
|
||||
cl_int ret, sts;
|
||||
unsigned char prog_hash[16];
|
||||
char bin_name[64];
|
||||
|
||||
if (vcp->vc_verbose > 1)
|
||||
printf("OpenCL compiler flags: %s\n", opts ? opts : "");
|
||||
|
||||
sz = 128 * 1024;
|
||||
buf = (char *) malloc(sz);
|
||||
if (!buf) {
|
||||
@@ -568,6 +707,7 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
|
||||
}
|
||||
}
|
||||
fclose(kfp);
|
||||
rebuild:
|
||||
prog = clCreateProgramWithBinary(vocp->voc_oclctx,
|
||||
1, &vocp->voc_ocldid,
|
||||
&szr,
|
||||
@@ -581,24 +721,22 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (vcp->vc_verbose > 1)
|
||||
printf("OpenCL compiler flags: %s\n", opts ? opts : "");
|
||||
|
||||
if (vcp->vc_verbose > 0) {
|
||||
if (fromsource) {
|
||||
if (fromsource && !patched) {
|
||||
printf("Compiling kernel...");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
ret = clBuildProgram(prog, 1, &vocp->voc_ocldid, opts, NULL, NULL);
|
||||
if (ret != CL_SUCCESS) {
|
||||
if ((vcp->vc_verbose > 0) && fromsource)
|
||||
if ((vcp->vc_verbose > 0) && fromsource && !patched)
|
||||
printf("failure.\n");
|
||||
vg_ocl_error(NULL, ret, "clBuildProgram");
|
||||
} else if ((vcp->vc_verbose > 0) && fromsource) {
|
||||
} else if ((vcp->vc_verbose > 0) && fromsource && !patched) {
|
||||
printf("done!\n");
|
||||
}
|
||||
if ((ret != CL_SUCCESS) || (vcp->vc_verbose > 1)) {
|
||||
if ((ret != CL_SUCCESS) ||
|
||||
((vcp->vc_verbose > 1) && fromsource && !patched)) {
|
||||
vg_ocl_buildlog(vocp, prog);
|
||||
}
|
||||
if (ret != CL_SUCCESS) {
|
||||
@@ -641,6 +779,23 @@ vg_ocl_load_program(vg_context_t *vcp, vg_ocl_context_t *vocp,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((vocp->voc_quirks & VG_OCL_AMD_BFI_INT) && !patched) {
|
||||
patched = vg_ocl_amd_patch(vocp,
|
||||
(unsigned char *) buf, szr);
|
||||
if (patched > 0) {
|
||||
if (vcp->vc_verbose > 1)
|
||||
printf("AMD BFI_INT patch complete\n");
|
||||
clReleaseProgram(prog);
|
||||
goto rebuild;
|
||||
}
|
||||
printf("WARNING: AMD BFI_INT patching failed\n");
|
||||
if (patched < 0) {
|
||||
/* Program was incompletely modified */
|
||||
free(buf);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
kfp = fopen(bin_name, "wb");
|
||||
if (!kfp) {
|
||||
printf("WARNING: could not save CL kernel binary: %s\n",
|
||||
@@ -743,6 +898,9 @@ vg_ocl_init(vg_context_t *vcp, vg_ocl_context_t *vocp, cl_device_id did,
|
||||
if (vocp->voc_quirks & VG_OCL_DEEP_VLIW)
|
||||
end += snprintf(optbuf + end, sizeof(optbuf) - end,
|
||||
"-DDEEP_VLIW ");
|
||||
if (vocp->voc_quirks & VG_OCL_AMD_BFI_INT)
|
||||
end += snprintf(optbuf + end, sizeof(optbuf) - end,
|
||||
"-DAMD_BFI_INT ");
|
||||
if (vocp->voc_quirks & VG_OCL_NV_VERBOSE)
|
||||
end += snprintf(optbuf + end, sizeof(optbuf) - end,
|
||||
"-cl-nv-verbose ");
|
||||
|
||||
Reference in New Issue
Block a user