Re: Kiedy będzie milion rdzeni? - Grupy dyskusyjne w eGospodarka.pl

eGospodarka.pl › Grupy › pl.comp.programming › Kiedy będzie milion rdzeni? › Re: Kiedy będzie milion rdzeni?

X-Received: by 2002:ac8:342a:: with SMTP id u39mr12203090qtb.7.1569148403252; Sun, 22
Sep 2019 03:33:23 -0700 (PDT)
X-Received: by 2002:ac8:342a:: with SMTP id u39mr12203090qtb.7.1569148403252; Sun, 22
Sep 2019 03:33:23 -0700 (PDT)
Path: news-archive.icm.edu.pl!news.icm.edu.pl!wsisiz.edu.pl!goblin1!goblin.stu.neva.r
u!o24no9238470qtl.0!news-out.google.com!x7ni1355qtf.0!nntp.google.com!o24no9238
465qtl.0!postnews.google.com!glegroupsg2000goo.googlegroups.com!not-for-mail
Newsgroups: pl.comp.programming
Date: Sun, 22 Sep 2019 03:33:22 -0700 (PDT)
In-Reply-To: <3...@g...com>
Complaints-To: g...@g...com
Injection-Info: glegroupsg2000goo.googlegroups.com; posting-host=5.172.255.70;
posting-account=Sb6m8goAAABbWsBL7gouk3bfLsuxwMgN
NNTP-Posting-Host: 5.172.255.70
References: <d...@g...com>
<b...@g...com>
<8...@g...com>
<b...@g...com>
<3...@g...com>
User-Agent: G2/1.0
MIME-Version: 1.0
Message-ID: <9...@g...com>
Subject: Re: Kiedy będzie milion rdzeni?
From: fir <p...@g...com>
Injection-Date: Sun, 22 Sep 2019 10:33:23 +0000
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
Xref: news-archive.icm.edu.pl pl.comp.programming:214084
[ ukryj nagłówki ]
W dniu piątek, 20 września 2019 15:00:16 UTC+2 użytkownik M.M. napisał:
> On Thursday, September 19, 2019 at 4:55:42 AM UTC+2, k...@g...com wrote:
> > W dniu wtorek, 17 września 2019 21:15:49 UTC+2 użytkownik M.M. napisał:
> > > Ciekawe jak w praktyce wygląda przyspieszenie obliczeń na Tesli
> > > względem Xeon Phi. I ciekawe czy w ogóle warto inwestować w te drogie
> > > rozwiązania, jak za 150usd można kupić: GeForce GTX 1650 - tania,
> > > wydajna, a 10 takich kart z lekkim underclockingiem na niektórych
> > > obliczeniach pobiera tylko 500 wat mocy. No ale na GPU trzeba się
> > > nauczyć jakiegoś OpenCL albo CUDA.
> > Jak ktoś umie napisać sensownie równoległy kod to OpenCL/CUDA nie są
> > żadnym problemem. Owszem, to nie są jakieś przepiękne API, ale
> > praktycznie każdy ostro sprzętowy kod w C wygląda równie źle;)
> > Do tego można się tego uczyć bezproblemowo na dowolnym komputerze
> > z jakąś sensowną kartą graficzną, odpalenie Hello World na CUDA
> > (domyślny program to bodajże jakieś równoległe dodawanie wektorów
> > czy tam sortowanie) zajmuje jakieś 15 minut, z czego 10 to rejestracja
> > na stronie nvidii żeby ściągnąć toolchain. To jest wręcz przerażająco
> > łatwe w porównaniu do dawnego oprogramowywania "grubych" platform
> > obliczeniowych gdzie potrzeba było komercyjnego kompilatora za grubą
> > kasę na uczelnianym klastrze i wczytywania się w dokumentację żeby
> > coś się w ogóle uruchomiło.
> >
> > Pozdrawiam,
> > --
> > Karol Piotrowski
>
> Możesz polecić jakiś praktyczny tutorial od podstaw OpenCLa dla kogoś, kto wie
> co to C++ i programowanie równoległe na CPU, ale z GPU nie miał nigdy
> do czynienia?
>
> Pozdrawiam

Mogę wkleić jakieś moje stare notatki/próby z OpenCL (z 2015; nie pamiętam, czy to
ostatnia wersja, ale pamiętam, że to działało i że wersja OpenCL renderowania Mandelbrota
była najszybsza — pisałem o tym na grupie, więc gdzieś jest o tym wątek z 2015).

Ale szczerze mówiąc, nie wiem, czy polecałbym się tym zajmować. Jeśli ktoś chce zbierać
odznaki programistycznego skauta i wpisać OpenCL do CV, to na pewno
- w innym wypadku chyba nie.

#include "fist.h"
////////////////////////////////////////////////////
////////////////////////////

//#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
//#include <string.h>
#include <math.h>
//#include <unistd.h>
//#include <sys/types.h>
//#include <sys/stat.h>
//#include <OpenCL/opencl.h>
//#include "clew.c"
#include "clew.h"
int clewInit(const char* path);

////////////////////////////////////////////////////
////////////////////////////

// Use a static data size for simplicity
//
//const int DATA_SIZE = 130*1024;

// Problem dimensions. They are enumerators rather than `const int` objects so
// that they are integer constant expressions: C (unlike C++) does not accept a
// `const int` as the bound of a file-scope array such as data[] / results[].
enum
{
    OCL_INPUT_DATA_SIZE  = 1*1024,   // element count of the host input array / device input buffer
    OCL_OUTPUT_DATA_SIZE = 256*256,  // element count of the host result array / device output buffer
    OCL_PROCESSING_RANGE = 256*256   // number of work-items enqueued per kernel launch
};

//int DATA_SIZE = DATA_SIZE_MAX/10;

////////////////////////////////////////////////////
////////////////////////////

// Simple compute kernel which computes the square of an input array
//
// const char *KernelSource = "\n" \
// "__kernel void square( \n" \
// " __global float* input, \n" \
// " __global float* output, \n" \
// " const unsigned int count) \n" \
// "{ \n" \
// " int i = get_global_id(0); \n" \
// " if(i < count) \n" \
// " output[i] = input[i] * input[i]; \n" \
// "} \n" \
// "\n";

// const char *KernelSource = "\n" \
// "__kernel void square( \n" \
// " __global int* input, \n" \
// " __global int* output, \n" \
// " const unsigned int count) \n" \
// "{ \n" \
// " int i = get_global_id(0); \n" \
// " if(i < count) \n" \
// " output[i] = i+input[i] *2 ; \n" \
// "} \n" \
// "\n";

// OpenCL C source of the kernel named "square" (the name is what SetupCL
// passes to clCreateKernel; the kernel itself renders a Mandelbrot image).
//
// Work-item i (for i < count) maps to pixel x = i%256, y = i/256 of a
// 256x256 image and to the complex point
//     c = (-2.0 + 3*x/256,  -1.5 + 3*y/256).
// It iterates z = z*z + c starting from z = 0 for n = 0..1000. As soon as
// |z|^2 > 4 it stores n + 256*n + 65536*n in output[256*y+x] and returns;
// points that never escape get 250<<8. The `input` argument is not read.
//
// The literal is built from adjacent string literals with no backslash
// continuations: a trailing backslash on a `//` line splices the NEXT line
// into the comment, which previously commented out the kernel text.
const char *KernelSource =
    "\n"
    "__kernel void square( \n"
    "    __global int* input, \n"
    "    __global int* output, \n"
    "    const unsigned int count) \n"
    "{ \n"
    "    int i = get_global_id(0); \n"
    "    if(i < count) \n"
    "    { \n"
    "        int x = i%256; \n"
    "        // if(x>=256) return; \n"
    "        int y = i/256; \n"
    "        // if(y>=256) return; \n"
    "        float cRe = -0.5 + -1.5 + x/256.*3.; \n"
    "        float cIm = 0.0 + -1.5 + y/256.*3.; \n"
    "        float re = 0; \n"
    "        float im = 0; \n"
    "        int n = 0; \n"
    "        for( n=0; n<=1000; n++) { \n"
    "            if( re * re + im * im > 4.0 ) { output[256*y+x] = n + 256*n + 256*256*n; return;} \n"
    "            float re_n = re * re - im * im + cRe; \n"
    "            float im_n = 2 * re * im + cIm; \n"
    "            re = re_n; \n"
    "            im = im_n; \n"
    "        } \n"
    "        output[256*y+x] = 250<<8; \n"
    "    } \n"
    "} \n"
    "\n";

// Host-side input array; RunOpenCLtask uploads it to the `input` device buffer.
int data[OCL_INPUT_DATA_SIZE];
// Host-side result array; RunOpenCLtask fills it from the `output` device buffer.
int results[OCL_OUTPUT_DATA_SIZE];

// void SetupInputData()
// {
// // Fill our data set with random float values
// //
// int i = 0;
// unsigned int count = DATA_SIZE;
// for(i = 0; i < count; i++)
// data[i] = rand() / (float)RAND_MAX;
// }
////////////////////////////////////////////////////
////////////////////////////
// Status of the most recent OpenCL call; written by SetupCL and RunOpenCLtask.
int err;
// Only referenced by commented-out validation code in this file.
unsigned int correct;

// Global work size; RunOpenCLtask sets it to OCL_PROCESSING_RANGE before each launch.
size_t global;
// Work-group size; SetupCL fills it from CL_KERNEL_WORK_GROUP_SIZE.
size_t local;

// Handles created by SetupCL and released by ShutdownOpenCl
// (device_id is only looked up, it is not released).
cl_device_id device_id; // device returned by clGetDeviceIDs
cl_context context; // context created for device_id
cl_command_queue commands; // command queue on that context/device
cl_program program; // program built from KernelSource
cl_kernel kernel; // the "square" kernel taken from program

cl_mem input; // device buffer bound to kernel argument 0
cl_mem output; // device buffer bound to kernel argument 1

int SetupCL()
{
static int initialised =0;
if(initialised) return 0;
initialised=1;

clewInit("OpenCl.dll");

int writeCLInfo();

// writeCLInfo();

//////// platform
static cl_platform_id platform_id[10] = {0};
cl_uint no_of_platforms_found = 0;

int ret = clGetPlatformIDs(10, platform_id, &no_of_platforms_found);
// if(ret == CL_SUCCESS ) ERROR_("clGetPlatformIDs success") ;
// alert(" %d platforms found\n",no_of_platforms_found ) ;

// Connect to a compute device
//
int gpu = 1;
err = clGetDeviceIDs(platform_id[0], gpu ? CL_DEVICE_TYPE_GPU :
CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
ERROR_("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}

// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
ERROR_("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}

// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
ERROR_("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}

// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource,
NULL, &err);
if (!program)
{
ERROR_("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}

// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];

ERROR_("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
sizeof(buffer), buffer, &len);
ERROR__("%s\n", buffer);
exit(1);
}

// Create the compute kernel in the program we wish to run
//
kernel = clCreateKernel(program, "square", &err);
if (!kernel || err != CL_SUCCESS)
{
ERROR_("Error: Failed to create compute kernel! \n");
exit(1);
}

// Create the input and output arrays in device memory for our calculation
//
if(OCL_INPUT_DATA_SIZE>0)
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) *
OCL_INPUT_DATA_SIZE, NULL, NULL);

output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) *
OCL_OUTPUT_DATA_SIZE, NULL, NULL);

if (!input || !output)
{
ERROR_("Error: Failed to allocate device memory!\n");
exit(1);
}

////////////////////////

// Set the arguments to our compute kernel
//
err = 0;
int count = OCL_PROCESSING_RANGE;

err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);

if (err != CL_SUCCESS)
{
ERROR__("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}

// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
ERROR__("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}

return 0;
}

///////////////////

// Runs the kernel once and copies the result image into frame_bitmap.
//
// Steps: upload data[] to the `input` buffer, enqueue the kernel over
// OCL_PROCESSING_RANGE work-items, wait for completion, read the `output`
// buffer into results[], then copy results[] into the top-left 256-wide
// region of frame_bitmap.
//
// Expects SetupCL to have succeeded beforehand (it uses the handles and the
// `local` size SetupCL sets up). Returns 0 on success; OpenCL failures are
// reported through ERROR_EXIT / exit(1) exactly as before.
int RunOpenCLtask()
{
    // Width of the computed image (the kernel hard-codes the same 256) and
    // the row stride used when writing into frame_bitmap.
    // NOTE(review): 550 is presumably the frame width in pixels - confirm.
    enum { IMAGE_WIDTH = 256, FRAME_STRIDE = 550 };

    // Write our data set into the input array in device memory.
    // data[] is uploaded as it is; nothing in this function fills it.
    //
    if (OCL_INPUT_DATA_SIZE > 0)
    {
        err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(data),
                                   data, 0, NULL, NULL);
        if (err != CL_SUCCESS)
        {
            ERROR_EXIT("Error: Failed to write to source array!\n");
        }
    }

    // Execute the kernel over the entire 1d processing range.
    // An explicit work-group size is only valid when it divides the global
    // size; otherwise pass NULL and let the runtime choose one.
    //
    global = OCL_PROCESSING_RANGE;
    const size_t *group_size = (local != 0 && global % local == 0) ? &local : NULL;

    err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, group_size,
                                 0, NULL, NULL);
    if (err)
    {
        ERROR_EXIT("Error: Failed to execute kernel!\n");
    }

    // Wait for the command queue to get serviced before reading back results
    //
    clFinish(commands);

    // Read back the results from the device (blocking read)
    //
    err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0, sizeof(results),
                              results, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        ERROR__("Error: Failed to read output array! %d\n", err);
        exit(1);
    }

    // Copy the image row by row into the frame, whose rows are wider than
    // the image.
    for (int i = 0; i < OCL_OUTPUT_DATA_SIZE; i++)
    {
        int x = i % IMAGE_WIDTH;
        int y = i / IMAGE_WIDTH;
        frame_bitmap[y * FRAME_STRIDE + x] = results[i];
    }

    return 0;
}

// Releases every OpenCL object created by SetupCL.
//
// Each handle is released only if it is non-null and is reset to NULL
// afterwards, so the function is safe to call before a successful setup
// (when no OpenCL entry point may be loaded yet) and safe to call twice.
void ShutdownOpenCl()
{
    if (input)
    {
        clReleaseMemObject(input);
        input = NULL;
    }
    if (output)
    {
        clReleaseMemObject(output);
        output = NULL;
    }
    if (program)
    {
        clReleaseProgram(program);
        program = NULL;
    }
    if (kernel)
    {
        clReleaseKernel(kernel);
        kernel = NULL;
    }
    if (commands)
    {
        clReleaseCommandQueue(commands);
        commands = NULL;
    }
    if (context)
    {
        clReleaseContext(context);
        context = NULL;
    }
}

int writeCLInfo()
{

int i, j;
char* value;
size_t valueSize;
cl_uint platformCount;
cl_platform_id* platforms;
cl_uint deviceCount;
cl_device_id* devices;
cl_uint maxComputeUnits;

// get all platforms

clGetPlatformIDs(0, NULL, &platformCount);

platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);

clGetPlatformIDs(platformCount, platforms, NULL);

for (i = 0; i < platformCount; i++) {

// get all devices

clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);

devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);

clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);

// for each device print critical attributes

for (j = 0; j < deviceCount; j++) {

// print device name

clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);

value = (char*) malloc(valueSize);

clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);

printf("%d. Device: %s\n", j+1, value);

free(value);

// print hardware device version

clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);

value = (char*) malloc(valueSize);

clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);

printf(" %d.%d Hardware version: %s\n", j+1, 1, value);

free(value);

// print software driver version

clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);

value = (char*) malloc(valueSize);

clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);

printf(" %d.%d Software version: %s\n", j+1, 2, value);

free(value);

// print c version supported by compiler for device

clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL,
&valueSize);

value = (char*) malloc(valueSize);

clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value,
NULL);

printf(" %d.%d OpenCL C version: %s\n", j+1, 3, value);

free(value);

// print parallel compute units

clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,

sizeof(maxComputeUnits), &maxComputeUnits, NULL);

printf(" %d.%d Parallel compute units: %d\n", j+1, 4, maxComputeUnits);

}

free(devices);

}

free(platforms);

return 0;
}