// Host-side driver for a CUDA vector example: sets up a 1-D launch
// configuration, allocates a host vector of floats, copies the device vector
// back to the host after a kernel launch, prints the first 20 elements and
// releases the buffers.
//
// Visible steps, in order:
//   - p = 256 threads per block, lengthvector = 10240 elements.
//   - dim3 threads(p,1,1) and dim3 grid(lengthvector/p,1,1). The division is
//     integer division, so, as the inline comment says, lengthvector should be
//     a multiple of p for the blocks to cover the whole array.
//   - sharedMemSize = 0 and memSize = lengthvector*sizeof(float) bytes.
//   - h_vector is malloc'ed with memSize bytes; the result is not checked.
//   - A loop that fills the host vector, followed by a kernel launch taking
//     d_vector as its only argument (both damaged, see the notes below).
//   - cudaMemcpy of memSize bytes from d_vector to h_vector
//     (cudaMemcpyDeviceToHost), then CUT_CHECK_ERROR("Kernel execution failed").
//   - Prints "after" and h_vector[0..19] with format "%.1f".
//   - cudaFree(d_vector), free(h_vector), CUT_EXIT(argc, argv).
//
// NOTE(review): this text appears to have been damaged during extraction and
// does not compile as it stands:
//   - The four #include directives have no header names.
//   - The text between the fill loop's condition ("for(int i=0;i") and the end
//     of the kernel launch (">>> (d_vector);") is missing. Presumably it held
//     the loop body, the declaration/allocation of d_vector, a host-to-device
//     copy, and the kernel name with its <<<grid, threads, sharedMemSize>>>
//     configuration - TODO recover from the original source.
//   - The whole program sits on one physical line, so everything after the
//     first "//" is lexically a comment. The original line breaks must be
//     restored before building.
//   - CUDA_SAFE_CALL, CUT_CHECK_ERROR and CUT_EXIT are not defined here;
//     presumably they come from the legacy CUDA SDK "cutil" utility header -
//     verify against the original include list.
//   - No kernel definition is visible in this file section.
#include #include #include #include int main(int argc, char ** argv) { int p = 256; // threads per block int lengthvector = 10240; dim3 threads(p,1,1); // threads per block dim3 grid(lengthvector/p,1,1); // lengthvector should be multiple of p // numbers of blocks to cover array int sharedMemSize = 0; unsigned int memSize = lengthvector*sizeof(float); // vector on host float* h_vector = (float*) malloc(memSize); // put something in the vector for(int i=0;i>>> (d_vector); // transfer memory back CUDA_SAFE_CALL(cudaMemcpy(h_vector, d_vector, memSize, cudaMemcpyDeviceToHost)); // Check if kernel execution generated an error CUT_CHECK_ERROR("Kernel execution failed"); printf("after\n"); for(int i=0;i<20;i++) { // print out part of vector printf("%.1f \n", h_vector[i]); } // cleanup CUDA_SAFE_CALL(cudaFree(d_vector)); free(h_vector); // Use CUDA Utility Tool to exit cleanly CUT_EXIT(argc, argv); }