This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C-project where a binary raw file with 3d-data is loaded, processed in CUDA and saved in a new binary raw file.
This is based on the simpleTexture3D project from CUDA Samples:
This is my cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>
typedef unsigned int  uint;
typedef unsigned char uchar;
const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 64, height = 64, depth=64;
//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);
uint *d_output = NULL;
int *pArgc = NULL;
char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);
void loadVolumeData(char *exec_path);
// render image using CUDA
void render()
{
    // call CUDA kernel
    render_kernel(gridSize, blockSize, d_output, width, height, depth);
    getLastCudaError("render_kernel failed");
}
void cleanup()
{
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    checkCudaErrors(cudaDeviceReset());
}
// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");
    if (!fp)
    {
        fprintf(stderr, "Error opening file '%s'\n", filename);
        return 0;
    }
    uchar *data = (uchar *) malloc(size);
    size_t read = fread(data, 1, size, fp);
    fclose(fp);
    printf("Read '%s', %lu bytes\n", filename, read);
    return data;
}
// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
{
    int returnState=0;
    // cut file extension from filename
    char *a=strdup(filename);   //via strdup you dumb a const char to char, you must free it yourself
    int len = strlen(a);
    a[len-4] = '\0';    //deletes '.raw'
    //printf("%s\n",a);
    char b[50];
    sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);
    //char b[]="_out.raw";  //Add suffix out to filename
    char buffer[256]; // <- danger, only storage for 256 characters.
    strncpy(buffer, a, sizeof(buffer));
    strncat(buffer, b, sizeof(buffer));
    free(a);
    FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared
    if (!fp)
    {
        fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
        return 0;
    }
    size_t write = fwrite(data, 1, size, fp);
    fclose(fp);
    if (write==size)
    {    
         printf("Wrote %lu bytes to '%s'\n", write, buffer);
         return 0;
    }
    else
    {
        printf("Error writing data to file '%s'\n", buffer);    
        return 1;
    }
}
// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
{
    int result = 0;
    result = findCudaDevice(argc, (const char **)argv);
    return result;
}
void runAutoTest(char *exec_path, char *PathToFile)
{
    // set path
    char *path;
    if (PathToFile == NULL)
    {
        path = sdkFindFilePath(volumeFilename, exec_path);
    }
    else
    {
        path = PathToFile;
    }
    if (path == NULL)
    {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }
    // Allocate output memory
    checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
    // zero out the output array with cudaMemset
    cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, depth);
    checkCudaErrors(cudaDeviceSynchronize());
    getLastCudaError("render_kernel failed");
    uchar *h_output = (uchar*)malloc(width*height*depth);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
    int wState=writeRawFile(path,h_output,width*height*depth);
    checkCudaErrors(cudaFree(d_output));
    free(h_output);
    // cudaDeviceReset causes the driver to clean up all state. While
    // not mandatory in normal operation, it is good practice.  It is also
    // needed to ensure correct operation when the application is being
    // profiled. Calling cudaDeviceReset causes all profile data to be
    // flushed before the application exits
    cudaDeviceReset();
    //exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void loadVolumeData(char *exec_path, char *PathToFile)
{
    char *path;
    // load volume data
    if (PathToFile == NULL)
    {
        path = sdkFindFilePath(volumeFilename, exec_path);
    }
    else
    {
        path = PathToFile;
    }
    if (path == NULL)
    {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }
    size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
    uchar *h_volume = loadRawFile(path, size);
    //int wState=writeRawFile(path,h_volume,size);
    initCuda(h_volume, volumeSize);
    free(h_volume);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    char *image_file = NULL;
    printf("%s Starting...\n\n", sSDKsample);
    if (checkCmdLineFlag(argc, (const char **)argv, "file"))    //Note cmd line argument is -file "PathToFile/File.raw"
    {                                                           // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"            
        getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
    }
    if (image_file)
    {
        chooseCudaDevice(argc, argv);
        loadVolumeData(argv[0],image_file);
        runAutoTest(argv[0],image_file);
    }
    else
    {
        // use command-line specified CUDA device, otherwise use device with highest Gflops/s
        chooseCudaDevice(argc, argv);
        loadVolumeData(argv[0],NULL);
        runAutoTest(argv[0],NULL);
    }
    printf("I am finished...\n"
           "Can I get some ice cream please\n");
    exit(EXIT_SUCCESS);
}
And this is my .cu
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
#include <helper_math.h>
typedef unsigned int  uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex;  // 3D texture
cudaArray *d_volumeArray = 0;
__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
    uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
 //   float u = x / (float) imageW;
 //   float v = y / (float) imageH;
    //float w = z / (float) imageD;
 //   // read from 3D texture
 //   float voxel = tex3D(tex, u, v, w);
    uint ps=__umul24(imageW,imageH);
    if ((x < imageW) && (y < imageH) && (z < imageD))
    {
        // write output color
        uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
        d_output[1] = (uchar) 255;//+0*voxel*255;
    }
}
extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr   = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(©Params));
    // set texture parameters
    tex.normalized = true;                      // access with normalized texture coordinates
    tex.filterMode = cudaFilterModeLinear;      // linear interpolation
    tex.addressMode[0] = cudaAddressModeBorder;   // wrap texture coordinates
    tex.addressMode[1] = cudaAddressModeBorder;
    tex.addressMode[2] = cudaAddressModeBorder;
    // bind array to 3D texture
    checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
As you can see, currently, I set all values to zero except the index = 1, which is set to 255. Yet when I now open the image stack in Fiji, I see that the fourth pixel on the first slide is white. If I use index=i instead, I get white vertical lines across the image stack periodically every four columns. Generally spoken, it seems that only every fourth element is beeing indexed in the CudaArray. So I am wondering if there is somekind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. There would obviously be the factor 4 :)
I am eager to here from you experts
Cheers Mika
                        
I figured it out by myself. The kernel works with uint* d_output while the copy to the host is written into a uchar* h_output
This led to this strange behavior