I am using driver API cuTexRefSetAddress2D() for binding to texture. I get correct results as long as I bind to 32 bit accesses, but get wrong results if I bind to 16 or 8 bit accesses. The value read is in the vicinity of the correct location, but off by one or two bytes.

In an parallel setup, where I use the runtime API cudaBindTexture2D() for texture binding, I get correct results for all accesses (32, 16 and 8 bit).

Any pointers regarding this?

I am compiling for compute architecture 3.0 with CUDA 5.5 and running on GTX 650 Ti card, with Ubuntu on host PC.

Sample code is pasted below. The application can be switched between driver or runtime APIs using command argument 0 / 1. Difference can be seen in the respective outputs.

Here is the sample code for kernel.cu

Code:
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <stdio.h>


/*****************************************************************************/
/* Global Variable Definitions                                               */
/*****************************************************************************/

/* Texture refernce for input and reference frames */
texture<unsigned char, cudaTextureType2D, cudaReadModeElementType> g_dev_ref_tex_u1;
texture<unsigned short, cudaTextureType2D, cudaReadModeElementType> g_dev_ref_tex_u2;
texture<unsigned int, cudaTextureType2D, cudaReadModeElementType> g_dev_ref_tex_u4;

__device__ cudaTextureObject_t g_dev_tex_obj;

__global__ void func_test()
{
    int x, y;

    x = 176;
    y = 60;

    /* Expected output: ec */
    printf("Ref u1: %x\n", tex2D(g_dev_ref_tex_u1, x, y));

    /* Expected output: edec */
    printf("Ref u2: %x\n", tex2D(g_dev_ref_tex_u2, x / sizeof(unsigned short), y));

    /* Expected output: efeeedec */
    printf("Ref u4: %x\n", tex2D(g_dev_ref_tex_u4, x / sizeof(unsigned int), y));

    /* Expected output: ec */
    //printf("Obj u1: %x\n", tex2D<unsigned char>(g_dev_tex_obj, x, y));

    return;
}

int main_runtime_api(unsigned char *ref_frm, int frm_wd, int frm_ht)
{
    cudaError_t cudaStatus;
    unsigned int alloc_pitch;
    void *dptr_ref_frm;
    cudaChannelFormatDesc tex_desc;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  CUDA-capable GPU not installed");
    return -1;
    }

    /* Reference frame */
    cudaStatus = cudaMallocPitch(&dptr_ref_frm, (size_t *)&alloc_pitch, frm_wd, frm_ht);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc<dptr_ref_frm> failed: %s\n!", cudaGetErrorString(cudaStatus));
        return -1;
    }

    cudaStatus = cudaMemcpy2D(dptr_ref_frm, alloc_pitch,
                              ref_frm, frm_wd,
                              frm_wd, frm_ht,
                              cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy<dptr_ref_frm> failed: %s\n!", cudaGetErrorString(cudaStatus));
        return -1;
    }

    /* Create CUDA texture channel descriptor */
    tex_desc = cudaCreateChannelDesc<unsigned char>();

    /* Bind reference frame to 2D texture */
    cudaStatus = cudaBindTexture2D(NULL, &g_dev_ref_tex_u1, dptr_ref_frm,
                                   &tex_desc, frm_wd,
                                   frm_ht, alloc_pitch);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaBindTexture2D<g_dev_ref_tex_u1> failed: %d, %s\n!", cudaStatus, cudaGetErrorString(cudaStatus));
        return -1;
    }

    /* Create CUDA texture channel descriptor */
    tex_desc = cudaCreateChannelDesc<unsigned short>();

    /* Bind reference frame to 2D texture */
    cudaStatus = cudaBindTexture2D(NULL, &g_dev_ref_tex_u2, dptr_ref_frm,
                                   &tex_desc, frm_wd / sizeof(unsigned short),
                                   frm_ht, alloc_pitch);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaBindTexture2D<g_dev_ref_tex_u2> failed: %d, %s\n!", cudaStatus, cudaGetErrorString(cudaStatus));
        return -1;
    }

    /* Create CUDA texture channel descriptor */
    tex_desc = cudaCreateChannelDesc<unsigned int>();

    /* Bind reference frame to 2D texture */
    cudaStatus = cudaBindTexture2D(NULL, &g_dev_ref_tex_u4, dptr_ref_frm,
                                   &tex_desc, frm_wd / sizeof(unsigned int),
                                   frm_ht, alloc_pitch);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaBindTexture2D<g_dev_ref_tex_u4> failed: %d, %s\n!", cudaStatus, cudaGetErrorString(cudaStatus));
        return -1;
    }

    /* Kernel launch */
    func_test<<<1, 1>>>();

    cudaFree(dptr_ref_frm);

    return 0;
}
Here is sample code for main.cu

Code:
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>

/* CUDA include files */
#include <cuda.h>
#include <builtin_types.h>
#include <drvapi_error_string.h>

#define CU_SIZE_T size_t

extern __global__ void func_test();

extern int main_runtime_api(unsigned char *ref_frm, int frm_wd, int frm_ht);

////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

// These are the inline versions for all of the SDK helper functions
inline int __checkCudaErrors(CUresult error, const char *file, const int line)
{
    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
                error, getCudaDrvErrorString(error), file, line);
    }
    return error;
}

int main_driver_api(unsigned char *ref_frm, int frm_wd, int frm_ht)
{
    int err;
    int dev_count;
    CUdevice cu_device;
    CUcontext cu_context;
    CUmodule cu_module;
    CUdeviceptr dptr_ref_frm;
    CUDA_MEMCPY2D cu_copy;
    CUtexref cu_tex_ref;
    CUDA_ARRAY_DESCRIPTOR cu_arr_desc;
    CUfunction hme_coarse_kernel;
    CU_SIZE_T alloc_pitch;
    char ptx_fname[100];

    /* Initialize CUDA */
    err = checkCudaErrors(cuInit(0));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Get installed devices */
    err = checkCudaErrors(cuDeviceGetCount(&dev_count));
    if(CUDA_SUCCESS != err)
        return -1;

    if(0 == dev_count)
    {
        printf("No CUDA device installed!!!\n");
        return -1;
    }

    /* Get device handle (device 0 is assumed) */
    err = checkCudaErrors(cuDeviceGet(&cu_device, 0));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Create context */
    err = checkCudaErrors(cuCtxCreate(&cu_context, 0, cu_device));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Load PTX file */
    sprintf(ptx_fname, "./kernel.ptx");
    err = checkCudaErrors(cuModuleLoad(&cu_module, ptx_fname));
    if(CUDA_SUCCESS != err)
        return -1;

    printf("Launching CUDA kernel from %s ...\n", ptx_fname);


    /***** >> START REFERENCE FRAME >> *****/

    /* Allocate device memory for reference frame */
    err = checkCudaErrors(cuMemAllocPitch(
                                     &dptr_ref_frm,
                                     &alloc_pitch,
                                     (CU_SIZE_T)frm_wd,
                                     (CU_SIZE_T)frm_ht,
                                     sizeof(unsigned int)));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Copy reference frame to device memory */
    cu_copy.srcXInBytes   = 0;
    cu_copy.srcY          = 0;
    cu_copy.srcMemoryType = CU_MEMORYTYPE_HOST;
    cu_copy.srcHost       = ref_frm;
    cu_copy.srcPitch      = frm_wd;
    cu_copy.dstXInBytes   = 0;
    cu_copy.dstY          = 0;
    cu_copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    cu_copy.dstDevice     = dptr_ref_frm;
    cu_copy.dstPitch      = alloc_pitch;
    cu_copy.WidthInBytes  = frm_wd;
    cu_copy.Height        = frm_ht;
    err = checkCudaErrors(cuMemcpy2D(&cu_copy));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Use texture reference interfaces */
    {
        /* Get texture reference to reference frame */
        err = checkCudaErrors(cuModuleGetTexRef(&cu_tex_ref,
                                                cu_module,
                                                "g_dev_ref_tex_u1"));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Bind reference frame to 2D texture */
        cu_arr_desc.Format      = CU_AD_FORMAT_UNSIGNED_INT8;
        cu_arr_desc.NumChannels = 1;
        cu_arr_desc.Width       = frm_wd;
        cu_arr_desc.Height      = frm_ht;
        err = checkCudaErrors(cuTexRefSetAddress2D(cu_tex_ref,
                                                   &cu_arr_desc,
                                                   dptr_ref_frm,
                                                   alloc_pitch));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Get texture reference to reference frame */
        err = checkCudaErrors(cuModuleGetTexRef(&cu_tex_ref,
                                                cu_module,
                                                "g_dev_ref_tex_u2"));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Bind reference frame to 2D texture */
        cu_arr_desc.Format      = CU_AD_FORMAT_UNSIGNED_INT16;
        cu_arr_desc.NumChannels = 1;
        cu_arr_desc.Width       = frm_wd / sizeof(unsigned short);
        cu_arr_desc.Height      = frm_ht;
        err = checkCudaErrors(cuTexRefSetAddress2D(cu_tex_ref,
                                                   &cu_arr_desc,
                                                   dptr_ref_frm,
                                                   alloc_pitch));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Get texture reference to reference frame */
        err = checkCudaErrors(cuModuleGetTexRef(&cu_tex_ref,
                                                cu_module,
                                                "g_dev_ref_tex_u4"));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Bind reference frame to 2D texture */
        cu_arr_desc.Format      = CU_AD_FORMAT_UNSIGNED_INT32;
        cu_arr_desc.NumChannels = 1;
        cu_arr_desc.Width       = frm_wd / sizeof(unsigned int);
        cu_arr_desc.Height      = frm_ht;
        err = checkCudaErrors(cuTexRefSetAddress2D(cu_tex_ref,
                                                   &cu_arr_desc,
                                                   dptr_ref_frm,
                                                   alloc_pitch));
        if(CUDA_SUCCESS != err)
            return -1;
    }

#if 0
    /* Use texture object interfaces */
    {
        CUtexObject pTexObject;
        CUDA_RESOURCE_DESC pResDesc;
        CUDA_TEXTURE_DESC pTexDesc;
        CUdeviceptr dptr_global_tex;

        pResDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
        pResDesc.res.pitch2D.devPtr = dptr_ref_frm;
        pResDesc.res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8;
        pResDesc.res.pitch2D.numChannels = 1;
        pResDesc.res.pitch2D.width = frm_wd;
        pResDesc.res.pitch2D.height = frm_ht;
        pResDesc.res.pitch2D.pitchInBytes = alloc_pitch;
        pResDesc.flags = 0;

        memset(&pTexDesc, 0, sizeof(CUDA_TEXTURE_DESC));

        err = checkCudaErrors(cuTexObjectCreate(&pTexObject, &pResDesc, &pTexDesc, NULL));
        if(CUDA_SUCCESS != err)
            return -1;

        err = checkCudaErrors(cuModuleGetGlobal(&dptr_global_tex,
                                                &size,
                                                cu_module,
                                                "g_dev_tex_obj"));
        if(CUDA_SUCCESS != err)
            return -1;

        /* Copy control data to device global */
        err = checkCudaErrors(cuMemcpyHtoD(dptr_global_tex,
                                           &pTexObject,
                                           size));
        if(CUDA_SUCCESS != err)
            return -1;
    }
#endif

    /***** >> END REFERENCE FRAME >> *****/

    /***** >> START KERNEL LAUNCH >> *****/

    /* Get kernel function from module */
    err = checkCudaErrors(cuModuleGetFunction(&hme_coarse_kernel,
                                              cu_module,
                                              "_Z9func_testv"));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Kernel launch */
    err = checkCudaErrors(cuLaunchKernel(
                            hme_coarse_kernel,
                            1, 1, 1,
                            1, 1, 1,
                            0, /* Shared memory per block */
                            NULL, NULL, NULL));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Wait for kernel to complete */
    err = checkCudaErrors(cuCtxSynchronize());
    if(CUDA_SUCCESS != err)
        return -1;

    /***** >> END KERNEL LAUNCH >> *****/

    /* Free device memory for output frame */
    err = checkCudaErrors(cuMemFree(dptr_ref_frm));
    if(CUDA_SUCCESS != err)
        return -1;

    /* Destroy context */
    err = checkCudaErrors(cuCtxDestroy(cu_context));
    if(CUDA_SUCCESS != err)
        return -1;

    return 0;
}


int main(int argc, char *argv[])
{
    int frm_wd = 512, frm_ht = 176;
    unsigned char *ref_frm;

    if(2 > argc)
    {
        printf("Usage: ./kernel <0/1>\n");
        printf("0 = Driver API, 1 = Runtime API\n");
        return 0;
    }

    /* Populate frame data */
    {
        int x, y;

        ref_frm = (unsigned char *)malloc(frm_wd * frm_ht);
        if(NULL == ref_frm)
        {
            printf("Malloc failed\n");
            return -1;
        }

        for(y = 0; y < frm_ht; y++)
            for(x = 0; x < frm_wd; x++)
            {
                *(ref_frm + (y * frm_wd) + x) = y + x;
            }
    }

    if(0 == atoi(argv[1]))
    {
        printf("\nUsing driver APIs...\n");
        main_driver_api(ref_frm, frm_wd, frm_ht);
    }
    else
    {
        printf("\nUsing runtime APIs...\n");
        main_runtime_api(ref_frm, frm_wd, frm_ht);
    }

    free(ref_frm);

    return 0;
}
The makefile is as below:

Code:
include ./findcudalib.mk

DEFINES = -DDESIGN16
CACHE_FLAGS = -Xptxas -dlcm=ca

ifeq ($(CACHE),'L2')
CACHE_FLAGS = -Xptxas -dlcm=cg
endif 

# Location of the CUDA Toolkit
CUDA_PATH ?= "/usr/local/cuda-5.5"

# internal flags
NVCCFLAGS   := -m${OS_SIZE} ${CACHE_FLAGS}
CCFLAGS     := 
NVCCLDFLAGS :=
LDFLAGS     :=

# Extra user flags
EXTRA_NVCCFLAGS   ?=
EXTRA_NVCCLDFLAGS ?=
EXTRA_LDFLAGS     ?=
EXTRA_CCFLAGS     ?=

# OS-specific build flags
ifneq ($(DARWIN),) 
  LDFLAGS += -rpath $(CUDA_PATH)/lib
  CCFLAGS += -arch $(OS_ARCH) $(STDLIB)  
else
  ifeq ($(OS_ARCH),armv7l)
    ifeq ($(abi),gnueabi)
      CCFLAGS += -mfloat-abi=softfp
    else
      # default to gnueabihf
      override abi := gnueabihf
      LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
      CCFLAGS += -mfloat-abi=hard
    endif
  endif
endif

ifeq ($(ARMv7),1)
NVCCFLAGS += -target-cpu-arch ARM
ifneq ($(TARGET_FS),) 
CCFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi)
endif
endif

# Debug build flags
ifeq ($(dbg),1)
      NVCCFLAGS += -g -G -DDEBUG
      TARGET := debug
else
      TARGET := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
ALL_CCFLAGS += -I/usr/local/cuda-5.5/include

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(NVCCLDFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(EXTRA_NVCCLDFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I$(CUDA_PATH)/samples/common/inc
LIBRARIES   += $(CUDALINK) -lcuda

################################################################################

# CUDA code generation flags
ifneq ($(OS_ARCH),armv7l)
#GENCODE_SM10    := -gencode arch=compute_10,code=sm_10
endif
#GENCODE_SM20    := -gencode arch=compute_20,code=sm_20
#GENCODE_SM30    := -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=\"sm_35,compute_35\"
GENCODE_SM30    := -gencode arch=compute_30,code=sm_30 
GENCODE_FLAGS   := $(GENCODE_SM10) $(GENCODE_SM20) $(GENCODE_SM30)

################################################################################

# Target rules
all: build

build: kernel

main.o: main.cu
    $(NVCC) $(DEFINES) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

kernel.o: kernel.cu
    $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

kernel.ptx: kernel.cu
    $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -ptx $<

kernel: main.o kernel.o kernel.ptx
    $(NVCC) $(ALL_LDFLAGS) -o $@ main.o kernel.o $(LIBRARIES)

clean:
    rm -f kernel main.o kernel.o kernel.ptx

clobber: clean
./findcudalib.mk can be found from one of the examples in CUDA installation. Please note that I am using CUDA 5.5. Thanks.