One problem of parallization of Moller-trumbore algorithm on CUDA

I attempt to compute the intersection of the rays and the triangle mesh by the Moller-Trumbore algorithm. And I test my code by one triangle mesh with 474048 triangles. After implementing the algorithm on CPU, I attempt to parallelize the code on CUDA. Below is my code:

__device__ float3 CrossProduct(float3 a, float3 b)
{
    float3 c;
    c.x = a.y*b.z - a.z*b.y;
    c.y = a.z*b.x - a.x*b.z;
    c.z = a.x*b.y - a.y*b.x;
    return c;
}

__device__ float3 IsIntersected(float3 ro, float3 rd, float3 v0, float3 v1, float3 v2)
{
    float3 s;
    s.x = -v0.x; s.y = -v0.y; s.z = -v0.z;
    ro.x += s.x; ro.y += s.y; ro.z += s.z;
    v0.x += s.x; v0.y += s.y; v0.z += s.z;
    v1.x += s.x; v1.y += s.y; v1.z += s.z;
    v2.x += s.x; v2.y += s.y; v2.z += s.z;

    float3 e1 = v1, e2 = v2;
    float3 de2 = CrossProduct(rd, e2), se1 = CrossProduct(ro, e1);

    float3 tuv;
    tuv.x = se1.x*e2.x + se1.y*e2.y + se1.z*e2.z;
    tuv.y = de2.x*ro.x + de2.y*ro.y + de2.z*ro.z;
    tuv.z = se1.x*rd.x + se1.y*rd.y + se1.z*rd.z;

    float r = de2.x*e1.x + de2.y*e1.y + de2.z*e1.z;
    tuv.x = tuv.x / r; tuv.y = tuv.y / r; tuv.z = tuv.z / r;

    return tuv;
}

__global__ void IntersectionComputation(float4 *intersection_array, float3 *ro, float3 *rd, float3 *v0, float3 *v1, float3 *v2, int num_triangle, int length_array)
{
    int triangle_id = blockIdx.x;

    int idx = threadIdx.x, idy = blockIdx.y, ray_id = idx + idy * blockDim.x;

    float3 tuv= IsIntersected(ro[ray_id], rd[ray_id], v0[triangle_id], v1[triangle_id], v2[triangle_id]);

    if (tuv.x >= 0 && tuv.y >= 0 && tuv.z >= 0 && tuv.y + tuv.z <= 1)
        {
        float4 diuv;
        diuv.w = tuv.x;
        diuv.x = triangle_id;
        diuv.y = tuv.y;
        diuv.z = tuv.z;

        for (int i = 0; i < length_array; i++)
        {
            if (intersection_array[i + length_array * ray_id].w < 0)
            {
                intersection_array[i + length_array * ray_id] = diuv;
                i = length_array;
            }
        }
    }
}

__global__ void ConvertIntersectionArrayToIntersectionPoint(float3 *intersection_point,float4 *intersection_array, int length_array)
{
    int idx = threadIdx.x, idy = blockIdx.x, id = idx + idy * blockDim.x;

    int index = -1;
    for (int i = 0; i < length_array; i++)
    {
        if (intersection_array[i + length_array * id].w >= 0)
        {
            if (index < 0)
                index = i;
            else if (intersection_array[i + length_array * id].w < intersection_array[index + length_array * id].w)
                index = i;
        }
    }

    if (index != -1)
    {
        intersection_point[id].x = intersection_array[index + length_array * id].x;
        intersection_point[id].y = intersection_array[index + length_array * id].y;
        intersection_point[id].z = intersection_array[index + length_array * id].z;
    }
}

IsIntersected computes the barycentric coordinates of the intersection of one ray and one triangle, together with the distance between the intersection and the origin of the ray.

The kernel function IntersectionComputation judges if one ray intersects with one triangle by calling IsIntersected. If intersected, I storage the distance and intersection coordinates together with the id of the triangle in the float4 array intersection_array.

Then I call the kernel function by the following code:

extern "C" __host__ void RayIntersection(float3 *iv, float3 *ro, float3 *rd, float3 *v0,float3 *v1,float3 *v2, int width,int height,int num_triangle)
{
    int num_ray = width * height;

    int length_array = 10;

    //Allocate space for data from host
    float3 *dev_ro = 0, *dev_rd = 0, *dev_v0 = 0, *dev_v1 = 0, *dev_v2 = 0;
    cudaMalloc((void**)&dev_ro, num_ray * sizeof(float3));
    cudaMalloc((void**)&dev_rd, num_ray * sizeof(float3));
    cudaMalloc((void**)&dev_v0, num_triangle * sizeof(float3));
    cudaMalloc((void**)&dev_v1, num_triangle * sizeof(float3));
    cudaMalloc((void**)&dev_v2, num_triangle * sizeof(float3));

    //Allocate space for intersection point vector in device
    float3 *dev_iv = 0;
    cudaStatus = cudaMalloc((void**)&dev_iv, num_ray * sizeof(float3));

    //Allocate space for intersection point array in device
    float4 *dev_intersection_array = 0;
    cudaMalloc((void**)&dev_intersection_array, length_array * num_ray * sizeof(float4));

    //Copy data from host to device
    cudaMemcpy(dev_ro, ro, num_ray * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_rd, rd, num_ray * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_v0, v0, num_triangle * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_v1, v1, num_triangle * sizeof(float3), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_v2, v2, num_triangle * sizeof(float3), cudaMemcpyHostToDevice);

    //Set initial value for dev_iv
    thrust::device_ptr<float3> dev_iv_ptr(dev_iv);
    float3 iv_value; iv_value.x = -1; iv_value.y = -1; iv_value.z = -1;
    thrust::fill(dev_iv_ptr, dev_iv_ptr + num_ray, iv_value);
    cudaDeviceSynchronize();

    //Set initial value for dev_intersection_array
    thrust::device_ptr<float4> dev_array_ptr(dev_intersection_array);
    float4 array_value; array_value.w = -1; array_value.x = -1; array_value.y = -1; array_value.z = -1;
    thrust::fill(dev_array_ptr, dev_array_ptr + length_array * num_ray, array_value);
    cudaDeviceSynchronize();

    IntersectionComputation << <dim3(num_triangle, height, 1), dim3(width, 1, 1) >> > (dev_intersection_array, dev_ro, dev_rd, dev_v0, dev_v1, dev_v2, num_triangle, length_array);
    cudaDeviceSynchronize();
    ConvertIntersectionArrayToIntersectionPoint << <dim3(height, 1, 1), dim3(width, 1, 1) >> > (dev_iv, dev_intersection_array, length_array);
    cudaDeviceSynchronize();

    cudaMemcpy(iv, dev_iv, num_ray * sizeof(float3), cudaMemcpyDeviceToHost);

    cudaFree(dev_intersection_array);
    cudaFree(dev_iv); cudaFree(dev_rd); cudaFree(dev_ro);
    cudaFree(dev_v0); cudaFree(dev_v1); cudaFree(dev_v2);
}

width and height define the number of the rays, it means I want to compute the intersections of one triangle mesh and width*height rays. num_triangle is the size of the triangle mesh.

Now I come across the problem. When width=512, height=1, the same as the cpu mode, the result iv is normal. But if I keep width invariant, and set height>3, this will keep the block dimension invariant and increase the grid dimension, then all element of iv is

-4.31602e+08,-4.31602e+08,-4.31602e+08

Even if I comment out all the code in ConvertIntersectionArrayToIntersectionPoint and keep only the calling of IsIntersected in IntersectionComputation, I still get this wrong result.

Besides, if I only call IntersectionComputation in the host function, since IntersectionComputation only changes the value of intersection_array, without the calling of ConvertIntersectionArrayToIntersectionPoint, all elements of dev_iv should keep the inital value, is it right? But this is not my situation. After copying dev_iv to iv by cudaMemcpy, all elements of iv are

-4.31602e+08,-4.31602e+08,-4.31602e+08

But I have already assign

-1,-1,-1

as the initial value of elements in dev_iv by thrust::fill.

So dose anyone know the cause of the error of the result? Is it caused by the limitation of the size of the grid, because when I set num_triangle=1, keep height<100, and comment out all code in the two kernel functions, elements in iv are all -1, which is the initial value of dev_iv.

============================An update============================

After commenting out most part of code in the kernel function, it seems that rewriting IntersectionComputation as following is enough to cause this problem

__global__ void IntersectionComputation(float4 *intersection_array, float3 *ro, float3 *rd, float3 *v0, float3 *v1, float3 *v2, int num_triangle, int length_array)
{
    int triangle_id = blockIdx.x;

    int idx = threadIdx.x, idy = blockIdx.y, ray_id = idx + idy * blockDim.x;

    float3 dro=ro[ray_id], drd=rd[ray_id], dv0=v0[triangle_id], dv1=v1[triangle_id], dv2=v2[triangle_id];
}

This really bothers me now. So this problem is caused by memory access conflict?

============================Solved Now============================

Now this problem is solved, partly due to my code and partly due to windows. In Windows, as the default setting, the WDDR(Microsoft Windows Display Driver Model) will reset the driver if the GPU is unresponsive for more than two seconds. This is designed to avoid rebooting the system in the case GPU is unresponsive. As a result, in my code, if I inrease the rays I want to compute intersection for, the kernel function will act more than two seconds, the GPU driver will be reset and I will see cuda launch failed. Turn off the WDDR will avoid this problem, but the fundamental cause for the problem is still the lack of optimization of my code.

All Questions Answered

Search This Blog

Donate. I desperately need donations to survive due to my health

Get paid by answering surveys Click here

Click here to donate

Remote/Work from Home jobs

One problem of parallization of Moller-trumbore algorithm on CUDA

Comments

Post a Comment