如何在cython prange块中使用gsl集成？

oz1 发表于 Dev

盎司1

为了使事情更笼统，以gsl_integration_qag为例（因为它需要一个工作区来管理时间间隔）。

这是一个经过精心设计的简单example（problem.pyx）：

from libc.math cimport cos
from cython.parallel import prange
from cython_gsl cimport *

cdef double mean(double[:] arr):
    cdef Py_ssize_t i
    cdef double sum_ = 0.0
    for i in range(arr.shape[0]):
        sum_ += arr[i]
    return sum_ / arr.shape[0]

cdef double integrand(double x, void *params) nogil:
    cdef double a = (<double*>params)[0]
    cdef double b = (<double*>params)[1]
    return a*x + b

def use_gsl_integration_sequential():
    cdef double result, error
    cdef Py_ssize_t i, j
    cdef double result_sum=0.0
    cdef double results[3]
    cdef double a=0.0, b=0.0
    cdef double da = 0.1
    cdef size_t space_size = 1000

    cdef gsl_function gsl_func
    cdef double params[2]
    gsl_func.function = &integrand

    cdef gsl_integration_workspace *ws
    ws= gsl_integration_workspace_alloc(space_size)

    for i in range(3):
        a += da
        result_sum = 0.0 # reset the sum(thank J.J.Hakala for pointing out this)
        for j in range(4): # here may be range(100000) in practice!
            b = a + j
            params[0] = a; params[1] = b
            gsl_func.params = params
            gsl_integration_qag(&gsl_func, 0.0, 1.0, 1e-8, 1e-8, space_size, GSL_INTEG_GAUSS15, ws, &result, &error)
            result_sum += result
        results[i] = result_sum

    gsl_integration_workspace_free(ws)
    # do some other stuff with the 'results', for example:
    return mean(results)


# Now, I want to parallel the inner loop
def use_gsl_integration_parallel():
    cdef double result, error
    cdef Py_ssize_t i, j
    cdef double result_sum=0.0
    cdef double results[3]
    cdef double a, b
    cdef double da = 0.1
    cdef size_t space_size = 1000

    cdef gsl_function gsl_func
    cdef double params[2]
    gsl_func.function = &integrand

    cdef gsl_integration_workspace *ws
    ws= gsl_integration_workspace_alloc(space_size)

    for i in range(3):
        # a should be read-only in the follow prange block.
        a += da 
        # here may be prange(100000,...) in practice!
        for j in prange(4, nogil=True): 
            # b should be local private.
            b = a + j

            # params also should be local private
            params[0] = a; params[1] = b

            # gsl_func is shared, only its params differ.
            # According to the gsl manual(see follow link), workspaces should be allocated on a per-thread basis, but how?
            gsl_func.params = params
            gsl_integration_qag(&gsl_func, 0.0, 1.0, 1e-8, 1e-8, space_size, GSL_INTEG_GAUSS15, ws, &result, &error)

            result_sum += result

        results[i] = result_sum
        gsl_integration_workspace_free(ws)
    return mean(results)

有点长的代码，仅用于完整（用于复制），但是我认为很简单（可读）（๑•ᴗ•๑）

然后编译并链接：

cython -a -2 problem.pyx

gcc -O3 -fopenmp -c problem.c -o problem.o -IC:\gsl2.1x86\include -IC:\python-2.7.10\include

gcc -shared -fPIC -fopenmp -o problem.pyd -LC:\gsl2.1x86\libs -LC:\python-2.7.10\libs problem.o -l:libgsl.a -l:libgslcblas.dll.a -l:libpython27.dll.a

在IPython中：

In [1]: from problem import *
In [2]: use_gsl_integration_sequential()
Out[2]: 7.2

结果use_gsl_integration_parallel()是不确定的，我尝试了几次，最多一次，它会崩溃，有些幸运，得到了不确定的值，所以一定有问题！我只是找不到这样一个平行的例子。有人可以帮我吗？

环境：

win10-64位，gcc（tdm-1）5.1.0 32位，python-2.7.10 32位，cython0.24，gsl-2.1

一些有用的参考资料？：

https://github.com/twiecki/CythonGSL

https://www.gnu.org/software/gsl/manual/html_node/Thread_002dsafety.html#Thread_002dsafety

（对不起，无法发布更多信息，两个链接是我的限制...

哈卡拉（JJ Hakala）

当我测试该代码时，出现了段错误（Linux，gcc 4.9.2）。

我认为原始代码存在以下问题

四个线程共享多个变量，每个线程应有自己的变量
在这些线程中使用相同的工作空间

以下版本给出了7.2。是连续的版本实际上是正确的，因为它不设置result_sum = 0.0在

for i in range(3):
    a += da
    result_sum = 0.0

from openmp cimport omp_get_max_threads, omp_get_thread_num

def use_gsl_integration_parallel():
    cdef double result
    cdef Py_ssize_t i, j
    cdef double result_sum
    cdef double results[3]
    cdef double results2[4]
    cdef double a
    cdef double b[4]
    cdef double error[4]
    cdef double da = 0.1
    cdef size_t space_size = 1000

    cdef gsl_function gsl_func[4]
    cdef double params[4][2]
    cdef int tid

    cdef gsl_integration_workspace ** ws = <gsl_integration_workspace **>malloc(omp_get_max_threads() * sizeof(gsl_integration_workspace *))

    for j in range(omp_get_max_threads()):
        ws[j] = gsl_integration_workspace_alloc(space_size)

    for i in range(3):
        a += da

        for j in prange(4, nogil=True):
            tid = omp_get_thread_num()
            b[tid] = a + j

            params[tid][0] = a;
            params[tid][1] = b[tid]

            gsl_func[tid].function = &integrand
            gsl_func[tid].params = params[tid]
            gsl_integration_qag(&gsl_func[tid], 0.0, 1.0, 1e-8, 1e-8,
                                space_size, GSL_INTEG_GAUSS15, ws[tid],
                                &results2[j], &error[j])
            # printf("tid %d\n", tid)
        results[i] = sum(results2)

    for j in range(omp_get_max_threads()):
        gsl_integration_workspace_free(ws[j])
    free(ws)

    return mean(results)