Julia并行:多线程与多进程

并行测试基于JuliaPP,代码参考了来源1、来源2,结果如图所示。

单线程单进程

using BenchmarkTools

println("Number of threads: $(Threads.nthreads())")

"""
    evaluatefunctions(N)

Serial baseline workload for the benchmarks.

Start from `N` evenly spaced points on `[-1500.0, 1500.0]` and run 10000 passes of
`sin → asin → cos → acos → tan → atan`, each applied element-wise. Every pass feeds
on the result of the previous one. Nothing is returned; only the run time matters.
"""
function evaluatefunctions(N)
    npasses = 10000
    # `range` is the current spelling of the old `linspace(-1500.0, 1500.0, N)`.
    x = range(-1500.0, stop=1500.0, length=N)
    for _ in 1:npasses
        # `x` is a range on the first pass and a Vector afterwards.
        y = broadcast(sin, x)
        x = broadcast(asin, y)
        y = broadcast(cos, x)
        x = broadcast(acos, y)
        y = broadcast(tan, x)
        x = broadcast(atan, y)
    end
end

@btime evaluatefunctions(2000)
[misaraty@master test]$ julia test.jl 
Number of threads: 1
  1.099 s (60000 allocations: 922.85 MiB)
using BenchmarkTools

println("Number of threads: $(Threads.nthreads())")

# Matrix dimension; the benchmarked loop below also runs N times.
N=200

# N×N matrix of ones, held in a non-constant global.
A = ones(N, N)
# Serial baseline: each iteration overwrites one diagonal entry with 6 and then
# calls `sqrt` on the whole matrix (for a matrix argument this is the matrix
# square root, not an element-wise root). `B` is never read; only the cost of
# computing it is of interest.
# NOTE(review): `A` and `N` are used as globals without `$` interpolation, and the
# writes to `A` presumably persist across benchmark samples — confirm this is the
# intended measurement.
@btime for i = 1:N
    A[i, i] = 6
    B = sqrt(A)
end
[misaraty@master test]$ julia test.jl 
Number of threads: 1
  2.349 s (4401 allocations: 320.14 MiB)

多线程

using BenchmarkTools

println("Number of threads: $(Threads.nthreads())")  
  
"""
    evaluatefunctions(N)

Multithreaded benchmark workload.

Run 10000 independent iterations with `Threads.@threads`. Each iteration starts
from the same `N` evenly spaced points on `[-1500.0, 1500.0]` and applies
`sin → asin → cos → acos → tan → atan` element-wise. Returns `nothing`; only the
run time matters.

The loop body assigns only iteration-local variables. Rebinding a variable of the
enclosing function from inside the threaded loop would make every thread read and
write the same captured variable, which is a data race and also forces the
variable to be boxed.
"""
function evaluatefunctions(N)
    # Shared input; it is only read inside the threaded loop, never rebound.
    grid = range(-1500.0, stop=1500.0, length=N)
    M = 10000
    Threads.@threads for i in 1:M
        # `x` and `y` are first assigned here, so they are local to each iteration.
        y = sin.(grid)
        x = asin.(y)
        y = cos.(x)
        x = acos.(y)
        y = tan.(x)
        x = atan.(y)
    end
    return nothing
end

@btime evaluatefunctions(2000)
[misaraty@master test]$ julia -t 1 test.jl 
Number of threads: 1
  1.141 s (240009 allocations: 926.51 MiB)
[misaraty@master test]$ julia -t 4 test.jl 
Number of threads: 4
  310.887 ms (240027 allocations: 926.52 MiB)
[misaraty@master test]$ julia -t 10 test.jl 
Number of threads: 10
  129.303 ms (240062 allocations: 926.52 MiB)
using BenchmarkTools

println("Number of threads: $(Threads.nthreads())")

# Matrix dimension; the benchmarked loop below also runs N times.
N=200

# Template matrix. It is only read inside the threaded loop.
A = ones(N, N)
# Threaded variant of the matrix-square-root workload. Each iteration works on a
# private copy of `A`: writing `A[i, i]` directly while other threads are reading
# the whole of `A` inside `sqrt(A)` would be a data race.
@btime Threads.@threads for i = 1:N
    Ai = copy(A)
    Ai[i, i] = 6
    # Matrix square root; the result is discarded, only its cost is measured.
    B = sqrt(Ai)
end
[misaraty@master test]$ julia -t 1 test.jl 
Number of threads: 1
  2.145 s (4207 allocations: 320.13 MiB)
[misaraty@master test]$ julia -t 4 test.jl 
Number of threads: 4
  1.058 s (4223 allocations: 320.13 MiB)
[misaraty@master test]$ julia -t 10 test.jl 
Number of threads: 10
  496.962 ms (4252 allocations: 320.13 MiB)

多进程

(1)Distributed

using BenchmarkTools  
using Distributed

println("Number of process: $(length(procs()))")

"""
    evaluatefunctions(N)

Multi-process benchmark workload built on `Distributed`.

Run 10000 independent iterations with `@distributed` and wait for them with
`@sync`. Each iteration starts from the same `N` evenly spaced points on
`[-1500.0, 1500.0]` and applies `sin → asin → cos → acos → tan → atan`
element-wise. The value of the `@sync @distributed` expression is returned
unchanged; the benchmarks ignore it.

The loop body assigns only iteration-local variables and never rebinds a variable
of the enclosing function, so the work done does not depend on how the iteration
range is split between processes.
"""
function evaluatefunctions(N)
    # Shared input; it is only read inside the distributed loop, never rebound.
    grid = range(-1500.0, stop=1500.0, length=N)
    M = 10000
    @sync @distributed for i in 1:M
        # `x` and `y` are first assigned here, so they are local to each iteration.
        y = sin.(grid)
        x = asin.(y)
        y = cos.(x)
        x = acos.(y)
        y = tan.(x)
        x = atan.(y)
    end
end
  
@btime evaluatefunctions(2000)
[misaraty@master test]$ julia test.jl 
Number of process: 1
  1.059 s (240060 allocations: 926.52 MiB)
[misaraty@master test]$ julia -p 3 test.jl 
Number of process: 4
  374.000 ms (396 allocations: 14.55 KiB)
[misaraty@master test]$ julia -p 9 test.jl 
Number of process: 10
  139.397 ms (1265 allocations: 45.80 KiB)
using BenchmarkTools
using Distributed

println("Number of process: $(length(procs()))")

# Matrix dimension; the benchmarked loop below also runs N times.
N=200

# N×N matrix of ones, created as a global on the process running this script.
A = ones(N, N)
# Distributed variant of the matrix-square-root workload: iterations of `1:N` are
# handed to `@distributed`, and `@sync` waits for them before the timing ends.
# Each iteration sets one diagonal entry to 6 and computes the matrix square root;
# `B` is never read.
# NOTE(review): when worker processes exist, the body presumably runs against each
# worker's own copy of `A`, so the `A` defined above would stay unchanged and the
# reported allocations would cover only the launching process — confirm against the
# Distributed documentation.
@btime @sync @distributed for i = 1:N
    A[i, i] = 6
    B = sqrt(A)
end
[misaraty@master test]$ julia test.jl 
Number of process: 1
  2.160 s (4258 allocations: 320.13 MiB)
[misaraty@master test]$ julia -p 3 test.jl 
Number of process: 4
  469.861 ms (333 allocations: 13.06 KiB)
[misaraty@master test]$ julia -p 9 test.jl 
Number of process: 10
  134.422 ms (1074 allocations: 41.38 KiB)

(2)MPI.jl

使用MPI.jl前,需先安装OpenMPI 4.1.2:可以自行编译安装,也可以使用JuliaPP附带的已编译版本,最后修改环境变量即可使用。OpenMPI 1.8.8 + Intel编译器的安装可参考Misaraty-Openmpi。

# Unpack the OpenMPI 4.1.2 sources and enter the extracted directory;
# ./configure lives inside it, not next to the tarball.
tar -zxvf openmpi-4.1.2.tar.gz
cd openmpi-4.1.2
# Configure a static-enabled build installed under the user's home directory.
./configure --prefix=/home/misaraty/soft/openmpi --enable-static
# Alternative: pin the GNU compilers explicitly.
#./configure --prefix=/home/misaraty/soft/openmpi --enable-static CC=gcc CXX=g++ F77=gfortran FC=gfortran F90=gfortran
make
make install

修改.bashrc或.bash_profile,加入以下内容:

# openmpi
# Make the OpenMPI launchers and compiler wrappers available on PATH.
export PATH="$PATH:/home/misaraty/soft/openmpi/bin"
# Add the OpenMPI libraries while keeping any library path that is already set;
# a plain assignment would discard the existing LD_LIBRARY_PATH entries.
export LD_LIBRARY_PATH="/home/misaraty/soft/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

下载编译后的openmpi4.1.2。

using BenchmarkTools  
using MPI

MPI.Init()

# println("Number of process: $(MPI.Comm_size(MPI.COMM_WORLD))")

"""
    evaluatefunctions(M)

MPI benchmark workload.

A total of 10000 passes is divided between the ranks of `MPI.COMM_WORLD` by
`start_and_end`; the calling rank runs only its own share. The data are `M`
evenly spaced points on `[-1500.0, 1500.0]`, and each pass applies
`sin → asin → cos → acos → tan → atan` element-wise to the result of the previous
pass on this rank. Returns `nothing`; only the run time matters.
"""
function evaluatefunctions(M)
    comm = MPI.COMM_WORLD

    # Total number of passes across all ranks.
    N = 10000
    # Only the first and last pass index of this rank are needed here.
    ista, iend, _ = start_and_end(N, comm)

    x = range(-1500.0, stop=1500.0, length=M)
    for i in ista:iend
        y = sin.(x)
        x = asin.(y)
        y = cos.(x)
        x = acos.(y)
        y = tan.(x)
        x = atan.(y)
    end
    return nothing
end


"""
    start_and_end(N, comm)

Split the index range `1:N` into contiguous blocks, one per rank of `comm`, and
return `(ista, iend, nbun)` for the calling rank: its first index, its last index
and its number of indices.

When `N` is not a multiple of the communicator size, the first `N % nprocs` ranks
receive one extra index, so every index in `1:N` is assigned to exactly one rank.
When `N` is a multiple, every rank gets `N ÷ nprocs` indices. A rank left without
work gets `nbun == 0` and an empty range `ista:iend`.
"""
function start_and_end(N, comm)
    nprocs = MPI.Comm_size(comm)
    myrank = MPI.Comm_rank(comm)
    # `base` indices for everyone, plus one more for each of the first `extra` ranks.
    base, extra = divrem(N, nprocs)
    nbun = base + (myrank < extra ? 1 : 0)
    # Lower ranks that received an extra index shift this rank's start by one each.
    ista = myrank * base + min(myrank, extra) + 1
    iend = ista + nbun - 1
    return ista, iend, nbun
end

@btime evaluatefunctions(2000)

MPI.Finalize()
[misaraty@master test]$ mpiexecjl -n 1 julia test.jl
  1.059 s (60000 allocations: 922.85 MiB)
[misaraty@master test]$ mpiexecjl -n 4 julia test.jl
  262.151 ms (15000 allocations: 230.71 MiB)
  266.585 ms (15000 allocations: 230.71 MiB)
  276.770 ms (15000 allocations: 230.71 MiB)
  272.871 ms (15000 allocations: 230.71 MiB)
[misaraty@master test]$ mpiexecjl -n 10 julia test.jl
  114.083 ms (6000 allocations: 92.29 MiB)
  114.008 ms (6000 allocations: 92.29 MiB)
  113.992 ms (6000 allocations: 92.29 MiB)
  113.953 ms (6000 allocations: 92.29 MiB)
  113.850 ms (6000 allocations: 92.29 MiB)
  114.623 ms (6000 allocations: 92.29 MiB)
  142.945 ms (6000 allocations: 92.29 MiB)
  114.124 ms (6000 allocations: 92.29 MiB)
  115.842 ms (6000 allocations: 92.29 MiB)
  114.577 ms (6000 allocations: 92.29 MiB)
using BenchmarkTools
using MPI

MPI.Init()

# println("Number of process: $(MPI.Comm_size(MPI.COMM_WORLD))")

"""
    test()

MPI variant of the matrix-square-root workload.

Each rank builds its own 200×200 matrix of ones and receives a block of the
diagonal indices `1:200` from `start_and_end`. For every index in its block it
sets that diagonal entry to 6 and computes the matrix square root of the whole
matrix. Returns `nothing`; only the run time matters.
"""
function test()
    comm = MPI.COMM_WORLD

    N = 200
    # Only the first and last diagonal index of this rank are needed here.
    ista, iend, _ = start_and_end(N, comm)
    A = ones(N, N)

    for i = ista:iend
        A[i, i] = 6
        # Matrix square root; the result is discarded, only its cost is measured.
        sqrt(A)
    end
    return nothing
end

"""
    start_and_end(N, comm)

Split the index range `1:N` into contiguous blocks, one per rank of `comm`, and
return `(ista, iend, nbun)` for the calling rank: its first index, its last index
and its number of indices.

When `N` is not a multiple of the communicator size, the first `N % nprocs` ranks
receive one extra index, so every index in `1:N` is assigned to exactly one rank.
When `N` is a multiple, every rank gets `N ÷ nprocs` indices. A rank left without
work gets `nbun == 0` and an empty range `ista:iend`.
"""
function start_and_end(N, comm)
    nprocs = MPI.Comm_size(comm)
    myrank = MPI.Comm_rank(comm)
    # `base` indices for everyone, plus one more for each of the first `extra` ranks.
    base, extra = divrem(N, nprocs)
    nbun = base + (myrank < extra ? 1 : 0)
    # Lower ranks that received an extra index shift this rank's start by one each.
    ista = myrank * base + min(myrank, extra) + 1
    iend = ista + nbun - 1
    return ista, iend, nbun
end

@btime test()

MPI.Finalize()
[misaraty@master test]$ mpiexecjl -n 1 julia test.jl
  1.756 s (4202 allocations: 320.43 MiB)
[misaraty@master test]$ mpiexecjl -n 4 julia test.jl
  352.608 ms (1052 allocations: 80.34 MiB)
  408.630 ms (1052 allocations: 80.34 MiB)
  572.909 ms (1052 allocations: 80.34 MiB)
  576.099 ms (1052 allocations: 80.34 MiB)
[misaraty@master test]$ mpiexecjl -n 10 julia test.jl
  181.883 ms (422 allocations: 32.32 MiB)
  197.525 ms (422 allocations: 32.32 MiB)
  172.983 ms (422 allocations: 32.32 MiB)
  163.729 ms (422 allocations: 32.32 MiB)
  309.789 ms (422 allocations: 32.32 MiB)
  223.014 ms (422 allocations: 32.32 MiB)
  225.081 ms (422 allocations: 32.32 MiB)
  138.454 ms (422 allocations: 32.32 MiB)
  135.505 ms (422 allocations: 32.32 MiB)
  122.916 ms (422 allocations: 32.32 MiB)

参考

Juliaで数値計算 その4:コードサンプル〜MPI並列計算編

MPI.jl

How to config julia to use MUMPS and MPI

PencilFFTs

Basic Comparison of Various Computing Languages

多进程和分布式计算

Parallel computing

Julia并发编程-循环中正确使用并发的方式

Julia并发编程-多线程与多进程的一些使用建议

The Different Flavors of Parallelism

How can I set my computer for Julia multi threading?

julia多线程-小光amateur

Using Julia on CARC systems

Julia on the HPC Clusters

Running Julia jobs on an HPC cluster