diff --git a/docs/.gitignore b/docs/.gitignore
index a25b4f6..70e4f28 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,7 +1,5 @@
 build/
 site/
-docs/src/notebook-output
-Manifest.toml
-*.md
-!index.md
-!getting_started_with_julia.md
\ No newline at end of file
+docs/src/notebook-output/
+docs/src/notebooks/
+Manifest.toml
\ No newline at end of file
diff --git a/docs/make.jl b/docs/make.jl
index 7d3990b..abd1828 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -13,7 +13,7 @@ EditURL = "https://github.com/fverdugo/XM_40017/blob/main/docs/src/notebooks/SCR
In this section, we want to examine another parallel algorithm called Successive Over-relaxation (SOR).
-The SOR algorithm is an iterative method used to solve Laplace equations. The underlying data structure of the SOR algorithm is a two-dimensional grid, whose elements are updated iteratively through some weighted function that considers the old value as well as the values of neighbouring cells.
-
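As a concrete example of such a weighted update (a standard weighted-Jacobi form added here for reference; it is not spelled out in the original text), let $c^k_{i,j}$ denote the value of cell $(i,j)$ at iteration $k$ and let $\omega$ be the relaxation factor:

$$
c^{k+1}_{i,j} = (1-\omega)\,c^{k}_{i,j} + \frac{\omega}{4}\left(c^{k}_{i,j+1} + c^{k}_{i,j-1} + c^{k}_{i+1,j} + c^{k}_{i-1,j}\right).
$$

For $\omega = 1$ this reduces to the plain Jacobi update used in the examples below.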
This algorithm is applied, for instance, in physics to simulate the climate or temperature of some object.
| Climate Simulation | Temperature of a metal plate |
|---|---|
| (figure) | (figure) |
The sequential SOR algorithm is as follows (where f is some update function and N,M are the grid sizes):
grid = zeros(N, M)
grid_new = zeros(N, M)
for step in 1:NSTEPS
    # Update the interior cells of the grid
    for i in 2:N-1
        for j in 2:M-1
            grid_new[i,j] = f(grid[i,j], grid[i-1,j], grid[i+1,j], grid[i,j-1], grid[i,j+1])
        end
    end
    # Swap the old and the new grid for the next step
    grid, grid_new = grid_new, grid
end
-Consider the diffusion of a chemical substance on a two-dimensional grid. The concentration of the chemical is given as $c(x,y)$, a function of the coordinates $x$ and $y$. We will consider a square grid with $0 \leq x,y \leq 1$ and the boundary conditions $c(x,y=1) = 1$ and $c(x,y=0) = 0$. That is, the concentration at the top of the grid is always 1 and the concentration at the very bottom is always 0. Furthermore, in the x-direction we will assume periodic boundary conditions, i.e. $c(x=0, y) = c(x=1,y)$.
-
We will take the initial condition $c(x,y) = 0$ for $0 \leq x \leq 1, 0 \leq y < 1$.
The stable state of the diffusion, that is, when the concentration does not change anymore, can be described by the Laplace equation

$$
\nabla^2 c = 0.
$$

Numerically, we can approximate the solution with the Jacobi iteration

$$
\frac{1}{4}(c^k_{i,j+1} + c^k_{i,j-1} + c^k_{i+1,j} + c^k_{i-1,j}) = c^{k+1}_{i,j}.
$$

The superscript $k$ denotes the $k$-th iteration. The algorithm stepwise updates the cells of the grid, until a steady state is reached. To determine the end of the algorithm, we use the stopping condition

$$
\max_{i,j} \lvert c^{k+1}_{i,j} - c^{k}_{i,j} \rvert < \epsilon.
$$

That is, we stop when all changes to cell values are smaller than some small number, say $\epsilon = 10^{-5}$.

Furthermore, for this set of initial and boundary conditions, there exists an analytical solution for the stable state, namely

$$
c(x,y) = y.
$$

That is, the concentration profile is the identity function of the y-coordinate.
# Perform one Jacobi update of the grid (the SOR! function below repeats it until the stopping threshold is reached)
-function update_grid!(new_grid, grid, M, N)
- @assert grid[1,:] == grid[M-1,:]
- @assert grid[M,:] == grid[2,:]
- @assert grid[:,1] == fill(0,M) == new_grid[:,1]
- @assert grid[:,N] == fill(1,M) == new_grid[:,N]
- # Remove ghost cells
- g_left = grid[1:M-2, 2:N-1]
- g_right = grid[3:M, 2:N-1]
- g_up = grid[2:M-1, 3:N]
- g_down = grid[2:M-1, 1:N-2]
- # Jacobi iteration
- new_grid[2:M-1,2:N-1] = 0.25 * (g_up + g_down + g_left + g_right)
- # Update ghost cells
- new_grid[1,:] = new_grid[M-1,:]
- new_grid[M,:] = new_grid[2,:]
- return new_grid
-end
-
-function SOR!(new_grid, grid, ϵ, M, N)
- grid_old = true
- while true
- if grid_old
- update_grid!(new_grid, grid, M, N)
- else
- update_grid!(grid, new_grid, M, N)
- end
- # Flip boolean value
- grid_old = !grid_old
- diffs = abs.(new_grid[2:M-1, :] - grid[2:M-1, :])
- if maximum(diffs) < ϵ
- break
- end
- end
-end
-SOR! (generic function with 1 method)-
using Test, Plots
-N = 100
-M = N + 2
ϵ = 1.0e-5
-
-# initialize grid
-grid = zeros(M,N)
-grid[:,N] .= 1
-new_grid = zeros(M,N)
-new_grid[:,N] .= 1
-
-update_grid!(new_grid, grid, M, N)
-
-# Test if first iteration successful
-@test all(new_grid[:,N-1] .≈ fill(0.25, M))
-@test all(new_grid[:,N-2] .≈ fill(0,M))
-@test all(new_grid[:,N] == fill(1,M))
-@test all(new_grid[:,1] == fill(0,M))
-Test Passed
-N = 100
-M = N + 2
ϵ = 1.0e-5
-
-# initialize grid
-grid = zeros(M,N)
-grid[:,N] .= 1
-new_grid = zeros(M,N)
-new_grid[:,N] .= 1
-
-SOR!(new_grid, grid, ϵ, M, N)
-plt = heatmap(transpose(new_grid[2:M-1,:]))
-display(plt)
-# analytical solution
-function analytical_solution(N)
- # Returns the analytical solution as a square grid of size N
- grid = zeros(N,N)
- for i in 2:N
- grid[:,i] .= (i-1)/(N-1)
- end
- return grid
-end
-
-# Test if solution is identical with analytical solution
-sol = analytical_solution(N)
-@test maximum(abs.(sol - new_grid[2:M-1,:])) < 0.01 * N
-Test Passed
-#] add MPIClusterManagers DelimitedFiles
-# to import MPIManager
-using MPIClusterManagers
-
-# need to also import Distributed to use addprocs()
-using Distributed
-
-# specify, number of mpi workers, launch cmd, etc.
-manager=MPIWorkerManager(9)
-
-# start mpi workers and add them as julia workers too.
-addprocs(manager)
-
-@mpi_do manager begin
-
-function calculate_partition(p, N, nrows, ncols)
- # Calculates the row and column indices of this processor
- # Get row and column number for processor p
- if mod(p,ncols) == 0
- i = div(p,ncols)
- else
- i = floor(div(p,ncols)) + 1
- end
- j = p - (i-1)*ncols
- # Rows
- if mod(N,nrows) == 0
- prows = div(N, nrows)
- row_range =((i-1) * prows + 1) : (i*prows)
- else
- # nlower processors get the smaller partition
- nlower = nrows - (mod(N,nrows))
- n_floor = floor(div(N,nrows))
- if i <= nlower
- row_range =((i-1) * n_floor + 1) : (i*n_floor)
- else
- row_range = ((i-1) * n_floor + (i - nlower)) : (i*n_floor + (i-nlower))
- end
- end
- # Columns
- if mod(N,ncols) == 0
- prows = div(N, ncols)
- col_range =((j-1) * prows + 1) : (j*prows)
- else
- nlower = ncols - (mod(N,ncols))
- n_floor = floor(div(N,ncols))
- if j <= nlower
- col_range =((j-1) * n_floor + 1) : (j*n_floor)
- else
- col_range = ((j-1) * n_floor + (j - nlower)) : (j*n_floor + (j-nlower))
- end
- end
- # Add 1 to each column index because of ghost cells
- col_range = col_range .+ 1
- return row_range, col_range
-end
-
-
-function update_grid(grid)
- # Returns the updated grid as M-2 x N-2 matrix where M and N are sizes of grid
- M = size(grid, 1)
- N = size(grid, 2)
- # Remove ghost cells
- g_left = grid[2:M - 1,1:N-2]
- g_right = grid[2:M - 1, 3:N]
- g_up = grid[1:M-2, 2:N-1]
- g_down = grid[3:M, 2:N-1]
- # Jacobi iteration
- return 0.25 * (g_up + g_down + g_left + g_right)
-end
-
- using MPI
- comm=MPI.COMM_WORLD
- id = MPI.Comm_rank(comm) + 1
-
-
- M = 50
- N = M + 2
    ϵ = 1.0e-5 # Stopping threshold
    nrows = 3 # Number of rows in the processor grid
    ncols = 3 # Number of columns in the processor grid
- n_procs = nrows * ncols
- @assert n_procs == MPI.Comm_size(comm)
- max_diffs = ones(n_procs) # Differences between iterations
- max_diff_buf = MPI.UBuffer(max_diffs,1) # Buffer to store maximum differences
-
-
- # initialize grid
- if id == 1
- grid_a = zeros(M,N)
- grid_a[1,:] .= 1
- grid_b = zeros(M,N)
- grid_b[1,:] .= 1
- else
- grid_a = nothing
- grid_b = nothing
- end
-
- # Broadcast matrix to other processors
- grid_a = MPI.bcast(grid_a, 0, comm)
- grid_b = MPI.bcast(grid_b, 0, comm)
-
- # Determine if processor is in top or bottom row of grid
- top_pos = id <= ncols
- bottom_pos = id > ((nrows-1) * ncols)
- local grid_a_old = false # Grid a is the source grid for the first update
-
- # Get local partition
- ind_rows, ind_cols = calculate_partition(id, M, nrows, ncols)
- println("Proc $(id) gets rows $(ind_rows) and columns $(ind_cols)")
-
- # Determine neighbors
- n_left = id - 1
- n_right = id + 1
- n_down = id + ncols
- n_up = id - ncols
- if mod(id, ncols) == 1
- # Left neighbor is last in row
- n_left = id + ncols - 1
- end
- if mod(id, ncols) == 0
- # Right neighbor is first in row
- n_right = id - ncols + 1
- end
-
- #println("Proc $(id) has neighbors left $(n_left) and right $(n_right) and up $(n_up) and down $(n_down)")
-
- local finished = false
-
- #Perform SOR
- while !finished
- # Flip old and new grid
- grid_a_old = !grid_a_old
-
- # Determine which grid is updated
- if grid_a_old
- old_grid = grid_a
- new_grid = grid_b
- else
- old_grid = grid_b
- new_grid = grid_a
- end
-
- # send left and right columns
- left_ind = first(ind_cols)
- right_ind = last(ind_cols)
- left_col = old_grid[ind_rows, left_ind]
- right_col = old_grid[ind_rows, right_ind]
- slreq = MPI.Isend(left_col, comm; dest=n_left-1)
- srreq = MPI.Isend(right_col, comm; dest=n_right-1)
-
- # Send bottom row if not bottom
- bottom_ind = last(ind_rows)
- if !bottom_pos
- bottom_col = old_grid[bottom_ind, ind_cols]
- sbreq = MPI.Isend(bottom_col, comm; dest=n_down -1)
- end
-
- # Send top row if not at the top
- top_ind = first(ind_rows)
- if !top_pos
- top_row = old_grid[top_ind, ind_cols]
- streq = MPI.Isend(top_row, comm; dest = n_up -1)
- end
-
- # Receive left and right column
- left_buf = Array{Float64,1}(undef, length(ind_rows))
- right_buf = Array{Float64,1}(undef, length(ind_rows))
- rlreq = MPI.Irecv!(left_buf, comm; source=n_left -1)
- rrreq = MPI.Irecv!(right_buf, comm; source=n_right -1)
-
- # Receive top row if not at the top
- if !top_pos
- top_buf = Array{Float64,1}(undef, length(ind_cols))
- rtreq = MPI.Irecv!(top_buf, comm; source=n_up -1)
- end
-
- # Receive bottom row if not at the bottom
- if !bottom_pos
- bottom_buf = Array{Float64,1}(undef, length(ind_cols))
- rbreq = MPI.Irecv!(bottom_buf, comm; source=n_down -1)
- end
-
- # Wait for results
- statlr = MPI.Waitall([rlreq, rrreq], MPI.Status)
- old_grid[ind_rows, left_ind - 1] = left_buf
- old_grid[ind_rows, right_ind + 1] = right_buf
- #println("Proc $(id) received left $(old_grid[ind_rows, left_ind - 1]) and right $(old_grid[ind_rows, right_ind + 1])")
-
- if !top_pos
- statt = MPI.Wait(rtreq)
- old_grid[top_ind - 1, ind_cols] = top_buf
- #println("Proc $(id) received top $(old_grid[top_ind - 1, ind_cols])")
- end
-
- if !bottom_pos
- statb = MPI.Wait(rbreq)
- old_grid[bottom_ind + 1, ind_cols] = bottom_buf
- #println("Proc $(id) received bottom $(old_grid[bottom_ind + 1, ind_cols])")
- end
-
- # Get local subgrid
- if !top_pos & !bottom_pos
- local_with_ghosts = old_grid[top_ind - 1 : bottom_ind + 1, left_ind - 1 : right_ind + 1]
- lb_row = top_ind
- ub_row = bottom_ind
- elseif top_pos
- local_with_ghosts = old_grid[top_ind : bottom_ind + 1, left_ind - 1 : right_ind + 1]
- lb_row = top_ind + 1
- ub_row = bottom_ind
- elseif bottom_pos
- local_with_ghosts = old_grid[top_ind - 1 : bottom_ind, left_ind - 1 : right_ind + 1]
- lb_row = top_ind
- ub_row = bottom_ind - 1
- end
-
- # Perform one step of Jacobi iteration
- new_grid[lb_row : ub_row, left_ind : right_ind] = update_grid(local_with_ghosts)
-
- # Calculate max difference
- diffs = abs.(new_grid[lb_row : ub_row, left_ind : right_ind] - old_grid[lb_row : ub_row, left_ind : right_ind])
- maxdiff = maximum(diffs)
-
- # Gather maxdiffs in processor 1
- MPI.Gather!(maxdiff, max_diff_buf, comm; root=0)
-
    # First processor determines whether all changes are below the threshold
- if id == 1
- if all(max_diffs .< ϵ)
- finished = true
- println("THRESHOLD SUBCEEDED - TERMINATE SOR")
- end
- end
-
- finished = MPI.bcast(finished, 0, comm)
-
- if finished
- # Set ghost cells to zero again so MPI.Reduce gives correct output
- new_grid[ind_rows, left_ind - 1] .= 0.0
- new_grid[ind_rows, right_ind + 1] .= 0.0
- if !bottom_pos
- new_grid[bottom_ind + 1, ind_cols] .= 0.0
- end
- if !top_pos
- new_grid[top_ind - 1, ind_cols] .= 0.0
- end
- end
- end
-
- using DelimitedFiles
-
- # Reduce matrix & store result
- if !grid_a_old
- sor_result = grid_a
- else
- sor_result = grid_b
- end
-
- MPI.Reduce!(sor_result, +, comm, root = 0)
- sor_result[1,:] .= 1.0
- if id == 1
- writedlm("SOR_result.txt", sor_result)
- end
-
- MPI.Finalize()
-end
      From worker 5:    Proc 4 gets rows 17:33 and columns 2:17
      From worker 8:    Proc 7 gets rows 34:50 and columns 2:17
      From worker 3:    Proc 2 gets rows 1:16 and columns 18:34
      From worker 6:    Proc 5 gets rows 17:33 and columns 18:34
      From worker 9:    Proc 8 gets rows 34:50 and columns 18:34
      From worker 4:    Proc 3 gets rows 1:16 and columns 35:51
      From worker 10:   Proc 9 gets rows 34:50 and columns 35:51
      From worker 7:    Proc 6 gets rows 17:33 and columns 35:51
      From worker 2:    Proc 1 gets rows 1:16 and columns 2:17
      From worker 2:    THRESHOLD SUBCEEDED - TERMINATE SOR
rmprocs(manager);
-using Plots, DelimitedFiles
-final_grid = readdlm("SOR_result.txt")
-M = size(final_grid, 1)
-N = size(final_grid, 2)
-plt = heatmap(final_grid[:, 2:N-1])
-display(plt)
-# Test if solution is identical with analytical solution
-sol = analytical_solution(M)
-# Bring solution into correct form
-sol = reverse(transpose(sol), dims = 1)
-@test maximum(abs.(sol - final_grid[:,2:N-1])) < 0.01 * M
-Test Passed
-
$u(x) = (2x-L)/L$
- -function jacobi(n,nsteps)
- u = zeros(n+2)
- u_new = zeros(n+2)
- u[1] = -1
- u[end] = 1
- for istep in 1:nsteps
- for i in 2:(n+1)
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- u[2:end-1] = u_new[2:end-1]
- end
- u
-end
-jacobi (generic function with 1 method)-
jacobi(5,1000)
-7-element Vector{Float64}:
- -1.0
- -0.6666666666666666
- -0.3333333333333333
- 0.0
- 0.3333333333333333
- 0.6666666666666666
- 1.0
-u_new[i] = 0.5*(u[i-1]+u[i+1])
The most natural way of implementing a distributed Jacobi method is to use a library that provides distributed arrays. However, as an academic exercise, we will implement it here using low-level Julia functions.
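For illustration only, here is a minimal sketch of that idea using the external package DistributedArrays.jl (our own example; this package is not used in the rest of the notebook, must be installed separately, and we assume some workers have already been added with addprocs):

using Distributed
@everywhere using DistributedArrays
u = dzeros(12)   # distributed vector of zeros, partitioned across the workers
for w in workers()
    # localindices returns the index range owned by worker w
    println("worker $w owns indices ", @fetchfrom w localindices(u))
end

Each worker could then update its local block and only exchange the values at the partition boundaries, which is exactly what we implement by hand below.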
- -using Distributed
-addprocs(3)
-@everywhere function initialize_vectors(n)
- @assert mod(n,nworkers()) == 0
- local_n = div(n,nworkers())
- u = zeros(local_n+2)
- u_new = zeros(local_n+2)
- u[1] = -1.0
- u[end] = 1.0
- u,u_new
-end
-@everywhere function compute_error(u,n)
- local_n = length(u)-2
- u_exact = copy(u)
- p = myid() - 1
- for local_i in 0:(local_n+1)
- global_i = local_i + (p-1)*local_n + 1
- Δ = 1/(n+1)
- u_exact[local_i+1] = 2*(Δ*global_i-Δ)-1
- end
- maximum(abs.(u[2:end-1].-u_exact[2:end-1]))
-end
-function jacobi_dist(n,nsteps)
- nw = nworkers()
- ftrs_prev_snd = Vector{Future}(undef,nw)
- ftrs_next_snd = Vector{Future}(undef,nw)
- fun = ()->Channel{Float64}(1)
- for w in workers()
- p = w-1
- ftrs_prev_snd[p] = @spawnat w RemoteChannel(fun,myid())
- ftrs_next_snd[p] = @spawnat w RemoteChannel(fun,myid())
- end
- ftrs = Vector{Future}(undef,nw)
- @sync for w in workers()
- ftrs[w-1] = @spawnat w jacobi_worker(
- n,nsteps,ftrs_prev_snd,ftrs_next_snd)
- end
- reduce(max,fetch.(ftrs))
-end
-@everywhere function jacobi_worker(
- n,nsteps,ftrs_prev_snd,ftrs_next_snd)
- u, u_new = initialize_vectors(n)
- w = myid(); p = w-1
- # TODO: Setup channels
- #chn_prev_snd
- #chn_prev_rcv
- #chn_next_snd
- #chn_next_rcv
- for step in 1:nsteps
- # TODO: Communicate
- # Local computations
- for i in 2:(length(u)-1)
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- u[2:end-1] = u_new[2:end-1]
- end
- @show u
- err = compute_error(u,n)
-end
-err = jacobi_dist(6,1000)
-@everywhere function jacobi_worker(
- n,nsteps,ftrs_prev_snd,ftrs_next_snd)
- u, u_new = initialize_vectors(n)
- p = myid()-1
- chn_prev_snd = fetch(ftrs_prev_snd[p])
- chn_next_snd = fetch(ftrs_next_snd[p])
- if myid()!=2
- chn_prev_rcv = fetch(ftrs_next_snd[p-1])
- end
- if myid() != nprocs()
- chn_next_rcv = fetch(ftrs_prev_snd[p+1])
- end
- for step in 1:nsteps
- # TODO overlap communication and computation
- if myid() != 2
- put!(chn_prev_snd,u[2])
- u[1] = take!(chn_prev_rcv)
- end
- if myid() != nprocs()
- put!(chn_next_snd,u[end-1])
- u[end] = take!(chn_next_rcv)
- end
- for i in 2:(length(u)-1)
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- u[2:end-1] = u_new[2:end-1]
- end
- @show u
- err = compute_error(u,n)
-end
-@everywhere function jacobi_worker(
- n,nsteps,ftrs_prev_snd,ftrs_next_snd)
- u, u_new = initialize_vectors(n)
- p = myid()-1
- chn_prev_snd = fetch(ftrs_prev_snd[p])
- chn_next_snd = fetch(ftrs_next_snd[p])
- if myid()!=2
- chn_prev_rcv = fetch(ftrs_next_snd[p-1])
- end
- if myid() != nprocs()
- chn_next_rcv = fetch(ftrs_prev_snd[p+1])
- end
- for step in 1:nsteps
- if myid() != 2
- put!(chn_prev_snd,u[2])
- u[1] = take!(chn_prev_rcv)
- end
- if myid() != nprocs()
- put!(chn_next_snd,u[end-1])
- u[end] = take!(chn_next_rcv)
- end
- # Local computations
- for i in 2:(length(u)-1)
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- u[2:end-1] = u_new[2:end-1]
- end
- @show u
- err = compute_error(u,n)
-end
-@everywhere function jacobi_worker(
- n,nsteps,ftrs_prev_snd,ftrs_next_snd)
- u, u_new = initialize_vectors(n)
- p = myid()-1
- chn_prev_snd = fetch(ftrs_prev_snd[p])
- chn_next_snd = fetch(ftrs_next_snd[p])
- if myid()!=2
- chn_prev_rcv = fetch(ftrs_next_snd[p-1])
- end
- if myid() != nprocs()
- chn_next_rcv = fetch(ftrs_prev_snd[p+1])
- end
- for step in 1:nsteps
- if myid() != 2
- put!(chn_prev_snd,u[2])
- t1 = @async u[1] = take!(chn_prev_rcv)
- end
- if myid() != nprocs()
- put!(chn_next_snd,u[end-1])
- t2 = @async u[end] = take!(chn_next_rcv)
- end
- # Local computations (interior)
- for i in 3:(length(u)-2)
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- # Wait for ghost values
- if myid() != 2
- wait(t1)
- end
- if myid() != nprocs()
- wait(t2)
- end
- # Update near boundary values
- for i in [2,length(u)-1]
- u_new[i] = 0.5*(u[i-1]+u[i+1])
- end
- u[2:end-1] = u_new[2:end-1]
- end
- @show u
- err = compute_error(u,n)
-end
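With this final version of jacobi_worker in place, the driver from before can be run once more. The tolerance below is our own illustrative choice and not part of the original notebook:

using Test
err = jacobi_dist(6,1000)
@test err < 1.0e-6  # after 1000 sweeps the discrete solution should be fully converged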
Julia has its own way of running code and using packages. Many educational sources about Julia assume that you have this basic knowledge, which can be confusing to new users. In this lesson, we will learn these basic skills so that you can continue learning Julia on your own.
This is a hands-on tutorial: to follow it, run the commands yourself in a Julia session as you read.
-There are several ways of opening Julia depending on your operating system and your IDE, but it is usually as simple as launching the Julia app. With VSCode, open a folder (File > Open Folder). Then, press Ctrl+Shift+P to open the command bar, and execute Julia: Start REPL. If this does not work, make sure you have the Julia extension for VSCode installed. Independently of the method you use, opening Julia results in a window with some text ending with:
julia>
-
You have just opened the Julia read-evaluate-print loop, or simply the Julia REPL. Congrats! You will spend most of your time in the REPL when working in Julia. The REPL is a console waiting for user input. Just as in other consoles, the string of text right before the input area (julia> in this case) is called the command prompt or simply the prompt.
The usage is simple: type an expression at the prompt and press Enter to evaluate it.
-For instance, try this
-julia> 1 + 1
-A "Hello world" example looks like this in Julia
-julia> println("Hello, world!")
-Try to run it in the REPL.
Curious about what function println does? Enter help mode to look into the documentation. This is done by typing a question mark (?) into the input field:
julia> ?
-After typing ?, the command prompt changes to help?>. It means we are in help mode. Now, we can type a function name to see its documentation.
help?> println
-The REPL comes with two more modes, namely package and shell modes. To enter package mode type
-julia> ]
-Package mode is used to install and manage packages. We are going to discuss the package mode in greater detail later. To return back to normal mode press the backspace key several times.
-To enter shell mode type semicolon (;)
julia> ;
-The prompt should have changed to shell> indicating that we are in shell mode. Now you can type commands that you would normally do on your system command line. For instance,
shell> ls
-will display the contents of the current folder in Mac or Linux. Using shell mode in Windows is not straightforward, and thus not recommended for beginners.
- -Real-world Julia programs are not typed in the REPL in practice. They are written in one or more files and included in the REPL. To try this, create a new file called hello.jl, write the code of the "Hello world" example above, and save it. If you are using VSCode, you can create the file using File > New File > Julia File. Once the file is saved with the name hello.jl, execute it as follows
julia> include("hello.jl")
-"hello.jl" is located in the current working directory of your Julia session. You can query the current directory with function pwd(). You can change to another directory with function cd() if needed. Also, make sure that the file extension is .jl.
The recommended way of running Julia code is using the REPL as we did. But it is also possible to run code directly from the system command line. To this end, open a terminal and call Julia followed by the path to the file containing the code you want to execute.
-$ julia hello.jl
-
The previous line assumes that you have Julia properly installed on the system and that it is usable from the terminal. In UNIX systems (Linux and Mac), the Julia binary needs to be in one of the directories listed in the PATH environment variable. To check that Julia is properly installed, you can use
$ julia --version
-
-If this runs without error and you see a version number, you are good to go!
If a command is preceded by $, it should be run in the terminal. Otherwise, the code is to be run in the Julia REPL.
-Since we are in a parallel computing course, let's run a parallel "hello world" example in Julia. Open a Julia REPL and write
-julia> using Distributed
-julia> @everywhere println("Hello, world! I am proc $(myid()) from $(nprocs())")
-Here, we are using the Distributed package, which is part of the Julia standard library that provides distributed memory parallel support. The code prints the process id and the number of processes in the current Julia session.
You will probably only see output from 1 process. We need to add more processes to run the example in parallel. This is done with the addprocs function.
julia> addprocs(3)
We have added 3 new processes; together with the original one, we now have 4 processes. Run the code again.
-julia> @everywhere println("Hello, world! I am proc $(myid()) from $(nprocs())")
-Now, you should see output from 4 processes.
-It is possible to specify the number of processes when starting Julia from the terminal with the -p argument (useful, e.g., when running in a cluster). If you launch Julia from the terminal as
$ julia -p 3
-
-and then run
-julia> @everywhere println("Hello, world! I am proc $(myid()) from $(nprocs())")
-You should get output from 4 processes as before.
- -One of the most useful features of Julia is its package manager. It allows one to install Julia packages in a straightforward and platform independent way. To illustrate this, let us consider the following parallel "Hello world" example.
-Copy the following block of code into a new file named "hello_mpi.jl"
# file hello_mpi.jl
-using MPI
-MPI.Init()
-comm = MPI.COMM_WORLD
-rank = MPI.Comm_rank(comm)
-nranks = MPI.Comm_size(comm)
-println("Hello world, I am rank $rank of $nranks")
-As you can see from this example, one can access MPI from Julia in a clean way, without type annotations and other complexities of C/C++ code.
-Now, run the file from the REPL
julia> include("hello_mpi.jl")
It probably didn't work, right? Read the error message and note that the MPI package needs to be installed to run this code.
-To install a package, we need to enter package mode. Remember that we entered into help mode by typing ?. Package mode is activated by typing ]
julia> ]
At this point, the prompt should have changed to (@v1.8) pkg> indicating that we are in package mode. The text between parentheses indicates which project is active, i.e., where packages are going to be installed. In this case, we are working with the global project associated with our Julia installation (which is Julia 1.8 in this example, but it can be another version in your case).
To install the MPI package, type
-(@v1.8) pkg> add MPI
-Congrats, you have installed MPI!
Julia package names conventionally end with .jl. This is just a way of signaling that a package is written in Julia. When using such packages, the .jl needs to be omitted. In this case, we have installed the MPI.jl package even though we have only typed MPI in the REPL.
The Julia interface to MPI is provided by the MPI.jl package. Note that it is not an MPI library by itself. It is just a thin wrapper between MPI and Julia. To use this interface, you need an actual MPI library installed in your system, such as OpenMPI or MPICH. By default, Julia downloads and installs an MPI library for you, but it is also possible to use an MPI library already available in your system. This is useful, e.g., when running on HPC clusters. See the documentation of MPI.jl for further details.
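As an aside (our addition; the exact workflow depends on the MPI.jl version you have, so check its documentation): in recent versions of MPI.jl, the MPI library to use is selected through the companion package MPIPreferences.jl, roughly as follows.

(@v1.8) pkg> add MPIPreferences
julia> using MPIPreferences
julia> MPIPreferences.use_system_binary()

After changing this preference, Julia needs to be restarted so that MPI.jl picks up the new MPI library.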
-To check that the package was installed properly, exit package mode by pressing the backspace key several times, and run it again
julia> include("hello_mpi.jl")
Now it should work, but you probably get output from a single MPI rank only.
- -To run MPI applications in parallel, you need a launcher like mpiexec. MPI codes written in Julia are not an exception to this rule. From the system terminal, you can run
$ mpiexec -np 4 julia hello_mpi.jl
-
But it will probably not work, since the version of mpiexec needs to match the MPI library we are using from Julia. You can find the path to the mpiexec binary you need to use with these commands
julia> using MPI
-julia> MPI.mpiexec_path
-and then try again
-$ /path/to/my/mpiexec -np 4 julia hello_mpi.jl
-
-with your particular path.
-However, this is not very convenient. Don't worry if you could not make it work! A more elegant way to run MPI code is from the Julia REPL directly, by using these commands:
-julia> using MPI
-julia> mpiexec(cmd->run(`$cmd -np 4 julia hello_mpi.jl`))
-Now, you should see output from 4 ranks.
- -We have installed the MPI package globally and it will be available in all Julia sessions. However, in some situations, we want to work with different versions of the same package or to install packages in an isolated way to avoid potential conflicts with other packages. This can be done by using local projects.
A project is simply a folder in the hard disk. To use a particular folder as your project, you need to activate it. This is done by entering package mode and using the activate command followed by the path to the folder you want to activate.
(@v1.8) pkg> activate .
The previous command will activate a project in the current working directory. Note that the dot . is indeed the path to the current folder.
The prompt has changed to (lessons) pkg> indicating that we are in the project within the lessons folder. The particular folder name can be different in your case.
A project can also be activated when starting Julia from the terminal with the --project flag. The command $ julia --project=. will open Julia and activate the project in the current directory. You can also achieve the same effect by setting the environment variable JULIA_PROJECT to the path of the folder you want to activate.
Note that activating a project is independent of the current working directory: running (@v1.8) pkg> activate folderB and then julia> cd("folderA") will activate the project in folderB and change the current working directory to folderA.
-At this point all package-related operations will be local to the new project. For instance, install the DataFrames package.
(lessons) pkg> add DataFrames
-Use the package to check that it is installed
-julia> using DataFrames
-julia> DataFrame(a=[1,2],b=[3,4])
-Now, we can return to the global project to check that DataFrames has not been installed there. To return to the global environment, use activate without a folder name.
(lessons) pkg> activate
-The prompt is again (@v1.8) pkg>
Now, try to use DataFrames.
julia> using DataFrames
-julia> DataFrame(a=[1,2],b=[3,4])
-You should get an error or a warning unless you already had DataFrames installed globally.
The information about a project is stored in two files Project.toml and Manifest.toml.
Project.toml contains the packages explicitly installed (the direct dependencies)
Manifest.toml contains direct and indirect dependencies along with the concrete version of each package.
In other words, Project.toml contains the packages relevant for the user, whereas Manifest.toml is a detailed snapshot of all dependencies. The Manifest.toml can be used to reproduce the same environment on another machine.
You can see the path to the current Project.toml file by using the status operator (or st in its short form) while in package mode
(@v1.8) pkg> status
-The information about the Manifest.toml can be inspected by passing the -m flag.
(@v1.8) pkg> status -m
-Project files can be used to install lists of packages defined by others. E.g., to install all the dependencies of a Julia application.
-Assume that a colleague has sent to you a Project.toml file with this content:
[deps]
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
-MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
-
-Copy the contents of previous code block into a file called Project.toml and place it in an empty folder named newproject. It is important that the file is named Project.toml. You can create a new folder from the REPL with
julia> mkdir("newproject")
-To install all the packages registered in this file you need to activate the folder containing your Project.toml file
(@v1.8) pkg> activate newproject
and then instantiate it
-(newproject) pkg> instantiate
The instantiate command will download and install all the listed packages and their dependencies in one go.
- -You can get help about a particular package operator by writing help in front of it
(@v1.8) pkg> help activate
-You can get an overview of all package commands by typing help alone
(@v1.8) pkg> help
In some situations it is required to use package commands in Julia code, e.g., to automate the installation and deployment of Julia applications. This can be done using the Pkg package. For instance
julia> using Pkg
-julia> Pkg.status()
is equivalent to calling status in package mode.
(@v1.8) pkg> status
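Other package-mode commands have programmatic counterparts in Pkg as well. For instance (a small sketch; the package name is just an example):

julia> using Pkg
julia> Pkg.activate(".")        # same as: pkg> activate .
julia> Pkg.add("DataFrames")    # same as: pkg> add DataFrames
julia> Pkg.instantiate()        # same as: pkg> instantiate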
We have learned the basics of how to work with Julia. Now, you should be ready to start learning more about the language. If you want to dig further into the topics we have covered here, you can take a look at the following links
-If you want to interact with the Julia community on discourse, sign in at https://discourse.julialang.org/
- -In this notebook, we will learn the basics of asynchronous programming in Julia. In particular, we will learn about:
Understanding these concepts is important for learning distributed computing later.
A task is a piece of computational work that can be run asynchronously (i.e., in the background). To create a task, we first need a function that represents the work to be done in the task. In the next cell, we create a task that generates and sums two matrices.
- -function work()
- println("Starting work")
- sleep(7)
- a = rand(3,3)
- b = rand(3,3)
- r = a + b
- println("Finishing work")
- r
-end
-t = Task(work)
-The task has been created, but the corresponding work has not started. Note that we do not see any output from function work yet. To run the task we need to schedule it.
schedule(t)
-The task has been executed, but we do not see the result. To get the result we need to fetch it.
- -fetch(t)
-It is important to note that tasks run asynchronously. To illustrate this let's create and schedule a new task.
- -t = Task(work)
-schedule(t)
-Note that while the task is running we can execute Julia code. To check this, execute the next two cells while the task is running.
- -sin(4π)*exp(-0.1)
-1 + 1
How is this possible? Tasks run in the background, and this particular task is sleeping most of the time. Thus, it is possible to use the current Julia process for other operations while the task is sleeping.
It is also important to note that tasks do not run in parallel. We were able to run code while the previous task was running because the task was idling most of the time. If the task does actual work, the current process will be busy running it, and it is likely that we cannot run other code at the same time. Let's illustrate this with an example. The following code computes an approximation of $\pi$ using the Leibniz formula. The quality of the approximation increases with the value of n.
function compute_π(n)
- s = 1.0
- for i in 1:n
- s += (isodd(i) ? -1 : 1) / (i*2+1)
- end
- 4*s
-end
-Call this function with a large number. Note that it will take some time.
- -compute_π(4_000_000_000)
-Create a task that performs this computation.
- -fun = () -> compute_π(4_000_000_000)
-t = Task(fun)
Schedule the task and then try to execute the second cell below. Note that the current process will be busy running the task.
- -schedule(t)
-1+1
The function we pass to a Task needs to have zero arguments, but it can capture variables if needed. If we try to create a task with a function that has arguments, it will result in an error when we schedule it.
- -add(a,b) = a + b
-t = Task(add)
-schedule(t)
-If we need, we can capture variables in the function to be run by the task as shown in the next cells.
- -a = rand(3,3)
-b = rand(3,3);
-fun = () -> a + b
-t = Task(fun)
-schedule(t)
-@async¶So far, we have created tasks using low-level functions, but there are more convenient ways of creating and scheduling tasks. For instance using the @async macro. This macro is used to run a piece of code asynchronously. Under the hood it puts the code in an anonymous function, creates a task, and schedules it. For instance, the next cell is equivalent to previous one.
@async a + b
-@sync¶This macro is used to wait for all the tasks created with @async in a given block of code.
@sync begin
- @async sleep(3)
- @async sleep(4)
-end
Julia provides channels as a way to send data between tasks. A channel is like a FIFO queue into which tasks can put values and from which they can take them. In the next example, we create a channel and a task that puts five values into the channel. Finally, the task closes the channel.
- -chnl = Channel{Int}()
-@async begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end
By executing the next cell several times, we will get the values from the channel. We are indeed communicating values between two different tasks. If we execute the cell more than 5 times, it will raise an error since the channel is closed.
- -take!(chnl)
-Instead of taking values from a channel until an error occurs, we can also iterate over the channel in a for loop until the channel is closed.
- -chnl = Channel{Int}()
-@async begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end
-for i in chnl
- @show i
-end
put! and take! are blocking¶Note that put! and take! are blocking operations. Calling put! blocks the task until another task calls take!, and vice versa. Thus, we need at least 2 tasks for this to work. If we call put! and take! from the same task, it will result in a deadlock. We have added a print statement to the previous example. Run it again and note how put! blocks until we call take!.
chnl = Channel{Int}()
-@async begin
- for i in 1:5
- put!(chnl,i)
- println("I have put $i")
- end
- close(chnl)
-end
-take!(chnl)
We can be a bit more flexible and use a buffered channel. In this case, put! will block only if the channel is full and take! will block only if the channel is empty. We repeat the previous example, but with a buffered channel of size 2. Note that we can call put! until the channel is full. At that point, we need to wait until we call take!, which removes an item from the channel, making room for a new one.
buffer_size = 2
-chnl = Channel{Int}(buffer_size)
-@async begin
- for i in 1:5
- put!(chnl,i)
- println("I have put $i")
- end
- close(chnl)
-end
-take!(chnl)
-t = @elapsed compute_π(100_000_000)
-a) 10*t
-b) t
-c) 0.1*t
-d) near 0*t
-
-
-@time for i in 1:10
- compute_π(100_000_000)
-end
-a) 10*t
-b) t
-c) 0.1*t
-d) near 0*t
-
-
-
-@time for i in 1:10
- @async compute_π(100_000_000)
-end
-a) 10*t
-b) t
-c) 0.1*t
-d) near 0*t
-
-
-@time @sync for i in 1:10
- @async compute_π(100_000_000)
-end
-a) infinity
-b) 1 second
-c) near 0 seconds
-d) 3 seconds
-
-
-buffer_size = 4
-chnl = Channel{Int}(buffer_size)
-@time begin
- put!(chnl,3)
- i = take!(chnl)
- sleep(i)
-end
-a) infinity
-b) 1 second
-c) near 0 seconds
-d) 3 seconds
-
-
-chnl = Channel{Int}()
-@time begin
- put!(chnl,3)
- i = take!(chnl)
- sleep(i)
-end
-In this notebook, we will cover the basic parts of Julia needed later to learn parallel computing. In particular, we will learn about:
-For a more general introduction to Julia see the nice tutorials made available by JuliaAcademy here or the official Julia educational resources here.
We are going to use Jupyter notebooks in this and other lectures. You have probably worked with notebooks before (in Python). If not, here are the basic concepts you need to know to follow the lessons.
To run a Julia Jupyter notebook, open a Julia REPL and type
-julia> ]
-pkg> add IJulia
-julia> using IJulia
-julia> notebook()
-A new browser window will open. Navigate to the corresponding notebook and open it.
-To run a cell, click on a cell and press Shift + Enter. You can also use the "Run" button in the toolbar above.
1+3
-4*5
-As you can see from the output of previous cell, the value of the last line is displayed. We can suppress the output with a semicolon. Try it. Execute next cell.
- -1+3
-4*5;
-Running the two cells below in reverse order won't work (try it).
- -foo() = "Well done!"
-foo()
-This is particular to Julia notebooks. You can use package, help, and shell mode just like in the Julia REPL.
- -] add MPI
-? print
-; ls
-a = 1
-When assigning a variable, the value on the right hand side is not copied into the variable. It is just an association of a name with a value (much like in Python).
- -We can re-assign a variable, even with a value of another type. However, avoid changing the variable type for performance reasons.
- -a = 2
-a = 1.0
-a = "Hi!"
-When an object is not associated with a variable any more, it cannot be reached by the user. This can happen, e.g., when we re-assign a variable. Another case is when local variables, e.g in a function, go out of scope. The following line allocates a large array and assigns it to variable a
- -a = zeros(300000000);
-If we re-assign the variable to another value, the large array will be inaccessible.
- -a = nothing
-Luckily, Julia has a garbage collector that deallocates unreachable objects. You don't need to bother about manual deallocation! Julia is not constantly looking for unreachable objects. Thus, garbage collection does not happen instantaneously, but it will happen at some point. You can also explicitly call the garbage collector, but it is almost never done in practice.
- -GC.gc()
-Julia knows the type of the object associated with a variable.
- -a = 1
-typeof(a)
-We can annotate types if we want, but this will not improve performance (except in very special situations). Thus, annotating types is not done in practice.
- -c::Int = 1
-typeof(c)
-If you annotate a variable with a type, then it cannot refer to objects of other types.
- -c = "I am a string"
-We can use Unicode (UTF-8 encoding) characters in variables and function names.
- -🐱 = "I am a cat"
-🐶 = "I am a dog"
-🐱 == 🐶
-We can also use Greek letters and other mathematical symbols. Just write the corresponding LaTeX command and press Tab. For example: ω is written \omega + Tab
ω = 1.234
-sin(ω)
-In fact, some useful mathematical constants are predefined in Julia with math Greek letters.
- -sin(π/2)
-x = 1
-y = x
-y = 2
-x
-Julia is very much a functional programming language. In consequence, Julia is more centered on functions than on types. This is in contrast to object-oriented languages, which are more centered on types (classes). For instance, you don't need to know the details of the Julia type system to learn parallel programming in Julia, but you need to have a quite advanced knowledge of how Julia functions work.
Functions are defined as shown in the next cell. The closing end is necessary. Do not forget it! However, the return is optional. The value of the last line is returned by default. Indentation is recommended, but it is also optional. That's why the closing end is needed.
function add(a,b)
- return a + b
-end
Once defined, a function can be called using parentheses, as you would expect.
- -add(1,3)
-We can apply functions to arrays element by element using broadcast (dot) syntax.
- -a = [1,2,3]
-b = [4,5,6]
-add.(a,b)
Mathematical operators can also be broadcasted (like in Matlab). Multiplying two vectors with the plain * operator won't work; if we want to multiply element by element, we can use the broadcasted version below.
- -a .* b
-function q(x)
- x = 2
- x
-end
-x = 1
-y = q(x)
-x
-For short functions, we can skip the function and end keywords as follows.
add_short(a,b) = a+b
-add_short(1,3)
Since we can assign functions to variables, a function often does not need a name at all. We can simply create an anonymous function (i.e., a function without a name) and assign it to a variable.
- -add_anonymous = (a,b) -> a+b
-We can call the function by using the variable name.
- -add_anonymous(2.0,3.5)
-Note that add_anonymous is not a function name. It is just a variable associated with a function with no function name (well, it has a name technically, but with an arbitrary value).
nameof(add_anonymous)
-nameof(add)
-We can work with Julia functions like with any other type of object. For instance, we can assign functions to variables.
- -a = add
-Now, we can call the function using the variable name.
- -a(4,5)
We can also create an array of functions. Note that the operators +, -, and * are ordinary functions in Julia; in Python, by contrast, operator symbols are not first-class objects.
- -funs = [+,-,*]
-To call a specific function in the array, we index the array and then call the returned function
- -funs[2](2,3)
Higher-order functions are functions that take and/or return other functions. An example is the count function in Julia.
For instance, we can pass a user-defined function to count the number of even elements in an array.
- -func = i->i%2==0
-a = [1,2,3,5,32,2,4]
-count(func,a)
There is yet another way to define anonymous functions. If a function takes a function as its first argument (like count), we can skip that argument when calling it and define the function we want to pass in a do-block. This is useful, e.g., if we want to define a multi-line anonymous function. The next two cells are equivalent.
function f(i)
- m = i%2
- m != 0
-end
-count(f,[1,2,3,5,32,2,4])
-count([1,2,3,5,32,2,4]) do i
- m = i%2
- m != 0
-end
Julia functions always return a single value. To return multiple values, we can wrap them in a tuple.
- -function divrem(a,b)
- α = div(a,b)
- β = rem(a,b)
- (α,β)
-end
-The output is a tuple as expected, but we can recover the individual values by unpacking the tuple.
- -d,r = divrem(10,3)
-d
-r
Functions with a variable number of arguments are also supported. The following example iterates over the given arguments and prints them. args is just a tuple with all the arguments.
function showargs(args...)
- for (i,arg) in enumerate(args)
- println("args[$i] = $arg")
- end
-end
-showargs(1,"Hi!",π)
-showargs(6)
Functions can combine positional and keyword arguments much like in Python, but in Julia the keyword arguments come after a semicolon (;) in the function signature.
function foo(a,b;c,d)
- println("Positional: a=$a, b=$b. Keyword: c=$c, d=$d")
-end
-foo(3,4,d=2,c=1)
-We can provide default values to arguments to make them optional.
- -function bar(a,b=0;c,d=1)
- println("Positional: a=$a, b=$b. Keyword: c=$c, d=$d")
-end
-bar(1,c=2)
-function hofun(x)
- y -> x*y
-end
-f2 = hofun(2)
-x = f2(3)
-x
-Julia supports multi-dimensional arrays. They are very similar to Numpy arrays in Python. Let's learn the basics of Julia arrays.
-We can create (small) arrays from the given values using array literals.
The next cell creates a vector with 3 integers. Note for Python users: Julia has no separate list type; a Vector plays that role.
- -vec = [1,2,3]
-We can create a matrix as follows.
- -mat = [1 2 3 4
- 5 6 7 8
- 9 10 11 12]
-We can create arrays with all the entries equal to zero, to one, or to a specific given value. The value can be any Julia object, even a function!
- -zeros(4)
-zeros(Int,4)
-ones(2,3)
-fill(5.0,3,4)
-fill(add,3,4)
-We can also create the items in the array using a loop within an array comprehension.
- -squares = [ i^2 for i in 1:8 ]
-We can get and set the items of an array by indexing the array.
- -squares[3]
-squares[end]
-squares[2:4]
-squares[4] = 16
-squares[2:3] = [4,9]
Note that once the array is created, the type of its elements cannot be changed. If we try to set an item with an object of a different type, Julia will try to do a conversion, which can fail depending on the passed value.
- -a = [10,11,12,13]
-a[2] = "Hi!"
Arrays with a fixed element type seem to be very rigid, right? Python lists do not have this limitation. However, we can use arrays with element type Any, which are as flexible as Python lists.
a = Any[10,11,12,13]
-a[3] = "HI!"
-a
The loop in the next cell visits the elements of a one after the other.
a = [10,20,30,40]
-for ai in a
- @show ai
-end
-This loop visits the integers from 1 to the length of the array and indexes the array at each of these integers.
- -for i in 1:length(a)
- ai = a[i]
- @show (i,ai)
-end
-This loop "enumerates" the items in the array.
- -for (i,ai) in enumerate(a)
- @show (i,ai)
-end
Julia arrays use 1-based indexing. Be aware of this if you are a C or Python user: as the next cell shows, index 0 is out of bounds.
- -a = [10,20,30,40]
-a[0]
Slicing an array creates a copy of the selected elements, not a view into the original array. This is also different from NumPy in Python.
- -a = [1 2 3
- 4 5 6]
-s = a[:,2]
-s[2] = 0
-a
-If you want to modify the original array, use view instead.
v = view(a,:,2)
-v[1] = 0
-a
Implement a function ex1(a) that finds the largest item in the array a. It should return the largest item and its corresponding position in the array. If there are multiple maximal elements, the first one should be returned. Assume that the array is not empty. Implement the function in the next cell and test your implementation with the cell that follows.
# Implement here
-using Test
-arr = [3,4,7,3,1,7,2]
-@test ex1(arr) == (7,3)
-Implement a function ex2(f,g) that takes two functions f(x) and g(x) and returns a new function h(x) representing the sum of f and g, i.e., h(x)=f(x)+g(x).
# Implement here
-h = ex2(sin,cos)
-xs = LinRange(0,2π,100)
-@test all(x-> h(x) == sin(x)+cos(x), xs)
-Function mandel estimates if a given point (x,y) in the complex plane belongs to the Mandelbrot set.
function mandel(x,y,max_iters)
- z = Complex(x,y)
- c = z
- threshold=2
- for n in 1:max_iters
- if abs(z)>threshold
- return n-1
- end
- z = z^2 +c
- end
- max_iters
-end
If the value of mandel is less than max_iters, the point is certainly outside the Mandelbrot set. If mandel is equal to max_iters, then the point is probably inside the set. The larger max_iters, the better the quality of the estimate (and the nicer your plot will be).
Plot the value of function mandel for each pixel in a 2D grid of the box.
Use a grid resolution of at least 1000 points in each direction and max_iters at least 10. You can increase these values to get nicer plots. To plot the values use function heatmap from the Julia package GLMakie. Use LinRange to divide the horizontal and vertical axes into pixels. See the documentation of these functions for help. GLMakie is a GPU-accelerated plotting back-end for Julia. It is a large package and it can take some time to install and to generate the first plot. Be patient.
# Implement here
-In this notebook, we will learn the basics of distributed computing in Julia. In particular, we will focus on the Distributed module available in the Julia standard library. The main topics we are going to cover are:
-With this knowledge you will be able to implement simple and complex parallel algorithms in Julia.
- -using Printf
-function answer_checker(answer,solution)
- if answer == solution
- "🥳 Well done! "
- else
- "It's not correct. Keep trying! 💪"
- end |> println
-end
-q_1_check(answer) = answer_checker(answer,"a")
-q_2_check(answer) = answer_checker(answer,"b")
-First of all, we need several processes in order to run parallel algorithms in parallel. In this section, we discuss different ways to create new processes in Julia.
- -The simplest way of creating processes for parallel computing is to add them locally in the current Julia session. This is done by using the following commands.
- -using Distributed
-addprocs(3)
-Last cell created 3 new Julia processes. By default, they run locally in the same computer as the current Julia session, using multiple cores if possible. However, it is also possible to start the new processes in other machines as long as they are interconnected (more details on this later).
When adding the new processes, you can imagine that 3 new Julia REPLs have started under the hood (see figure below). The main point of the Distributed module is to provide a way of coordinating all these Julia processes to run code in parallel. It is important to note that each process runs in a separate Julia instance. This means that each process has its own memory space and therefore they do not share memory. This results in distributed-memory parallelism, and allows one to run processes on different machines.
The following functions provide basic information about the underlying processes. If more than one process is available, the first process is called the main or master and the others the workers. If only a single process is available, it is the master and the first worker simultaneously.
- -procs()
-workers()
-nprocs()
-nworkers()
-myid()
-@everywhere println(myid())
-In previous cell, we have used the macro @everywhere that evaluates the given code on all processes. As a result, each process will print its own process id.
For large parallel computations, one typically needs to use different computers in parallel. Function addprocs also provides a low-level method to start workers in other machines. The next code example would create 3 workers in server1 and 4 new workers in server2 (see figure below). Under the hood, Julia connects via ssh to the other machines and starts the new processes there. For this to work, the local computer and the remote servers need to be properly configured (see the Julia manual for details).
using Distributed
-machines = [("user@server1",3),("user@server2",4)]
-addprocs(machines)
The previous way of starting workers in other machines is very low-level. Happily, there is a Julia package called ClusterManagers.jl that helps to create workers remotely in a number of common scenarios. For instance, when running the following code from the login node of a computer cluster, it will submit a job to the cluster queue allocating 128 threads. A worker will be created for each one of these threads. If each compute node has 64 cores, 2 compute nodes will be used to host the 128 workers (see below).
-using Distributed
-using ClusterManagers
-addprocs(SlurmManager(128), partition="debug", t="00:5:00")
-We have added new processes to our Julia session. Let's start using them!
remotecall¶The most basic thing we can do with a remote process is to execute a given function on it. This is done using the function remotecall. To make clear how local and remote executions compare, let's call a function locally and then remotely. The next cell uses the function ones to create a matrix locally.
a = ones(2,3)
-The next cell does the same operation, but remotely on process 2. Note that remotecall takes the function we want to execute remotely, the process id where we want to execute it and, finally, the function arguments.
proc = 2
-ftr = remotecall(ones,proc,2,3)
-Note that remotecall does not return the result of the underlying function, but a Future. This object represents a reference to a task running on the remote process. To move a copy of the result to the current process we can use fetch.
fetch(ftr)
remotecall is asynchronous¶It is important to note that remotecall does not wait for the remote process to finish. It returns immediately. This can be checked by remotely calling the following function, which sleeps for 10 seconds and then generates a matrix.
fun = (m,n) -> (sleep(10); ones(m,n))
When running the next cell, it will return immediately, even though the remote process will sleep for 10 seconds. We can even run code in parallel. To try this, execute the second next cell while the remote call is running on the worker.
- -proc = 2
-ftr = remotecall(fun,proc,2,3)
-1+1
-However, when fetching the result, the current process blocks waiting until the result is available in the remote process and arrives to its destination.
- -fetch(ftr)
@spawnat¶You have probably realized that in order to use remotecall we have written auxiliary anonymous functions. They are needed to wrap the code we want to execute remotely. Writing these functions can be tedious. Happily, the macro @spawnat generates an auxiliary function from the given block of code and calls remotecall for us. For instance, the two following cells are equivalent.
@spawnat proc ones(2,3)
-fun = () -> ones(2,3)
-remotecall(fun,proc)
@async vs @spawnat¶The relation between @async and @spawnat is close. From the user perspective, they work almost in the same way. However, @async generates a task that runs asynchronously in the current process, whereas @spawnat executes a task in a remote process in parallel. In both cases, the result is obtained using fetch.
tsk = @async begin
- sleep(3)
- zeros(2)
-end
-fetch(tsk)
-ftr = @spawnat :any begin
- sleep(3)
- zeros(2)
-end
-fetch(ftr)
-@fetchfrom¶Macro @fetchfrom is the blocking version of @spawnat. It blocks and returns the corresponding result instead of a Future object.
a = @fetchfrom proc begin
- sleep(3)
- zeros(2)
-end
Data movement is a crucial part of distributed-memory computations and it is usually one of their main performance bottlenecks. Being aware of the data we are moving when using functions such as remotecall is important to write efficient distributed algorithms in Julia. Julia also provides a special type of channel, called a remote channel, to send and receive data between processes.
remotecall / fetch¶When usig remotecall we send to the remote process a function and its arguments. In this example, we send function name + and matrices a and b to proc 4. When fetching the result we receive a copy of the matrix from proc 4.
proc = 4
-a = rand(10,10)
-b = rand(10,10)
-ftr = remotecall(+,proc,a,b)
-fetch(ftr);
Be aware that data movements can be implicit. This usually happens when we remotely execute functions that capture variables. In the following example, we are also sending matrices a and b to proc 4, even though they do not appear as arguments in the remote call. These variables are captured by the anonymous function and will be sent to proc 4.
proc = 4
-a = rand(10,10)
-b = rand(10,10)
-fun = () -> a+b
-ftr = remotecall(fun,proc)
-fetch(ftr);
Another way of moving data between processes is to use remote channels. Their usage is very similar to conventional channels for moving data between tasks, but there are some important differences. In the next cell, we create a remote channel. Process 4 puts several values into it and closes the channel. As for conventional channels, calls to put! are blocking, but the next cell does not block the master process, since the call to put! runs asynchronously on process 4.
fun = ()->Channel{Int}()
-chnl = RemoteChannel(fun)
-@spawnat 4 begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end;
We can take values from the remote channel from any process using take!. Run the next cell several times. The sixth time it should raise an error since the channel was closed.
take!(chnl)
Just like conventional channels, remote channels can be buffered. The buffer is stored in the process that owns the remote channel. By default this is the process that creates the remote channel, but it can be a different one. For instance, process 3 will be the owner in the following example.
- -buffer_size = 2
-owner = 3
-fun = ()->Channel{Int}(buffer_size)
-chnl = RemoteChannel(fun,owner)
-@spawnat 4 begin
- println("start")
- for i in 1:5
- put!(chnl,i)
- println("I have put $i")
- end
- close(chnl)
- println("stop")
-end;
-Note that since the channel is buffered, worker 4 can start putting values into it before any call to take!. Run next cell several times until the channel is closed.
take!(chnl)
-One main difference with respect to conventional channels is that remote channels cannot be iterated. Let's repeat the example above.
- -fun = ()->Channel{Int}()
-chnl = RemoteChannel(fun)
-@spawnat 4 begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end;
Now, try to iterate over the channel in a for loop. It will result in an error since remote channels are not iterable.
- -for j in chnl
- @show j
-end
If we want to take values from a remote channel and stop automatically when the channel is closed, we can combine a while loop and a try-catch statement. This works since take! raises an error if the channel is closed, which executes the catch block and breaks the loop.
while true
- try
- j = take!(chnl)
- @show j
- catch
- break
- end
-end
-a = rand(Int,4,4)
-proc = 4
-@fetchfrom proc sum(a^2)
-How many integers are transferred between the master and the worker (including both directions)?
-a) 17
-b) 32
-c) 16^2
-d) 65
-
-
-answer = "x" #Replace x with a, b, c, or d
-q_1_check(answer)
-a = rand(Int,4,4)
-proc = 4
-@fetchfrom proc sum(a[2,2]^2)
-How many integers are transferred between the master and the worker (including both directions)?
-a) 2
-b) 17
-c) 5
-d) 32
-
-
-answer = "x" #Replace x with a, b, c, or d
-q_2_check(answer)
-a = zeros(Int,3)
-proc = 3
-@sync @spawnat proc a[2] = 2
-x = a[2]
-x
-What will be the value of x?
a = zeros(Int,3)
-proc = myid()
-@sync @spawnat proc a[2] = 2
-x = a[2]
-x
-In particular, this means that each process can load different functions or packages. In consequence, it is important to make sure that the code we run is defined in the corresponding process.
-This is a very common pitfall when running parallel code. If we define a function in a process, it is not automatically available in the other processes. This is illustrated in the next example. The remote call in the last line of the next cell will fail since the function sleep_ones is only defined in the local process.
function sleep_ones(m,n)
- sleep(4)
- ones(m,n)
-end
-proc = 3
-remotecall_fetch(sleep_ones,proc,3,4)
-To fix this, we can define the function on all processes with the @everywhere macro.
@everywhere function sleep_ones(m,n)
- sleep(4)
- ones(m,n)
-end
-proc = 3
-remotecall_fetch(sleep_ones,proc,3,4)
-If a function has a name, Julia only sends the function name to the corresponding process. Then, Julia looks for the corresponding function code in the remote process and executes it. This is why the function needs to be defined also in the remote process. However, if a function is anonymous, Julia needs to send the complete function definition to the remote process. This is why anonymous functions do not need to be defined with the macro @everywhere to work in a remote call.
fun = (m,n) -> (sleep(4);ones(m,n))
-proc = 3
-remotecall_fetch(fun,proc,3,4)
-When using a package in a process, it is not available in the other ones. For instance, if we load the LinearAlgebra package in the current process and use one of its exported functions in another process, we will get an error.
using LinearAlgebra
-@fetchfrom 3 norm([1,2,3])
-To fix this, we can load the package on all processors with the @everywhere macro.
@everywhere using LinearAlgebra
-@fetchfrom 3 norm([1,2,3])
-This is another very common source of errors. You can check that if you activate the current directory, this will have no effect in the other processes.
- -] activate .
-We have activated the current folder. Now let's check what the active project is in another process, say process 2. You will see that process 2 is probably still using the global package environment.
- -@everywhere using Pkg
-@spawnat 2 Pkg.status();
-To fix this, you need to activate the current directory on all processes.
- -@everywhere Pkg.activate(".")
-@spawnat 2 Pkg.status();
-Apart from the low-level parallel routines we have seen so far, Julia also provides much simpler ways of parallelizing loops and maps.
-The macro @distributed is used when we want to run very large for loops made of small, independent iterations. To illustrate this, let's consider again the function that computes $\pi$ with the Leibniz formula.
- -function compute_π(n)
- s = 1.0
- for i in 1:n
- s += (isodd(i) ? -1 : 1) / (i*2+1)
- end
- 4*s
-end
-Parallelizing this function might require some work with low-level functions like remotecall, but it is trivial using the macro @distributed. This macro runs the for loop using the available processes and optionally reduces the result using a given reduction function (+ in this case).
function compute_π_dist(n)
- s = 1.0
- r = @distributed (+) for i in 1:n
- (isodd(i) ? -1 : 1) / (i*2+1)
- end
- 4*(s+r)
-end
-Run next cell to measure the performance of the serial function for a large value of n. Run it at least 2 times to get rid of compilation times.
@time compute_π(4_000_000_000)
-Run next cell to measure the performance of the parallel function.
- -@time compute_π_dist(4_000_000_000)
-pmap¶This function is used when we want to evaluate a very expensive function a small number of times and we want to distribute these evaluations over the available processes. To illustrate the usage of pmap, consider the following example. The next cell generates sixty 300x300 matrices. The goal is to compute the singular value decomposition of all of them. This operation is known to be expensive for large matrices. Thus, this is a perfect scenario for pmap.
a = [ rand(300,300) for i in 1:60];
-First, let's measure the serial performance.
- -using LinearAlgebra
-@time svd.(a);
-If we use pmap instead of broadcast, the different calls to svd will be distributed over the available processes.
@time pmap(svd,a);
-We have seen the basics of distributed computing in Julia. The programming model is essentially an extension of tasks and channels to parallel computations on multiple machines. The low-level functions are remotecall and RemoteChannel, but there are other functions and macros like pmap and @distributed that simplify the implementation of parallel algorithms.
-In this notebook, we will learn the basics of distributed computing in Julia. In particular, we will focus on the Distributed module available in the Julia standard library. The main topics we are going to cover are: how to create Julia processes (locally and on remote machines), how to execute code remotely with remotecall and related macros, how data moves between processes (including remote channels), and higher-level tools such as @distributed and pmap.
-With this knowledge you will be able to implement simple and complex parallel algorithms in Julia.
- -First of all, we need several processes in order to run parallel algorithms in parallel. In this section, we discuss different ways to create new processes in Julia.
- -The simplest way of creating processes for parallel computing is to add them locally in the current Julia session. This is done by using the following commands.
- -using Distributed
-addprocs(3)
-Last cell created 3 new Julia processes. By default, they run locally in the same computer as the current Julia session, using multiple cores if possible. However, it is also possible to start the new processes in other machines as long as they are interconnected (more details on this later).
-When adding the new processes, you can imagine that 3 new Julia REPLs have started under the hood (see figure below). The main point of the Distributed module is to provide a way of coordinating all these Julia processes to run code in parallel. It is important to note that each process runs in a separate Julia instance. This means that each process has its own memory space and therefore they do not share memory. This results in distributed-memory parallelism, and allows one to run processes on different machines.
-The following functions provide basic information about the underlying processes. If more than one process is available, the first process is called the main or master process and the others the workers. If only a single process is available, it is the master and the first worker simultaneously.
- -procs()
-workers()
-nprocs()
-nworkers()
-myid()
-@everywhere println(myid())
-In previous cell, we have used the macro @everywhere that evaluates the given code on all processes. As a result, each process will print its own process id.
For large parallel computations, one typically needs to use different computers in parallel. Function addprocs also provides a low-level method to start workers in other machines. The next code example would create 3 workers in server1 and 4 new workers in server2 (see figure below). Under the hood, Julia connects via ssh to the other machines and starts the new processes there. In order for this to work, the local computer and the remote servers need to be properly configured (see the Julia manual for details).
using Distributed
-machines = [("user@server1",3),("user@server2",4)]
-addprocs(machines)
-The previous way of starting workers in other machines is very low-level. Happily, there is a Julia package called ClusterManagers.jl that helps to create workers remotely in a number of common scenarios. For instance, when running the following code from the login node in a computer cluster, it will submit a job to the cluster queue allocating 128 threads. A worker will be generated for each one of these threads. If each compute node has 64 cores, 2 compute nodes will be used to host the 128 workers (see below).
-using Distributed
-using ClusterManagers
-addprocs(SlurmManager(128), partition="debug", t="00:5:00")
-We have added new processes to our Julia session. Let's start using them!
- -remotecall¶The most basic thing we can do with a remote processor is to execute a given function on it. This is done by using function remotecall. To make clear how local and remote executions compare, let's call a function locally and then remotely. Next cell uses function ones to create a matrix locally.
a = ones(2,3)
-The next cell does the same operation, but remotely on process 2. Note that remotecall takes the function we want to execute remotely, the process id where we want to execute it and, finally, the function arguments.
proc = 2
-ftr = remotecall(ones,proc,2,3)
-Note that remotecall does not return the result of the underlying function, but a Future. This object represents a reference to a task running on the remote process. To move a copy of the result to the current process we can use fetch.
fetch(ftr)
-remotecall is asynchronous¶It is important to note that remotecall does not wait for the remote process to finish. It returns immediately. This can be checked by remotely calling the following function, which sleeps for 10 seconds and then generates a matrix.
fun = (m,n) -> (sleep(10); ones(m,n))
-When running the next cell, it will return immediately, even though the remote process will sleep for 10 seconds. We can even run code in parallel. To try this, execute the cell after the next one while the remote call is still running in the worker.
- -proc = 2
-ftr = remotecall(fun,proc,2,3)
-1+1
-However, when fetching the result, the current process blocks waiting until the result is available in the remote process and arrives to its destination.
- -fetch(ftr)
-@spawnat¶You have probably realized that in order to use remotecall we have written auxiliary anonymous functions. They are needed to wrap the code we want to execute remotely. Writing these functions can be tedious. Happily, the macro @spawnat generates an auxiliary function from the given block of code and calls remotecall for us. For instance, the two following cells are equivalent.
@spawnat proc ones(2,3)
-fun = () -> ones(2,3)
-remotecall(fun,proc)
-@async vs @spawnat¶The relation between @async and @spawnat is obvious. From the user perspective they work almost in the same way. However, @async generates a task that runs asynchronously in the current process, whereas @spawnat executes a task in a remote process in parallel. In both cases, the result is obtained using fetch.
tsk = @async begin
- sleep(3)
- zeros(2)
-end
-fetch(tsk)
-ftr = @spawnat :any begin
- sleep(3)
- zeros(2)
-end
-fetch(ftr)
-@fetchfrom¶Macro @fetchfrom is the blocking version of @spawnat. It blocks and returns the corresponding result instead of a Future object.
a = @fetchfrom proc begin
- sleep(3)
- zeros(2)
-end
-Data movement is a crucial part in distributed-memory computations and it is usually one of its main computational
-bottlenecks. Being aware of the data we are moving when using functions such as remotecall is important to write efficient distributed algorithms in Julia. Julia also provides a special type of channel, called remote channel, to send and receive data between processes.
remotecall / fetch¶When using remotecall we send a function and its arguments to the remote process. In this example, we send the function + and the matrices a and b to proc 4. When fetching the result, we receive a copy of the result matrix from proc 4.
proc = 4
-a = rand(10,10)
-b = rand(10,10)
-ftr = remotecall(+,proc,a,b)
-fetch(ftr);
-Be aware that data movements can be implicit. This usually happens when we execute remotely functions that capture variables. In the following example, we are also sending matrices a and b to proc 4, even though they do not appear as arguments in the remote call. These variables are captured by the anonymous function and will be sent to proc 4.
proc = 4
-a = rand(10,10)
-b = rand(10,10)
-fun = () -> a+b
-ftr = remotecall(fun,proc)
-fetch(ftr);
-Another way of moving data between processes is to use remote channels. Their usage is very similar to conventional channels for moving data between tasks, but there are some important differences. In the next cell, we create a remote channel. Process 4 puts several values into it and then closes the channel. As for conventional channels, calls to put! are blocking, but the next cell does not block the master process since the calls to put! run asynchronously on process 4.
fun = ()->Channel{Int}()
-chnl = RemoteChannel(fun)
-@spawnat 4 begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end;
-We can take values from the remote channel from any process using take!. Run the next cell several times. The sixth time it should raise an error since the channel was closed.
take!(chnl)
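Since the remote channel reference can be sent to other processes, one can also take a value from a worker. A minimal sketch (an illustration, assuming the channel created above is still open):
# The closure captures chnl; worker 3 takes a value from the channel
# and a copy of that value is fetched back to the master process.
@fetchfrom 3 take!(chnl)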
-Just like conventional channels, remote channels can be buffered. The buffer is stored in the process that owns the remote channel. By default this is the process that creates the remote channel, but it can be a different one. For instance, process 3 will be the owner in the following example.
- -buffer_size = 2
-owner = 3
-fun = ()->Channel{Int}(buffer_size)
-chnl = RemoteChannel(fun,owner)
-@spawnat 4 begin
- println("start")
- for i in 1:5
- put!(chnl,i)
- println("I have put $i")
- end
- close(chnl)
- println("stop")
-end;
-Note that since the channel is buffered, worker 4 can start putting values into it before any call to take!. Run next cell several times until the channel is closed.
take!(chnl)
-One main difference with respect to conventional channels is that remote channels cannot be iterated. Let's repeat the example above.
- -fun = ()->Channel{Int}()
-chnl = RemoteChannel(fun)
-@spawnat 4 begin
- for i in 1:5
- put!(chnl,i)
- end
- close(chnl)
-end;
-Now, try to iterate over the channel in a for loop. It will result in an error since remote channels are not iterable.
- -for j in chnl
- @show j
-end
-If we want to take values from a remote channel and stop automatically when the channel is closed, we can combine a while loop and a try-catch statement. This works since take! raises an error once the channel is closed, which executes the catch block and breaks the loop.
while true
- try
- j = take!(chnl)
- @show j
- catch
- break
- end
-end
-How many integers are transferred between the master and the worker (including both directions)?
- -a = rand(Int,4,4)
-proc = 4
-@fetchfrom proc sum(a^2)
-How many integers are transferred between the master and the worker (including both directions)?
- -a = rand(Int,4,4)
-proc = 4
-@fetchfrom proc sum(a[2,2]^2)
-What will be the value of x?
a = zeros(Int,3)
-proc = 3
-@sync @spawnat proc a[2] = 2
-x = a[2]
-x
-What will be the value of x?
a = zeros(Int,3)
-proc = myid()
-@sync @spawnat proc a[2] = 2
-x = a[2]
-x
-In particular, this means that each process can load different functions or packages. In consequence, it is important to make sure that the code we run is defined in the corresponding process.
-This is a very common pitfall when running parallel code. If we define a function in a process, it is not automatically available in the other processes. This is illustrated in the next example. The remote call in the last line of the next cell will fail since the function sleep_ones is only defined in the local process.
function sleep_ones(m,n)
- sleep(4)
- ones(m,n)
-end
-proc = 3
-remotecall_fetch(sleep_ones,proc,3,4)
-To fix this, we can define the function on all processes with the @everywhere macro.
@everywhere function sleep_ones(m,n)
- sleep(4)
- ones(m,n)
-end
-proc = 3
-remotecall_fetch(sleep_ones,proc,3,4)
-If a function has a name, Julia only sends the function name to the corresponding process. Then, Julia looks for the corresponding function code in the remote process and executes it. This is why the function needs to be defined also in the remote process. However, if a function is anonymous, Julia needs to send the complete function definition to the remote process. This is why anonymous functions do not need to be defined with the macro @everywhere to work in a remote call.
fun = (m,n) -> (sleep(4);ones(m,n))
-proc = 3
-remotecall_fetch(fun,proc,3,4)
-When using a package in a process, it is not available in the other ones. For instance, if we load the LinearAlgebra package in the current process and use one of its exported functions in another process, we will get an error.
using LinearAlgebra
-@fetchfrom 3 norm([1,2,3])
-To fix this, we can load the package on all processors with the @everywhere macro.
@everywhere using LinearAlgebra
-@fetchfrom 3 norm([1,2,3])
-This is another very common source of errors. You can check that if you activate the current directory, this will have no effect in the other processes.
- -] activate .
-We have activated the current folder. Now let's check what the active project is in another process, say process 2. You will see that process 2 is probably still using the global package environment.
- -@everywhere using Pkg
-@spawnat 2 Pkg.status();
-To fix this, you need to activate the current directory on all processes.
- -@everywhere Pkg.activate(".")
-@spawnat 2 Pkg.status();
-Apart from the low-level parallel routines we have seen so far, Julia also provides much simpler ways of parallelizing loops and maps.
-The macro @distributed is used when we want to run very large for loops made of small, independent iterations. To illustrate this, let's consider again the function that computes $\pi$ with the Leibniz formula.
- -function compute_π(n)
- s = 1.0
- for i in 1:n
- s += (isodd(i) ? -1 : 1) / (i*2+1)
- end
- 4*s
-end
-Parallelizing this function might require some work with low-level functions like remotecall, but it is trivial using the macro @distributed. This macro runs the for loop using the available processes and optionally reduces the result using a given reduction function (+ in this case).
function compute_π_dist(n)
- s = 1.0
- r = @distributed (+) for i in 1:n
- (isodd(i) ? -1 : 1) / (i*2+1)
- end
- 4*(s+r)
-end
-Run next cell to measure the performance of the serial function for a large value of n. Run it at least 2 times to get rid of compilation times.
@time compute_π(4_000_000_000)
-Run next cell to measure the performance of the parallel function.
- -@time compute_π_dist(4_000_000_000)
-pmap¶This function is used when we want to evaluate a very expensive function a small number of times and we want to distribute these evaluations over the available processes. To illustrate the usage of pmap, consider the following example. The next cell generates sixty 300x300 matrices. The goal is to compute the singular value decomposition of all of them. This operation is known to be expensive for large matrices. Thus, this is a perfect scenario for pmap.
a = [ rand(300,300) for i in 1:60];
-First, let's measure the serial performance.
- -using LinearAlgebra
-@time svd.(a);
-If we use pmap instead of broadcast, the different calls to svd will be distributed over the available processes.
@time pmap(svd,a);
-We have seen the basics of distributed computing in Julia. The programming model is essentially an extension of tasks and channels to parallel computations on multiple machines. The low-level functions are remotecall and RemoteChannel, but there are other functions and macros like pmap and @distributed that simplify the implementation of parallel algorithms.
-With this notebook, you will learn how to work with Julia in Jupyter notebooks and how to measure and compare the performance of Julia code.
-We are going to use Jupyter notebooks in this and other lectures. You have probably worked with notebooks (in Python) before. If not, here are the basic concepts you need to know to follow the lessons.
-To run a Julia Jupyter notebook, open a Julia REPL and type
-julia> ]
-pkg> add IJulia
-julia> using IJulia
-julia> notebook()
-A new browser window will open. Navigate to the corresponding notebook and open it.
-To run a cell, click on a cell and press Shift + Enter. You can also use the "Run" button in the toolbar above.
1+3
-4*5
-As you can see from the output of previous cell, the value of the last line is displayed. We can suppress the output with a semicolon. Try it. Execute next cell.
- -1+3
-4*5;
-Running the two cells below in reverse order won't work (try it).
- -foo() = "Well done!"
-foo()
-This is particular to Julia notebooks. You can use package, help, and shell mode just like in the Julia REPL.
- -] add MPI
-? print
-; ls
-NB. Most of the examples below are taken from the lecture by S.G. Johnson at MIT. See here: -https://github.com/mitmath/18S096/blob/master/lectures/lecture1/Boxes-and-registers.ipynb
- -function sum_hand(a)
- s = zero(eltype(a))
- for ai in a
- s += ai
- end
- s
-end
-The Julia macro @test, which is provided by the Test package, is useful for writing (unit) tests in Julia.
using Test
-a = rand(5)
-@test sum_hand(a) ≈ sum(a)
-In Julia, the most straightforward way of measuring the computation time of a piece of code is with the macro @time.
a = rand(10^7);
-@time sum_hand(a)
-Note that @time also measures the compile time of a function if it's the first call to that function. So make sure to run @time twice on a freshly compiled function in order to get a more meaningful result.
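As a quick sketch (reusing sum_hand and the array a defined above), calling @time twice in a row makes this difference visible:
@time sum_hand(a) # may include compilation time if this method was not compiled yet
@time sum_hand(a) # subsequent calls report only the runtime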
Apart from getting rid of compilation time, one typically wants to measure the runtime several times and compute some statistics. To do this, we can call our code in a for loop and gather the runtimes using the Julia macro @elapsed. This measures the runtime of an expression in seconds, just as the @time macro does, except that @elapsed discards the result of the computation and returns the elapsed time instead.
@elapsed sum_hand(a)
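For instance, a minimal sketch of gathering several measurements and summarizing them (assuming sum_hand and a are defined as above):
using Statistics
nsamples = 10
runtimes = [ @elapsed sum_hand(a) for _ in 1:nsamples ]
println("min = ", minimum(runtimes), " s, mean = ", mean(runtimes), " s")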
-The BenchmarkTools extension package provides useful macros for sampling runtimes automatically.
using BenchmarkTools
-First of all, the @benchmark macro runs the code multiple times and gives out a lot of details: the minimum and maximum time, mean time, median time, number of samples taken, memory allocations, etc.
bch_sum_hand = @benchmark sum_hand($a)
-For quick sanity checks, one can use the @btime macro, which is a convenience wrapper around @benchmark. It returns only the minimum execution time and memory allocations.
@btime sum_hand($a)
-Similar to the @elapsed macro, BenchmarkTools' @belapsed discards the return value of the function and instead returns the minimum runtime in seconds.
@belapsed sum_hand($a)
-As opposed to @time and @elapsed, @btime and @belapsed run the code several times and return the minimum runtime, thus eliminating possible compilation times from the measurement.
bch_sum = @benchmark sum($a)
-using PyCall
-py"""
-def sum_py_hand(A):
- s = 0.0
- for a in A:
- s += a
- return s
-"""
-sum_py_hand = py"sum_py_hand"
-@test sum(a) ≈ sum_py_hand(a)
-bch_sum_py_hand = @benchmark sum_py_hand($a)
-using Conda
-numpy = pyimport("numpy")
-sum_numpy = numpy["sum"]
-@test sum_numpy(a) ≈ sum(a)
-bch_sum_numpy = @benchmark sum_numpy($a)
-timings = [bch_sum_hand,bch_sum,bch_sum_py_hand,bch_sum_numpy]
-methods = ["sum_hand","sum","sum_py_hand","sum_numpy"]
-using DataFrames
-df = DataFrame(method=methods,time=timings)
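If a plain numeric comparison is preferred, one possible variation (not part of the original cells) is to extract the minimum time of each benchmark in milliseconds:
# minimum(t) returns a TrialEstimate; its time field is given in nanoseconds
df_min = DataFrame(method=methods, min_time_ms=[minimum(t).time/1e6 for t in timings])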
-# ✍️ Exercise 3
-function sum_hand_fast(a)
- s = 0.0
- @simd for ai in a
- s += ai
- end
- s
-end
-@test sum_hand_fast(a) ≈ sum(a)
-@benchmark sum_hand_fast($a)
-We will look into the third point in a later section of this course.
- -function sum_hand(a)
- s = 0.0
- for ai in a
- s += ai
- end
- s
-end
-using Statistics
-
-a = rand(10^7)
-num_it = 15
-runtimes = zeros(num_it)
-for i in 1:num_it
- runtimes[i] = @elapsed sum_hand(a)
-end
-@show mean(runtimes)
-@show std(runtimes)
-@show minimum(runtimes)
-@show maximum(runtimes);
-# ✍️ Exercise 3
-function sum_hand_fast(a)
- s = 0.0
- @simd for ai in a
- s += ai
- end
- s
-end
-In this notebook, we will parallelize the (dense) matrix-matrix product and study the data dependencies, communication overhead, and speedup of several distributed algorithms.
-] add BenchmarkTools
-using Distributed
-using BenchmarkTools
-using Printf
-if procs() == workers()
- addprocs(4)
-end
-function answer_checker(answer,solution)
- if answer == solution
- "🥳 Well done! "
- else
- "It's not correct. Keep trying! 💪"
- end |> println
-end
-alg_1_deps_check(answer) = answer_checker(answer,"b")
-alg_1_comm_overhead_check(answer) = answer_checker(answer, "c")
-alg_1_comp_check(answer) = answer_checker(answer, "a")
-alg_2_complex_check(answer) = answer_checker(answer, "b")
-alg_2_deps_check(answer) = answer_checker(answer,"d")
-alg_3_deps_check(answer) = answer_checker(answer, "c")
-alg_3_complex_check(answer) = answer_checker(answer, "d")
-Let us consider the (dense) matrix-matrix product C=A*B.
We want to multiply A and B in parallel using several processes, assuming that A, B, and C are initially stored in the master process and that the result must also end up in C on the master process.
-To develop and study the parallel implementation, we will follow these steps: start from a sequential implementation, identify the data dependencies of the different parallelization strategies, implement and test the parallel algorithms, and measure their communication overhead and speedup.
-We start by considering the (naive) sequential algorithm:
- -@everywhere function matmul_seq!(C,A,B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- z = zero(eltype(C))
- for j in 1:n
- for i in 1:m
- Cij = z
- for k in 1:l
- @inbounds Cij += A[i,k]*B[k,j]
- end
- C[i,j] = Cij
- end
- end
- C
-end
-Run the following cell to compare the performance of our hand-written function with respect to the built-in function mul!
using LinearAlgebra
-N = 1000
-A = rand(N,N)
-B = rand(N,N)
-C = rand(N,N)
-@btime matmul_seq!(C,A,B)
-@btime mul!(C,A,B);
-Look at the three nested loops in the sequential implementation:
-for j in 1:n
- for i in 1:m
- Cij = z
- for k in 1:l
- @inbounds Cij += A[i,k]*B[k,j]
- end
- C[i,j] = Cij
- end
-end
-The loops over i and j are trivially parallelizable. The loop over k can be parallelized, but it requires a reduction. All the entries of matrix C can potentially be computed in parallel, but is it the most efficient solution to compute all these entries in parallel in a distributed system? To find this out, we will consider different parallelization strategies with increasing granularity: each worker computes a single entry of C (algorithm 1), a full row of C (algorithm 2), or a block of consecutive rows of C (algorithm 3).
-Moving data through the network is expensive and reducing data movement is one of the key points in a distributed algorithm. To this end, we determine which is the minimum data needed by a worker to perform its computations.
-In algorithm 1, each worker computes only an entry of the result matrix C. Which is the minimum data that the worker needs to receive in order to compute the entry C[i,j]?
- -a) column A[:,i] and row B[j,:]
-b) row A[i,:] and column B[:,j]
-c) the whole matrices A and B
-d) row A[i,:] and the whole matrix B
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_1_deps_check(answer)
-Taking into account the data dependencies, the parallel algorithm 1 can be efficiently implemented following these steps from the worker perspective: receive the corresponding row of A and column of B from the master, compute the dot product of both vectors, and send the resulting entry of C back to the master.
-A possible implementation of this algorithm in Julia is as follows:
- -function matmul_dist_1!(C, A, B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- z = zero(eltype(C))
- @assert nworkers() == m*n
- iw = 0
- @sync for j in 1:n
- for i in 1:m
- Ai = A[i,:]
- Bj = B[:,j]
- iw += 1
- w = workers()[iw]
- ftr = @spawnat w begin
- Cij = z
- for k in 1:l
- @inbounds Cij += Ai[k]*Bj[k]
- end
- Cij
- end
- @async C[i,j] = fetch(ftr)
- end
- end
- C
-end
-You can execute the following cells to test this implementation.
- -using Test
-N = 2
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-@test matmul_dist_1!(C,A,B) ≈ A*B
-Let us study the performance of this algorithm. To this end, we will analyze if algorithm 1 is able to achieve the optimal parallel speedup. The parallel speedup on $P$ processes is defined as
-$$S_P = \frac{T_1}{T_P},$$ where $T_1$ denotes the runtime of the sequential algorithm on one node and $T_P$ denotes the runtime of the parallel algorithm on $P$ processes. If we run an optimal parallel algorithm on $P$ processes, we expect it to run $P$ times faster than the sequential implementation. I.e., the optimal speedup of a parallel algorithm on $P$ processes is equal to $P$:
-$$S^{*}_P = P.$$ The ratio of the actual speedup over the optimal one is called the parallel efficiency
-$$E_P = \frac{S_P}{S^{*}_P} = \frac{T_1/T_P}{P}.$$ The following cell measures the speedup of parallel algorithm 1. Do we achieve the optimal speedup?
- -N = 2
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-T1 = @belapsed matmul_seq!(C,A,B)
-C = similar(A)
-TP = @belapsed matmul_dist_1!(C,A,B)
-P = nworkers()
-println("Speedup = ", T1/TP)
-println("Optimal speedup = ", P)
-println("Efficiency = ", 100*(T1/TP)/P, "%")
-Since communication is usually the main bottleneck in a distributed algorithm, we want to reduce the amount of communication per unit of computation in a worker. Let us compute the (theoretical) communication overhead for algorithm 1. This will help us understand why the speedup of this algorithm was so bad.
-Remember, algorithm 1 consisted of these main steps: the worker receives a row of A and a column of B from the master, computes their dot product, and sends the resulting entry of C back to the master. How many values are communicated (sent plus received) per worker?
-a) 3N
-b) 2N + 2
-c) 2N + 1
-d) N² + 1
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_1_comm_overhead_check(answer)
-How much computation (floating point operations) does each worker perform?
-a) O(N)
-b) O(N²)
-c) O(N³)
-
-
-answer = "x" # replace x with a, b, or c
-alg_1_comp_check(answer)
-From these results we can conclude that each worker communicates O(N) values while performing only O(N) operations.
-In other words, the communication cost is of the same order of magnitude as the computation cost. Since communication is orders of magnitude slower than computation in real systems, the runtime in the worker will be dominated by communication. This explains why we obtained such a bad speedup.
-Let us study the next algorithm to see if we can improve the efficiency by increasing the granularity (i.e., the amount of work) of each parallel task. In parallel algorithm 2, each worker computes an entire row of C. Which is the minimum data that the worker needs to receive in order to compute the row C[i,:]?
- -a) column A[:,i] and row B[j,:]
-b) row A[i,:] and column B[:,j]
-c) the whole matrices A and B
-d) row A[i,:] and the whole matrix B
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_2_deps_check(answer)
-These are the main steps of the implementation of algorithm 2: the master sends the i-th row of A and the whole matrix B to a worker, the worker computes the i-th row of C, and the worker sends this row back to the master, which stores it in C.
-A possible implementation of this algorithm in Julia is as follows:
- -function matmul_dist_2!(C, A, B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- z = zero(eltype(C))
- @assert nworkers() == m
- iw = 0
- @sync for i in 1:m
- Ai = A[i,:]
- iw += 1
- w = workers()[iw]
- ftr = @spawnat w begin
- Ci = fill(z,l)
- for j in 1:n
- for k in 1:l
- @inbounds Ci[j] += Ai[k]*B[k,j]
- end
- end
- Ci
- end
- @async C[i,:] = fetch(ftr)
- end
- C
- end
-Test it using next cell
- -using Test
-N = 4
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-@test matmul_dist_2!(C,A,B) ≈ A*B
-N = 4
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-T1 = @belapsed matmul_seq!(C,A,B)
-C = similar(A)
-TP = @belapsed matmul_dist_2!(C,A,B)
-P = nworkers()
-println("Speedup = ", T1/TP)
-println("Optimal speedup = ", P)
-println("Efficiency = ", 100*(T1/TP)/P, "%")
-The speedup is still far from the optimal one. Let us study the communication overhead for this algorithm. Remember, algorithm 2 consists of these main steps: the worker receives a row of A and the whole matrix B, computes a full row of C, and sends this row back to the master. What are the asymptotic communication and computation costs per worker?
-a) O(N) communication and O(N^2) computation
-b) O(N^2) communication and O(N^2) computation
-c) O(N^3) communication and O(N^3) computation
-d) O(N) communication and O(N) computation
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_2_complex_check(answer)
-The communication and computation cost are still of the same order of magnitude even though we have increased the grain size.
-Let us increase the granularity of the parallel tasks even more by computing several rows of C in each worker. Each worker computes N/P consecutive rows of C, where P is the number of workers. Which is the minimum data that the worker needs to receive in order to compute its block of rows of C?
- -a) A[rows,:] and B[:,rows]
-b) the whole matrix A and B[:,rows]
-c) A[rows,:] and the whole matrix B
-d) the whole matrices A and B
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_3_deps_check(answer)
-These are the main steps of the implementation of algorithm 3: the master sends a block of N/P consecutive rows of A and the whole matrix B to each worker, each worker computes the corresponding rows of C, and each worker sends its block of rows back to the master, which stores it in C.
-The implementation of this variant is left as an exercise (see below).
-Let us analyze the (theoretical) communication overhead for algorithm 3. What are the asymptotic communication and computation costs per worker?
-a) O(N²) communication and O(N³) computation
-b) O(N²) communication and O(N³/P) computation
-c) O(N²) communication and O(N²) computation
-d) O(N²/P) communication and O(N³/P) computation
-
-
-answer = "x" # replace x with a, b, c, or d
-alg_3_complex_check(answer)
-In this case, the ratio between communication and computation is O(P/N). If the matrix size N is much larger than the number of workers P, then the communication overhead O(P/N) is negligible. For example, for N = 10,000 and P = 100 the ratio is of the order of 0.01. This opens the door to a scalable implementation.
-The table below compares the three parallel algorithms.
| Algorithm | Parallelism (#workers) | Communication per worker | Computation per worker | Ratio communication/computation |
|---|---|---|---|---|
| 1 | N² | 2N + 1 | N | O(1) |
| 2 | N | 2N + N² | N² | O(1) |
| 3 | P | N² + 2N²/P | N³/P | O(P/N) |
Implement algorithm 3 in the function below. For simplicity, assume that the number of rows of C is a multiple of the number of workers.
- -function matmul_dist_3!(C,A,B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- @assert mod(m,nworkers()) == 0
- # Implement here
-
- C
-end
-Use test-driven development to implement the algorithm. Use this test:
- -using Test
-P = nworkers()
-load = 100
-N = load*P
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-@test matmul_dist_3!(C,A,B) ≈ A*B
-Measure the performance of your implementation by running next cell. Do you get close to the optimal speedup?
- -P = nworkers()
-load = 100
-N = load*P
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-T1 = @belapsed matmul_seq!(C,A,B)
-C = similar(A)
-TP = @belapsed matmul_dist_3!(C,A,B)
-println("Speedup = ", T1/TP)
-println("Optimal speedup = ", P)
-println("Efficiency = ", 100*(T1/TP)/P, "%")
-The implementation of algorithm 1 is very impractical. One needs as many workers as entries in the result matrix C. For a 1000 × 1000 matrix one would need a supercomputer with one million processes! We can easily fix this problem by using fewer processes and spawning the computation of each entry on any of the available processes. See the following code:
- -function matmul_dist_1_v2!(C, A, B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- z = zero(eltype(C))
- @sync for j in 1:n
- for i in 1:m
- Ai = A[i,:]
- Bj = B[:,j]
- ftr = @spawnat :any begin
- Cij = z
- for k in 1:l
- @inbounds Cij += Ai[k]*Bj[k]
- end
- Cij
- end
- @async C[i,j] = fetch(ftr)
- end
- end
- C
-end
-With this new implementation, we can multiply matrices of arbitrary size with a fixed number of workers. Test it:
- -using Test
-N = 50
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-@test matmul_dist_1_v2!(C,A,B) ≈ A*B
-Run the next cell to check the performance of this implementation. Note that we are far from the optimal speedup. Why? To answer this question, compute the theoretical communication over computation ratio for this implementation and reason about the obtained result. Hint: the number of tasks spawned on each worker is N^2/P on average.
- -N = 100
-A = rand(N,N)
-B = rand(N,N)
-C = similar(A)
-P = nworkers()
-T1 = @belapsed matmul_seq!(C,A,B)
-C = similar(A)
-TP = @belapsed matmul_dist_1_v2!(C,A,B)
-println("Speedup = ", T1/TP)
-println("Optimal speedup = ", P)
-println("Efficiency = ", 100*(T1/TP)/P, "%")
-
-] add Distributed
-] add https://github.com/fverdugo/W2bYx4.git
-using W2bYx4
-W2bYx4.greet()
-This is an attached picture:
- -Another attached picture with markdown syntax ![]():
A picture embedded with Base64:
-This is a linked picture:
- -
-julia> 1 + 1
-1+1
-"hello.jl" is located in the current working directory of your Julia session.
-
-function matmul_dist_3!(C,A,B)
- m = size(C,1)
- n = size(C,2)
- l = size(A,2)
- @assert size(A,1) == m
- @assert size(B,2) == n
- @assert size(B,1) == l
- @assert mod(m,nworkers()) == 0
- # Implement here
- nrows_w = div(m,nworkers())
- @sync for (i,w) in enumerate(workers())
- rows_w = (1:nrows_w) .+ (i-1)*nrows_w
- Aw = A[rows_w,:]
- ftr = @spawnat w begin
- Cw = similar(Aw,nrows_w,n)
- matmul_seq!(Cw,Aw,B)
- Cw
- end
- @async C[rows_w,:] = fetch(ftr)
- end
- C
-end
MPI.jl is the Julia interface to MPI. Note that it is not an MPI library by itself. It is just a thin wrapper between MPI and Julia. To use this interface, you need an actual MPI library installed in your system, such as OpenMPI or MPICH. Julia downloads and installs an MPI library for you, but it is also possible to use an MPI library already available in your system. This is useful, e.g., when running on HPC clusters. See the documentation of MPI.jl for further details.
When using non-blocking communication, call MPI.Wait() to ensure the communication is finished before accessing the send or receive buffer.