# Short-form (one-line) function definition: return `x + y`.
add(x, y) = x + y
add(1, 2)  # returns 3
Julia tutorial @ SC25
UCL
2025-11-16
https://juliaparallel.org/julia-hpc-tutorial-sc25/
Short form for one-line functions:
Long form for more complex functions:
-1
julia> myfun(1, 2.0)
ERROR: MethodError: no method matching myfun(::Int64, ::Float64)
The function `myfun` exists, but no method is defined for this combination of argument types.
Closest candidates are:
myfun(::AbstractFloat, ::AbstractFloat)
@ Main REPL[2]:1
myfun(::Integer, ::Integer)
@ Main REPL[1]:1
Stacktrace:
[1] top-level scope
@ REPL[3]:1
Type annotations in signatures are only used for dispatch, not performance. Multiple dispatch enables composability:
# Rock-paper-scissors via multiple dispatch: each shape is an empty struct
# subtyping `Shape`, and the outcome is chosen purely by the argument types.
abstract type Shape end
struct Rock <: Shape end
struct Paper <: Shape end
struct Scissors <: Shape end

# One method per distinct pairing, each written in a single argument order.
play(::Paper, ::Rock) = "Paper wins"
play(::Paper, ::Scissors) = "Scissors wins"
play(::Rock, ::Scissors) = "Rock wins"
# Both arguments are of the same shape type.
play(::T, ::T) where {T<:Shape} = "Tie, try again"
# Fallback for the reversed argument order: swap the arguments and dispatch again.
play(a::Shape, b::Shape) = play(b, a) # Commutativity

play(Paper(), Scissors())  # returns "Scissors wins"
Julia has built-in support for multi-dimensional tensors:
1×4 adjoint(::Vector{Float64}) with eltype Float64:
6.0 8.0 10.0 12.0
Performance note
Slicing makes a copy!
The LinearAlgebra stdlib exposes BLAS functionalities with a simple interface:
2×2 Matrix{Float64}:
19.0 22.0
43.0 50.0
for loops are fast, but for convenience you can use broadcasting, which enables syntactic loop fusion:
2×2 Matrix{Float64}:
3.48135 3.05936
0.500912 0.527089
@time macro
Julia comes with a simple macro @time for measuring elapsed time of sufficiently long-running functions:
BenchmarkTools.jl package
For more accurate timing of functions, packages like BenchmarkTools.jl and Chairmarks.jl provide more advanced tools:
BenchmarkTools.Trial: 10000 samples with 977 evaluations per sample.
 Range (min … max):  67.013 ns … 209.324 ns  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     69.762 ns               ┊ GC (median):    0.00%
 Time  (mean ± σ):   70.109 ns ±   3.111 ns  ┊ GC (mean ± σ):  0.00% ± 0.00%

  ▆▅▅▅▅█▅▁ ▁▁▂▃▁ ▂
  ▅▄▅▅█████████▇▆▆▇▇▅▄▃▁▄▁▁▁▁▃▃▆███████▇▆▆▇▆▃▄▅▄▁▃▃▄▄▄▅▁▅▅▅▄▅▅ █
  67 ns        Histogram: log(frequency) by time        83.7 ns <

 Memory estimate: 0 bytes, allocs estimate: 0.
julia> using Profile, LinearAlgebra
julia> N = 4_000; A = randn(N, N); B = randn(N, N); C = randn(N, N);
julia> Profile.clear()
julia> Profile.@profile mul!(C, A, B);
julia> Profile.print()
Overhead ╎ [+additional indent] Count File:Line Function
=========================================================
╎6 @Base/client.jl:561 _start()
╎ 6 @Base/client.jl:586 repl_main
╎ 6 @Base/client.jl:499 run_main_repl(interactive::Bool, quiet::Bool, banner::Symbol, history_file::Bool)
╎ 6 @Base/client.jl:478 run_std_repl(REPL::Module, quiet::Bool, banner::Symbol, history_file::Bool)
╎ 6 @REPL/src/REPL.jl:639 run_repl
╎ 6 @REPL/src/REPL.jl:653 #run_repl#50
╎ ╎ 6 @REPL/src/REPL.jl:424 start_repl_backend
╎ ╎ 6 @REPL/src/REPL.jl:427 #start_repl_backend#41
╎ ╎ 6 @REPL/src/REPL.jl:452 repl_backend_loop
╎ ╎ 6 @REPL/src/REPL.jl:330 eval_user_input
╎ ╎ 6 @REPL/src/REPL.jl:305 toplevel_eval_with_hooks
╎ ╎ ╎ 6 @REPL/src/REPL.jl:312 toplevel_eval_with_hooks
╎ ╎ ╎ 6 @REPL/src/REPL.jl:312 toplevel_eval_with_hooks
╎ ╎ ╎ 6 @REPL/src/REPL.jl:308 toplevel_eval_with_hooks
1╎ ╎ ╎ 6 @REPL/src/REPL.jl:301 __repl_entry_eval_expanded_with_loc
╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:265 mul!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64})
╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:297 mul!
╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:328 _mul!
╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:507 generic_matmatmul_wrapper!(C::Matrix{Float64}, tA::Char, tB::Char, A::Matrix{Float64}, B::Matrix{Float64}, α::Bool, β::Bool, val::Val{LinearAlgebra.BlasFlag.GEMM})
╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:527 _syrk_herk_gemm_wrapper!
╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:808 gemm_wrapper!
4╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/blas.jl:1648 gemm!(transA::Char, transB::Char, alpha::Float64, A::Matrix{Float64}, B::Matrix{Float64}, beta::Float64, C::Matrix{Float64})
╎591 @Base/task.jl:839 task_done_hook(t::Task)
╎ 591 @Base/task.jl:1199 wait()
591╎ 591 @Base/task.jl:1187 poptask(W::Base.IntrusiveLinkedListSynchronized{Task})
Total snapshots: 1182. Utilization: 50% across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.
julia> Profile.print(; C=true)
Overhead ╎ [+additional indent] Count File:Line Function
=========================================================
╎591 @juliasrc/task.c:1260 start_task
╎ 591 @juliasrc/task.c:345 jl_finish_task
╎ 591 @juliasrc/julia.h:2391 jl_apply
╎ 591 @julialib/julia/sys.so:? jfptr_task_done_hook_36828.1
╎ 591 @Base/task.jl:839 task_done_hook(t::Task)
╎ 591 @Base/task.jl:1199 wait()
╎ ╎ 591 @Base/task.jl:1187 poptask(W::Base.IntrusiveLinkedListSynchronized{Task})
╎ ╎ 591 @juliasrc/scheduler.c:523 ijl_task_get_next
╎ ╎ 591 /workspace/srcdir/libuv/src/unix/thread.c:822 uv_cond_wait
╎ ╎ 591 /lib/x86_64-linux-gnu/libc.so.6:? pthread_cond_wait
╎ ╎ 591 /lib/x86_64-linux-gnu/libc.so.6:?
╎ ╎ ╎ 591 /lib/x86_64-linux-gnu/libc.so.6:?
591╎ ╎ ╎ 591 /lib/x86_64-linux-gnu/libc.so.6:?
585╎585 @julialib/julia/libopenblas64_.so:? dgemm_kernel_HASWELL
╎6 /workspace/srcdir/glibc-2.17/csu/../sysdeps/x86_64/start.S:123
╎ 6 /lib/x86_64-linux-gnu/libc.so.6:? __libc_start_main
╎ 6 /lib/x86_64-linux-gnu/libc.so.6:?
╎ 6 /cache/build/tester-amdci4-14/julialang/julia-release-1-dot-12/cli/loader_exe.c:58 main
╎ 6 @juliasrc/jlapi.c:1139 jl_repl_entrypoint
╎ 6 @juliasrc/jlapi.c:971 true_main
╎ ╎ 6 @juliasrc/julia.h:2391 jl_apply
╎ ╎ 6 @julialib/julia/sys.so:? jfptr__start_31204.1
╎ ╎ 6 @Base/client.jl:561 _start()
╎ ╎ 6 @Base/client.jl:586 repl_main
╎ ╎ 6 @Base/client.jl:499 run_main_repl(interactive::Bool, quiet::Bool, banner::Symbol, history_file::Bool)
╎ ╎ ╎ 6 @juliasrc/builtins.c:881 jl_f_invokelatest
╎ ╎ ╎ 6 @juliasrc/julia.h:2391 jl_apply
╎ ╎ ╎ 6 @julialib/julia/sys.so:? jfptr_run_std_repl_62877.1
╎ ╎ ╎ 6 @Base/client.jl:478 run_std_repl(REPL::Module, quiet::Bool, banner::Symbol, history_file::Bool)
╎ ╎ ╎ 6 …up/julia-1.12.1+0.x64.linux.gnu/share/julia/compiled/v1.12/REPL/u0gqU_UDl4g.so:? jfptr_run_repl_18594.1
╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:639 run_repl
╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:653 #run_repl#50
╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:424 start_repl_backend
╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:427 #start_repl_backend#41
╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:452 repl_backend_loop
╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:330 eval_user_input
╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:305 toplevel_eval_with_hooks
╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:312 toplevel_eval_with_hooks
╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:312 toplevel_eval_with_hooks
╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:308 toplevel_eval_with_hooks
╎ ╎ ╎ ╎ ╎ ╎ 6 @juliasrc/builtins.c:881 jl_f_invokelatest
╎ ╎ ╎ ╎ ╎ ╎ 6 @juliasrc/julia.h:2391 jl_apply
╎ ╎ ╎ ╎ ╎ ╎ 6 @REPL/src/REPL.jl:301 __repl_entry_eval_expanded_with_loc
╎ ╎ ╎ ╎ ╎ ╎ 6 @juliasrc/toplevel.c:1035 jl_toplevel_eval_flex
╎ ╎ ╎ ╎ ╎ ╎ 6 @juliasrc/interpreter.c:898 jl_interpret_toplevel_thunk
╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @juliasrc/interpreter.c:558 eval_body
╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @juliasrc/interpreter.c:581 eval_body
╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @juliasrc/interpreter.c:243 eval_value
╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @juliasrc/interpreter.c:123 do_call
╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @juliasrc/julia.h:2391 jl_apply
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:265 mul!(C::Matrix{Float64}, A::Matrix{Float64}, B::Matrix{Float64})
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:297 mul!
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:328 _mul!
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:507 generic_matmatmul_wrapper!(C::Matrix{Float64}, tA::Char, tB::Char, A::Matrix{Float64}, B::Matrix{Float64}, α::Bool, β::Bool, val::Val{LinearAlgebra.BlasF…
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:527 _syrk_herk_gemm_wrapper!
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/matmul.jl:808 gemm_wrapper!
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @LinearAlgebra/src/blas.jl:1648 gemm!(transA::Char, transB::Char, alpha::Float64, A::Matrix{Float64}, B::Matrix{Float64}, beta::Float64, C::Matrix{Float64})
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @julialib/julia/libopenblas64_.so:? dgemm_64_
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @julialib/julia/libopenblas64_.so:? dgemm_thread_nn
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @julialib/julia/libopenblas64_.so:? gemm_driver.isra.0
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @julialib/julia/libopenblas64_.so:? exec_blas
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 5 @julialib/julia/libopenblas64_.so:? inner_thread
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @julialib/julia/libopenblas64_.so:? dgemm_beta_HASWELL
╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @julialib/julia/libopenblas64_.so:?
3╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 3 @julialib/julia/libopenblas64_.so:? dgemm_itcopy_HASWELL
1╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @julialib/julia/libopenblas64_.so:? dgemm_oncopy_HASWELL
╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @juliasrc/interpreter.c:707 eval_body
╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @juliasrc/interpreter.c:194 eval_stmt_value
╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @juliasrc/interpreter.c:243 eval_value
╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @juliasrc/interpreter.c:122 do_call
1╎ ╎ ╎ ╎ ╎ ╎ ╎ 1 @juliasrc/interpreter.c:201 eval_value
Total snapshots: 1182. Utilization: 50% across all threads and tasks. Use the `groupby` kwarg to break down by thread and/or task.
Julia is also compatible with third-party profilers:
LinuxPerf.jl
IntelITT.jl for instrumentation
NVTX.jl for instrumentation
LIKWID.jl, Extrae.jl, ScoreP.jl, and more
; Function Signature: add(Int64, Int64)
;  @ /home/runner/work/julia-hpc-tutorial-sc25/julia-hpc-tutorial-sc25/presentations/intro-julia/index.qmd:48 within `add`
define i64 @julia_add_11987(i64 signext %"x::Int64", i64 signext %"y::Int64") #0 {
top:
; ┌ @ int.jl:87 within `+`
   %0 = add i64 %"y::Int64", %"x::Int64"
   ret i64 %0
; └
}
; Function Signature: add(Float64, Float64)
;  @ /home/runner/work/julia-hpc-tutorial-sc25/julia-hpc-tutorial-sc25/presentations/intro-julia/index.qmd:48 within `add`
define double @julia_add_12045(double %"x::Float64", double %"y::Float64") #0 {
top:
; ┌ @ float.jl:495 within `+`
   %0 = fadd double %"x::Float64", %"y::Float64"
   ret double %0
; └
}
; Function Signature: axpy!(Array{Float32, 1}, Float32, Array{Float32, 1}) define nonnull ptr @"julia_axpy!_12060"(ptr noundef nonnull align 8 dereferenceable(24) %"y::Array", float %"a::Float32", ptr noundef nonnull align 8 dereferenceable(24) %"x::Array") #0 { top: %jlcallframe1 = alloca [3 x ptr], align 8 %gcframe2 = alloca [4 x ptr], align 16 call void @llvm.memset.p0.i64(ptr align 16 %gcframe2, i8 0, i64 32, i1 true) %thread_ptr = call ptr asm "movq %fs:0, $0", "=r"() #13 %tls_ppgcstack = getelementptr inbounds i8, ptr %thread_ptr, i64 -8 %tls_pgcstack = load ptr, ptr %tls_ppgcstack, align 8 store i64 8, ptr %gcframe2, align 8 %frame.prev = getelementptr inbounds ptr, ptr %gcframe2, i64 1 %task.gcstack = load ptr, ptr %tls_pgcstack, align 8 store ptr %task.gcstack, ptr %frame.prev, align 8 store ptr %gcframe2, ptr %tls_pgcstack, align 8 %"x::Array.size_ptr" = getelementptr inbounds i8, ptr %"x::Array", i64 16 %"x::Array.size.0.copyload" = load i64, ptr %"x::Array.size_ptr", align 8 %"y::Array.size_ptr" = getelementptr inbounds i8, ptr %"y::Array", i64 16 %"y::Array.size.0.copyload" = load i64, ptr %"y::Array.size_ptr", align 8 %.not = icmp eq i64 %"x::Array.size.0.copyload", %"y::Array.size.0.copyload" br i1 %.not, label %L22, label %L11 L11: ; preds = %top %ptls_field = getelementptr inbounds i8, ptr %tls_pgcstack, i64 16 %ptls_load = load ptr, ptr %ptls_field, align 8 %"box::OneTo" = call noalias nonnull align 8 dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load, i32 360, i32 16, i64 139707468585632) #9 %"box::OneTo.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo", i64 -1 store atomic i64 139707468585632, ptr %"box::OneTo.tag_addr" unordered, align 8 store i64 %"x::Array.size.0.copyload", ptr %"box::OneTo", align 8 %gc_slot_addr_1 = getelementptr inbounds ptr, ptr %gcframe2, i64 3 store ptr %"box::OneTo", ptr %gc_slot_addr_1, align 8 %ptls_load91 = load ptr, ptr %ptls_field, align 8 %"box::OneTo50" = call noalias nonnull align 8 
dereferenceable(16) ptr @ijl_gc_small_alloc(ptr %ptls_load91, i32 360, i32 16, i64 139707468585632) #9 %"box::OneTo50.tag_addr" = getelementptr inbounds i64, ptr %"box::OneTo50", i64 -1 store atomic i64 139707468585632, ptr %"box::OneTo50.tag_addr" unordered, align 8 store i64 %"y::Array.size.0.copyload", ptr %"box::OneTo50", align 8 %gc_slot_addr_0 = getelementptr inbounds ptr, ptr %gcframe2, i64 2 store ptr %"box::OneTo50", ptr %gc_slot_addr_0, align 8 store ptr @"jl_global#12065.jit", ptr %jlcallframe1, align 8 %0 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 1 store ptr %"box::OneTo", ptr %0, align 8 %1 = getelementptr inbounds ptr, ptr %jlcallframe1, i64 2 store ptr %"box::OneTo50", ptr %1, align 8 %2 = call nonnull ptr @j1_throw_eachindex_mismatch_indices_12063(ptr nonnull @"jl_global#12064.jit", ptr nonnull %jlcallframe1, i32 3) call void @llvm.trap() unreachable L22: ; preds = %top %3 = icmp slt i64 %"x::Array.size.0.copyload", 1 br i1 %3, label %L100, label %L31.preheader70 L31.preheader70: ; preds = %L22 %memoryref_data = load ptr, ptr %"x::Array", align 8 %invariant.gep = getelementptr i8, ptr %memoryref_data, i64 -4 %memoryref_data12 = load ptr, ptr %"y::Array", align 8 %invariant.gep64 = getelementptr i8, ptr %memoryref_data12, i64 -4 %min.iters.check = icmp ult i64 %"x::Array.size.0.copyload", 32 br i1 %min.iters.check, label %scalar.ph, label %vector.scevcheck vector.scevcheck: ; preds = %L31.preheader70 %4 = add nsw i64 %"x::Array.size.0.copyload", -1 %mul.result = shl i64 %4, 2 %mul.overflow = icmp ugt i64 %4, 4611686018427387903 %5 = getelementptr i8, ptr %memoryref_data12, i64 %mul.result %6 = icmp ult ptr %5, %memoryref_data12 %7 = or i1 %6, %mul.overflow br i1 %7, label %scalar.ph, label %vector.memcheck vector.memcheck: ; preds = %vector.scevcheck %8 = shl i64 %"x::Array.size.0.copyload", 2 %scevgep = getelementptr i8, ptr %memoryref_data12, i64 %8 %scevgep79 = getelementptr i8, ptr %memoryref_data, i64 %8 %bound0 = icmp ult ptr 
%memoryref_data12, %scevgep79 %bound1 = icmp ult ptr %memoryref_data, %scevgep %found.conflict = and i1 %bound0, %bound1 br i1 %found.conflict, label %scalar.ph, label %vector.ph vector.ph: ; preds = %vector.memcheck %n.vec = and i64 %"x::Array.size.0.copyload", 9223372036854775776 %ind.end = or disjoint i64 %n.vec, 1 %broadcast.splatinsert = insertelement <8 x float> poison, float %"a::Float32", i64 0 %broadcast.splat = shufflevector <8 x float> %broadcast.splatinsert, <8 x float> poison, <8 x i32> zeroinitializer br label %vector.body vector.body: ; preds = %vector.body, %vector.ph %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] %offset.idx = shl i64 %index, 2 %9 = or disjoint i64 %offset.idx, 4 %10 = getelementptr i8, ptr %invariant.gep, i64 %9 %11 = getelementptr float, ptr %10, i64 8 %12 = getelementptr float, ptr %10, i64 16 %13 = getelementptr float, ptr %10, i64 24 %wide.load = load <8 x float>, ptr %10, align 4 %wide.load80 = load <8 x float>, ptr %11, align 4 %wide.load81 = load <8 x float>, ptr %12, align 4 %wide.load82 = load <8 x float>, ptr %13, align 4 %14 = fmul contract <8 x float> %wide.load, %broadcast.splat %15 = fmul contract <8 x float> %wide.load80, %broadcast.splat %16 = fmul contract <8 x float> %wide.load81, %broadcast.splat %17 = fmul contract <8 x float> %wide.load82, %broadcast.splat %18 = getelementptr i8, ptr %invariant.gep64, i64 %9 %19 = getelementptr float, ptr %18, i64 8 %20 = getelementptr float, ptr %18, i64 16 %21 = getelementptr float, ptr %18, i64 24 %wide.load83 = load <8 x float>, ptr %18, align 4 %wide.load84 = load <8 x float>, ptr %19, align 4 %wide.load85 = load <8 x float>, ptr %20, align 4 %wide.load86 = load <8 x float>, ptr %21, align 4 %22 = fadd contract <8 x float> %14, %wide.load83 %23 = fadd contract <8 x float> %15, %wide.load84 %24 = fadd contract <8 x float> %16, %wide.load85 %25 = fadd contract <8 x float> %17, %wide.load86 store <8 x float> %22, ptr %18, align 4 store <8 x float> %23, ptr 
%19, align 4 store <8 x float> %24, ptr %20, align 4 store <8 x float> %25, ptr %21, align 4 %index.next = add nuw i64 %index, 32 %26 = icmp eq i64 %index.next, %n.vec br i1 %26, label %middle.block, label %vector.body middle.block: ; preds = %vector.body %cmp.n = icmp eq i64 %"x::Array.size.0.copyload", %n.vec br i1 %cmp.n, label %L100, label %scalar.ph scalar.ph: ; preds = %middle.block, %vector.memcheck, %vector.scevcheck, %L31.preheader70 %bc.resume.val = phi i64 [ %ind.end, %middle.block ], [ 1, %L31.preheader70 ], [ 1, %vector.scevcheck ], [ 1, %vector.memcheck ] br label %L84 L84: ; preds = %L84, %scalar.ph %value_phi4 = phi i64 [ %31, %L84 ], [ %bc.resume.val, %scalar.ph ] %memoryref_offset = shl i64 %value_phi4, 2 %gep = getelementptr i8, ptr %invariant.gep, i64 %memoryref_offset %27 = load float, ptr %gep, align 4 %28 = fmul contract float %27, %"a::Float32" %gep65 = getelementptr i8, ptr %invariant.gep64, i64 %memoryref_offset %29 = load float, ptr %gep65, align 4 %30 = fadd contract float %28, %29 store float %30, ptr %gep65, align 4 %31 = add nuw i64 %value_phi4, 1 %32 = icmp ult i64 %value_phi4, %"x::Array.size.0.copyload" br i1 %32, label %L84, label %L100 L100: ; preds = %L84, %middle.block, %L22 %frame.prev93 = load ptr, ptr %frame.prev, align 8 store ptr %frame.prev93, ptr %tls_pgcstack, align 8 ret ptr %"y::Array" }
.text
.file "add"
.section .ltext,"axl",@progbits
.globl julia_add_12118 # -- Begin function julia_add_12118
.p2align 4, 0x90
.type julia_add_12118,@function
julia_add_12118: # @julia_add_12118
; Function Signature: add(Int64, Int64)
; ┌ @ /home/runner/work/julia-hpc-tutorial-sc25/julia-hpc-tutorial-sc25/presentations/intro-julia/index.qmd:48 within `add`
# %bb.0: # %top
#DEBUG_VALUE: add:x <- $rdi
#DEBUG_VALUE: add:y <- $rsi
push rbp
mov rbp, rsp
; │┌ @ int.jl:87 within `+`
lea rax, [rdi + rsi]
pop rbp
ret
.Lfunc_end0:
.size julia_add_12118, .Lfunc_end0-julia_add_12118
; └└
# -- End function
.section ".note.GNU-stack","",@progbits
.text
.file "add"
.section .ltext,"axl",@progbits
.globl julia_add_12172 # -- Begin function julia_add_12172
.p2align 4, 0x90
.type julia_add_12172,@function
julia_add_12172: # @julia_add_12172
; Function Signature: add(Float64, Float64)
; ┌ @ /home/runner/work/julia-hpc-tutorial-sc25/julia-hpc-tutorial-sc25/presentations/intro-julia/index.qmd:48 within `add`
# %bb.0: # %top
#DEBUG_VALUE: add:x <- $xmm0
#DEBUG_VALUE: add:y <- $xmm1
push rbp
mov rbp, rsp
; │┌ @ float.jl:495 within `+`
vaddsd xmm0, xmm0, xmm1
pop rbp
ret
.Lfunc_end0:
.size julia_add_12172, .Lfunc_end0-julia_add_12172
; └└
# -- End function
.type ".L+Core.Float64#12174",@object # @"+Core.Float64#12174"
.section .lrodata,"al",@progbits
.p2align 3, 0x0
".L+Core.Float64#12174":
.quad ".L+Core.Float64#12174.jit"
.size ".L+Core.Float64#12174", 8
.set ".L+Core.Float64#12174.jit", 139707496957632
.size ".L+Core.Float64#12174.jit", 8
.section ".note.GNU-stack","",@progbits
.text
.file "axpy!"
.section .ltext,"axl",@progbits
.globl "julia_axpy!_12182" # -- Begin function julia_axpy!_12182
.p2align 4, 0x90
.type "julia_axpy!_12182",@function
"julia_axpy!_12182": # @"julia_axpy!_12182"
; Function Signature: axpy!(Array{Float32, 1}, Float32, Array{Float32, 1})
# %bb.0: # %top
#DEBUG_VALUE: axpy!:y <- [$rdi+0]
#DEBUG_VALUE: axpy!:a <- $xmm0
#DEBUG_VALUE: axpy!:x <- [$rsi+0]
push rbp
mov rbp, rsp
push r15
push r14
push r13
push r12
push rbx
sub rsp, 72
vxorps xmm1, xmm1, xmm1
vmovups ymmword ptr [rbp - 80], ymm1
#APP
mov rax, qword ptr fs:[0]
#NO_APP
lea rcx, [rbp - 80]
mov r15, qword ptr [rax - 8]
mov qword ptr [rbp - 80], 8
mov rax, qword ptr [r15]
mov qword ptr [rbp - 72], rax
mov qword ptr [r15], rcx
mov r12, qword ptr [rsi + 16]
mov r13, qword ptr [rdi + 16]
cmp r12, r13
jne .LBB0_13
# %bb.1: # %L22
test r12, r12
jle .LBB0_12
# %bb.2: # %L31.preheader70
mov rax, qword ptr [rsi]
mov rcx, qword ptr [rdi]
mov edx, 1
cmp r12, 32
jae .LBB0_3
.LBB0_10: # %scalar.ph
dec rdx
.p2align 4, 0x90
.LBB0_11: # %L84
# =>This Inner Loop Header: Depth=1
vmovss xmm1, dword ptr [rax + 4*rdx] # xmm1 = mem[0],zero,zero,zero
vfmadd213ss xmm1, xmm0, dword ptr [rcx + 4*rdx] # xmm1 = (xmm0 * xmm1) + mem
vmovss dword ptr [rcx + 4*rdx], xmm1
inc rdx
cmp rdx, r12
jb .LBB0_11
.LBB0_12: # %L100
mov rax, qword ptr [rbp - 72]
mov qword ptr [r15], rax
mov rax, rdi
add rsp, 72
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
vzeroupper
ret
.LBB0_3: # %vector.scevcheck
lea rsi, [rcx + 4*r12 - 4]
cmp rsi, rcx
jb .LBB0_10
# %bb.4: # %vector.scevcheck
lea rsi, [r12 - 1]
shr rsi, 62
jne .LBB0_10
# %bb.5: # %vector.memcheck
lea rsi, [rax + 4*r12]
cmp rcx, rsi
jae .LBB0_7
# %bb.6: # %vector.memcheck
lea rsi, [rcx + 4*r12]
cmp rax, rsi
jb .LBB0_10
.LBB0_7: # %vector.ph
movabs rsi, 9223372036854775776
vbroadcastss ymm1, xmm0
xor r8d, r8d
and rsi, r12
lea rdx, [rsi + 1]
.p2align 4, 0x90
.LBB0_8: # %vector.body
# =>This Inner Loop Header: Depth=1
vmovups ymm2, ymmword ptr [rax + 4*r8]
vmovups ymm3, ymmword ptr [rax + 4*r8 + 32]
vmovups ymm4, ymmword ptr [rax + 4*r8 + 64]
vmovups ymm5, ymmword ptr [rax + 4*r8 + 96]
vfmadd213ps ymm2, ymm1, ymmword ptr [rcx + 4*r8] # ymm2 = (ymm1 * ymm2) + mem
vfmadd213ps ymm3, ymm1, ymmword ptr [rcx + 4*r8 + 32] # ymm3 = (ymm1 * ymm3) + mem
vfmadd213ps ymm4, ymm1, ymmword ptr [rcx + 4*r8 + 64] # ymm4 = (ymm1 * ymm4) + mem
vfmadd213ps ymm5, ymm1, ymmword ptr [rcx + 4*r8 + 96] # ymm5 = (ymm1 * ymm5) + mem
vmovups ymmword ptr [rcx + 4*r8], ymm2
vmovups ymmword ptr [rcx + 4*r8 + 32], ymm3
vmovups ymmword ptr [rcx + 4*r8 + 64], ymm4
vmovups ymmword ptr [rcx + 4*r8 + 96], ymm5
add r8, 32
cmp rsi, r8
jne .LBB0_8
# %bb.9: # %middle.block
cmp r12, rsi
je .LBB0_12
jmp .LBB0_10
.LBB0_13: # %L11
mov rdi, qword ptr [r15 + 16]
movabs rbx, 139707468585632
movabs r14, offset ijl_gc_small_alloc
mov esi, 360
mov edx, 16
mov rcx, rbx
vzeroupper
call r14
mov qword ptr [rax - 8], rbx
mov qword ptr [rax], r12
mov qword ptr [rbp - 56], rax
mov esi, 360
mov edx, 16
mov r8, r14
mov r14, rax
mov rcx, rbx
mov rdi, qword ptr [r15 + 16]
call r8
movabs rcx, offset ".Ljl_global#12187.jit"
mov qword ptr [rax - 8], rbx
mov qword ptr [rax], r13
mov qword ptr [rbp - 64], rax
movabs rdi, offset ".Ljl_global#12186.jit"
lea rsi, [rbp - 104]
mov edx, 3
mov qword ptr [rbp - 104], rcx
mov qword ptr [rbp - 96], r14
mov qword ptr [rbp - 88], rax
movabs rax, offset j1_throw_eachindex_mismatch_indices_12185
call rax
ud2
.Lfunc_end0:
.size "julia_axpy!_12182", .Lfunc_end0-"julia_axpy!_12182"
# -- End function
.set ".Ljl_global#12187.jit", 139707418198400
.size ".Ljl_global#12187.jit", 8
.set ".L+Main.Base.OneTo#12188.jit", 139707468585632
.size ".L+Main.Base.OneTo#12188.jit", 8
.set ".Ljl_global#12186.jit", 139707490530096
.size ".Ljl_global#12186.jit", 8
.section ".note.GNU-stack","",@progbits
General advice for improving the performance of Julia code:
@inbounds to forcibly disable bounds checking (use with caution!). Double-check this is actually necessary with @code_llvm, and use generic abstractions (like eachindex) whenever possible
@view
0.000140 seconds (74 allocations: 3.844 KiB)
1.970171 seconds (17.68 k allocations: 899.250 KiB, 1.86% compilation time: 100% of which was recompilation)
6
using LinearAlgebra, BenchmarkTools, Base.Threads
# Print the value returned by `nthreads()`
@show nthreads()
BLAS.set_num_threads(1) # Fix number of BLAS threads
"""
    tmap(fn, itr)

Apply `fn` to every element of `itr`, running each call in its own task
started with `@spawn`, and return the fetched results.
"""
function tmap(fn, itr)
    # launch one task per element of `itr`
    spawned = map(itr) do item
        @spawn fn(item)
    end
    # wait for every task and gather what it returned
    return fetch.(spawned)
end
# Build `8 * nthreads()` random 100×100 matrices as the benchmark workload
M = [rand(100,100) for i in 1:(8 * nthreads())];
# Baseline: `svdvals` applied to every matrix with the regular (non-threaded) `map`
@btime map(svdvals, $M) samples=10 evals=3;
@btime tmap(svdvals, $M) samples=10 evals=3;
nthreads() = 4
16.458 ms (418 allocations: 4.31 MiB)
6.934 ms (587 allocations: 4.32 MiB)
for loops
using ChunkSplitters, Base.Threads, BenchmarkTools
"""
    sum_threads(fn, data; nchunks=nthreads())

Split `data` with `chunks(data; n=nchunks)`, compute `sum(fn, chunk)` for each
chunk inside a `@threads` loop, and return the sum of the per-chunk results.

The per-chunk results are stored in a `zeros(eltype(data), nchunks)` buffer,
one slot per chunk.
"""
function sum_threads(fn, data; nchunks=nthreads())
    partials = zeros(eltype(data), nchunks)
    @threads for (slot, chunk) in enumerate(chunks(data; n=nchunks))
        # each iteration writes only to its own slot
        partials[slot] = sum(fn, chunk)
    end
    return sum(partials)
end
# Benchmark input: 20 million samples drawn with `randn`
v = randn(20_000_000);
# Baseline: the built-in `sum` applied with `sin`
@btime sum(sin, $v);
@btime sum_threads(sin, $v);
257.217 ms (0 allocations: 0 bytes)
77.280 ms (27 allocations: 1.75 KiB)
Julia has a built-in package manager:
julia> # Press ] to enter the Pkg REPL mode
(@v1.12) pkg> activate MyLocalEnvironment
Activating new project at `/private/tmp/MyLocalEnvironment`
(MyLocalEnvironment) pkg> add Example
Updating registry at `/var/folders/v2/hmy3kzgj4tb3xsy8qkltxd0r0000gn/T/tmp.tmYxNNrwBP/registries/General.toml`
Resolving package versions...
Installed Example ─ v0.5.5
Updating `/private/tmp/MyLocalEnvironment/Project.toml`
[7876af07] + Example v0.5.5
Updating `/private/tmp/MyLocalEnvironment/Manifest.toml`
[7876af07] + Example v0.5.5
Precompiling packages finished.
1 dependency successfully precompiled in 1 seconds
(MyLocalEnvironment) pkg> status
Status `/private/tmp/MyLocalEnvironment/Project.toml`
[7876af07] Example v0.5.5
Virtual environments are defined by two files:
Project.toml: only top-level dependencies, with compatibility specifications (automatically generated by Pkg, but can be edited by users)
Manifest.toml: full snapshot of all packages in the environment, for reproducibility (fully machine-generated, do not touch it)
Project.toml:
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
ChunkSplitters = "ae650224-84b6-46f8-82ea-d812ca08434e"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
quarto_jll = "b7163347-bfae-5fd9-aba4-19f139889d78"
[compat]
BenchmarkTools = "1.6"
ChunkSplitters = "3.1"
quarto_jll = "1.8"
Manifest.toml:
# This file is machine-generated - editing it directly is not advised
julia_version = "1.12.1"
manifest_format = "2.0"
project_hash = "5a75629e977f736554d543dc8c97367508941dd3"
[[deps.Artifacts]]
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
version = "1.11.0"
[[deps.Base64]]
uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
version = "1.11.0"
[[deps.BenchmarkTools]]
deps = ["Compat", "JSON", "Logging", "Printf", "Profile", "Statistics", "UUIDs"]
git-tree-sha1 = "7fecfb1123b8d0232218e2da0c213004ff15358d"
uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
version = "1.6.3"
[[deps.ChunkSplitters]]
git-tree-sha1 = "63a3903063d035260f0f6eab00f517471c5dc784"
uuid = "ae650224-84b6-46f8-82ea-d812ca08434e"
version = "3.1.2"
[[deps.Compat]]
deps = ["TOML", "UUIDs"]
git-tree-sha1 = "9d8a54ce4b17aa5bdce0ea5c34bc5e7c340d16ad"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "4.18.1"
weakdeps = ["Dates", "LinearAlgebra"]
[deps.Compat.extensions]
CompatLinearAlgebraExt = "LinearAlgebra"
[[deps.CompilerSupportLibraries_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
version = "1.3.0+1"
[[deps.Dates]]
deps = ["Printf"]
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
version = "1.11.0"
[[deps.InteractiveUtils]]
deps = ["Markdown"]
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
version = "1.11.0"
[[deps.JLLWrappers]]
deps = ["Artifacts", "Preferences"]
git-tree-sha1 = "0533e564aae234aff59ab625543145446d8b6ec2"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.7.1"
[[deps.JSON]]
deps = ["Dates", "Logging", "Parsers", "PrecompileTools", "StructUtils", "UUIDs", "Unicode"]
git-tree-sha1 = "06ea418d0c95878c8f3031023951edcf25b9e0ef"
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
version = "1.2.0"
[deps.JSON.extensions]
JSONArrowExt = ["ArrowTypes"]
[deps.JSON.weakdeps]
ArrowTypes = "31f734f8-188a-4ce0-8406-c8a06bd891cd"
[[deps.JuliaSyntaxHighlighting]]
deps = ["StyledStrings"]
uuid = "ac6e5ff7-fb65-4e79-a425-ec3bc9c03011"
version = "1.12.0"
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
version = "1.11.0"
[[deps.LinearAlgebra]]
deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
version = "1.12.0"
[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
version = "1.11.0"
[[deps.Markdown]]
deps = ["Base64", "JuliaSyntaxHighlighting", "StyledStrings"]
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
version = "1.11.0"
[[deps.OpenBLAS_jll]]
deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
version = "0.3.29+0"
[[deps.Parsers]]
deps = ["Dates", "PrecompileTools", "UUIDs"]
git-tree-sha1 = "7d2f8f21da5db6a806faf7b9b292296da42b2810"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
version = "2.8.3"
[[deps.PrecompileTools]]
deps = ["Preferences"]
git-tree-sha1 = "07a921781cab75691315adc645096ed5e370cb77"
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
version = "1.3.3"
[[deps.Preferences]]
deps = ["TOML"]
git-tree-sha1 = "0f27480397253da18fe2c12a4ba4eb9eb208bf3d"
uuid = "21216c6a-2e73-6563-6e65-726566657250"
version = "1.5.0"
[[deps.Printf]]
deps = ["Unicode"]
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
version = "1.11.0"
[[deps.Profile]]
deps = ["StyledStrings"]
uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
version = "1.11.0"
[[deps.Random]]
deps = ["SHA"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
version = "1.11.0"
[[deps.SHA]]
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
version = "0.7.0"
[[deps.Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
version = "1.11.0"
[[deps.Statistics]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "ae3bb1eb3bba077cd276bc5cfc337cc65c3075c0"
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
version = "1.11.1"
[deps.Statistics.extensions]
SparseArraysExt = ["SparseArrays"]
[deps.Statistics.weakdeps]
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
[[deps.StructUtils]]
deps = ["Dates", "UUIDs"]
git-tree-sha1 = "cd47aa083c9c7bdeb7b92de26deb46d6a33163c9"
uuid = "ec057cc2-7a8d-4b58-b3b3-92acb9f63b42"
version = "2.5.1"
[deps.StructUtils.extensions]
StructUtilsMeasurementsExt = ["Measurements"]
StructUtilsTablesExt = ["Tables"]
[deps.StructUtils.weakdeps]
Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
[[deps.StyledStrings]]
uuid = "f489334b-da3d-4c2e-b8f0-e476e12c162b"
version = "1.11.0"
[[deps.TOML]]
deps = ["Dates"]
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
version = "1.0.3"
[[deps.Test]]
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
version = "1.11.0"
[[deps.UUIDs]]
deps = ["Random", "SHA"]
uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
version = "1.11.0"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
version = "1.11.0"
[[deps.libblastrampoline_jll]]
deps = ["Artifacts", "Libdl"]
uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
version = "5.15.0+0"
[[deps.quarto_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "dc4d5af2b50e030755711aad08139e8e4b96c4b8"
uuid = "b7163347-bfae-5fd9-aba4-19f139889d78"
version = "1.8.24+0"
Test Summary: | Pass  Broken  Total  Time
My tests      |    2       2      4  0.4s