The issues of frequent recompilation and low computational efficiency in ITensor.jl

I’m a beginner in Julia and I’m using ITensor.jl for computations in my project. However, I’ve noticed that the program runs with very low efficiency, and both CPU and memory usage are minimal. I’m not sure how to optimize it to address this issue.

using ITensors, LinearAlgebra, DelimitedFiles, Plots
using Random, Combinatorics, Distributions

function sample_pairs(rng::AbstractRNG, N::Int, m::Int, alpha::Float64)
    all_pairs = collect(combinations(1:N, 2))
    dists = [min(j-i, N-(j-i)) for (i,j) in all_pairs]
    weights = alpha == 0.0 ? ones(length(dists)) : [r^(-alpha) for r in dists]
    probs = weights ./ sum(weights)
    cat = Categorical(probs)
    idxs = rand(rng, cat, m)
    return [all_pairs[k] for k in idxs]
end

ITensors.space(::SiteType"Replica") = 6
uq = readdlm("weingarten_6_6_6_6.csv", ',')
function ITensors.op(::OpName"Transfer", ::SiteType"Replica", s1::Index, s2::Index)
    itensor(Float64, reduce(vcat, uq), s2', s1', s2, s1)
end

ITensors.state(::StateName"FUp",   ::SiteType"Replica") = [1.,1.,1.,0.,0.,1.]
ITensors.state(::StateName"FDown", ::SiteType"Replica") = [1.,0.,0.,1.,1.,1.]

N = 16
NA = 4
tmax = 15

alphas = [0.0,]
thetas = [i * π/10 for i in 2:5]

num_samples = 50

cutoff = 1e-8
maxdim = 100

rng = MersenneTwister(10086)

sites = siteinds("Replica", N)
results = []

for alpha in alphas
    for theta in thetas
        c, s = cos(theta/2), sin(theta/2)
        ITensors.state(::StateName"Theta", ::SiteType"Replica") = [c^4; fill(c^2*s^2, 4); s^4]
        avg_P = zeros(tmax+1)
        avg_Ppr = zeros(tmax+1)
        for sample_id in 1:num_samples
            psi = productMPS(Float64, sites, "Theta")
            P_t = zeros(tmax+1)
            Ppr_t = zeros(tmax+1)
            for t in 0:tmax
                bnd = productMPS(Float64, sites, n -> (n <= NA ? "FDown" : "FUp"))
                P = inner(bnd, psi)
                val = 0.0 + 0.0im
                for k in 0:NA
                    phi = 2pi*k/(NA+1)
                    ITensors.state(::StateName"FDownk", ::SiteType"Replica") = [1.0, 0.0, 0.0, exp(1im*phi), exp(-1im*phi), 1.0]
                    bndk = productMPS(ComplexF64, sites, n -> (n <= NA ? "FDownk" : "FUp"))
                    val += inner(bndk, psi)
                end
                Ppr = abs(val)/(NA+1)
                P_t[t+1] = P
                Ppr_t[t+1] = Ppr
                pairs = sample_pairs(rng, N, N, alpha)
                opslist = [("Transfer", i, j) for (i, j) in pairs]
                mpo = ops(opslist, sites)
                psi = apply(mpo, psi; cutoff=cutoff, maxdim=maxdim)
            end
            avg_P .+= P_t
            avg_Ppr .+= Ppr_t
        end
        avg_P ./= num_samples
        avg_Ppr ./= num_samples
        delta_S2 = -log.(avg_Ppr ./ avg_P)
        for t in 0:tmax
            push!(results, (alpha, theta, t, delta_S2[t+1]))
        end
    end
end

open("results.csv", "w") do io
    println(io, "alpha,theta,t,delta_S2")
    for (alpha, theta, t, v) in results
        println(io, string(alpha, ",", theta, ",", t, ",", v))

end
end

The .csv file: Richard/weingarten_6_6_6_6.csv at main · Veneme/Richard · GitHub

I would suggest looking into profiling your code to understand what is taking the largest amount of time and reduce the question of your biggest slowdown into a minimal example.
Another thing to keep in mind is that for loops (e.g. for alpha in alphas and for theta in thetas) will not automatically parallelize.

The last technical comment is that you can also pass kwargs to states, for example

ITensors.state(::StateName"FUp",   ::SiteType"Replica"; phi) = [1.,1.,1.,0.,0.,1.] #phi ignored but needed
ITensors.state(::StateName"FDownk", ::SiteType"Replica"; phi) = [1.0, 0.0, 0.0, exp(1im*phi), exp(-1im*phi), 1.0]
# ...
# ex:
bndk = MPS([state(sites[n],(n ≤ NA ? "FDownk" : "FUp"); phi=2pi*k/(NA+1)) for n=1:N])

and productMPS has been replaced with just MPS