JuliaMolSim · mfherbst · Jun 5, 2024 · May 24, 2024 · May 24, 2024 · May 24, 2024
diff --git a/Project.toml b/Project.toml
@@ -24,7 +24,6 @@ Libxc = "66e17ffc-8502-11e9-23b5-c9248d0eb96d"
 LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearMaps = "7a12625a-238d-50fd-b39a-03d52299707e"
-LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
 MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
 Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 Optim = "429524aa-4258-5aef-a3af-852621145aeb"
@@ -95,7 +94,6 @@ Libxc = "0.3.17"
 LineSearches = "7"
 LinearAlgebra = "1"
 LinearMaps = "3"
-LoopVectorization = "0.12"
 MPI = "0.20.13"
 Markdown = "1"
 Optim = "1"

diff --git a/docs/src/guide/self_consistent_field.jl b/docs/src/guide/self_consistent_field.jl
@@ -414,7 +414,7 @@ plot!(p, x -> ε(χ0_SiO2, x),  label="silica (SiO₂)", ls=:dashdot)
 # metal, such that `KerkerMixing` is indeed appropriate. We will thus employ it as
 # the preconditioner $P$ in the setting
 # ```math
-# rho_{n+1} = \rho_n + \alpha P^{-1} (D(V(\rho_n)) - \rho_n),
+# \rho_{n+1} = \rho_n + \alpha P^{-1} (D(V(\rho_n)) - \rho_n),
 # ```
 # In DFTK this is done by running an SCF as follows:
 # ```julia

diff --git a/docs/src/publications.md b/docs/src/publications.md
@@ -21,8 +21,8 @@ The current DFTK reference paper to cite is
 Additionally the following publications describe DFTK or one of its algorithms:
 
 - E. Cancès, M. Hassan and L. Vidal.
-  [*Modified-Operator Method for the Calculation of Band Diagrams of Crystalline Materials.*](https://arxiv.org/abs/2210.00442)
-  Mathematics of Computation (2023).
+  [*Modified-Operator Method for the Calculation of Band Diagrams of Crystalline Materials.*](https://doi.org/10.1090/mcom/3897)
+  Math. Comp. **93**, 1203 (2024).
   [ArXiv:2210.00442](https://arxiv.org/abs/2210.00442).
   ([Supplementary material and computational scripts](https://github.com/LaurentVidal95/ModifiedOp).
 
@@ -54,11 +54,16 @@ The following publications report research employing DFTK as a core component.
 Feel free to drop us a line if you want your work to be added here.
 
 - J. Cazalis.
-  [*Dirac cones for a mean-field model of graphene*](https://arxiv.org/abs/2207.09893)
-  (Submitted).
+  [*Dirac cones for a mean-field model of graphene*](https://doi.org/10.2140/paa.2024.6.129)
+  Pure and Appl. Anal., **6**, 1 (2024).
   [ArXiv:2207.09893](https://arxiv.org/abs/2207.09893).
   ([Computational script](https://github.com/JuliaMolSim/DFTK.jl/blob/f7fcc31c79436b2582ac1604d4ed8ac51a6fd3c8/examples/publications/2022_cazalis.jl)).
 
+- E. Cancès, G. Kemlin, A. Levitt.
+  [*A Priori Error Analysis of Linear and Nonlinear Periodic Schr\"{o}dinger Equations with Analytic Potentials*](https://doi.org/10.1007/s10915-023-02421-0)
+  J. Sci. Comp., **98**, 25 (2024).
+  [ArXiv:2206.04954](https://arxiv.org/abs/2206.04954).
+
 - E. Cancès, L. Garrigue, D. Gontier.
   [*A simple derivation of moiré-scale continuous models for twisted bilayer graphene*](https://doi.org/10.1103/PhysRevB.107.155403)
   Physical Review B, **107**, 155403 (2023).

diff --git a/examples/Fe_afm.pwi b/examples/Fe_afm.pwi
@@ -10,7 +10,7 @@
  /
 
  &SYSTEM
-    ibrav = 1,
+    ibrav = 0,
     celldm(1) = 5.42,
     nat = 2,
     ntyp = 2,
@@ -29,6 +29,11 @@
  &ELECTRONS
  /
 
+CELL_PARAMETERS "alat"
+  1 0 0
+  0 1 0
+  0 0 1
+
 ATOMIC_SPECIES
 # the second field, atomic mass, is not actually used
 # except for MD calculations
@@ -37,7 +42,7 @@ ATOMIC_SPECIES
 
 ATOMIC_POSITIONS crystal
    Fe1 0.0  0.0  0.0
-   Fe2 0.5  0.5  0.0
+   Fe2 0.5  0.5  0.5
 ! this is a comment that the code will ignore
 
 K_POINTS automatic

diff --git a/src/common/hankel.jl b/src/common/hankel.jl
@@ -5,11 +5,13 @@ Compute the Hankel transform
 ```math
     H[f] = 4\pi \int_0^\infty r f(r) j_l(p·r) r dr.
 ```
-The integration is performed by trapezoidal quadrature, and the function takes
+The integration is performed by simpson quadrature, and the function takes
 as input the radial grid `r`, the precomputed quantity r²f(r) `r2_f`, angular
 momentum / spherical bessel order `l`, and the Hankel coordinate `p`.
 """
 function hankel(r::AbstractVector, r2_f::AbstractVector, l::Integer, p::T)::T where {T<:Real}
-    integrand = r2_f .* sphericalbesselj_fast.(l, p .* r)
-    return 4T(π) * simpson(r, integrand)
+    @assert length(r) == length(r2_f)
+    4T(π) * simpson(r) do i, ri
+        r2_f[i] * sphericalbesselj_fast(l, p * ri)
+    end
 end
diff --git a/src/common/quadrature.jl b/src/common/quadrature.jl
@@ -1,90 +1,111 @@
-using LoopVectorization
-
 # NumericalIntegration.jl also implements this (in a slightly different way)
 # however, it is unmaintained and has stale, conflicting version requirements
 # for Interpolations.jl
 """
-    trapezoidal(x, y)
-
-Integrate y(x) over x using trapezoidal method quadrature.
+Integrate the `integrand` function using the nodal points `x`
+using the trapezoidal rule.
+The function will be called as `integrand(i, x[i])` for each integrand
+point `i` (not necessarily in order).
 """
 trapezoidal
-@inbounds function trapezoidal(x::AbstractVector, y::AbstractVector)
+@inbounds function trapezoidal(integrand, x::AbstractVector)
     n = length(x)
-    n == length(y) || error("vectors `x` and `y` must have the same number of elements")
-    n == 1 && return zero(promote_type(eltype(x), eltype(y)))
-    I = (x[2] - x[1]) * y[1]
-    @turbo for i = 2:(n-1)
-        # dx[i] + dx[i - 1] = (x[i + 1] - x[i]) + (x[i] - x[i - 1])
-        #                   = x[i + 1] - x[i - 1]
-        I += (x[i + 1] - x[i - 1]) * y[i]
+    Tint = eltype(integrand(1, x[1]))
+    n == 1 && return zero(promote_type(eltype(x), Tint))
+    I = (x[2] - x[1]) * integrand(1, x[1])
+    # Note: We used @turbo here before, but actually the allocation overhead
+    #       needed to get all the data into an array is worse than what one gains
+    #       with LoopVectorization
+    @fastmath @simd for i = 2:(n-1)
+        # dx[i] + dx[i-1] = (x[i+1] - x[i]) + (x[i] - x[i-1])
+        #                 = x[i+1] - x[i-1]
+        I += @inline (x[i+1] - x[i-1]) * integrand(i, x[i])
     end
-    I += (x[n] - x[n - 1]) * y[n]
+    I += (x[n] - x[n-1]) * integrand(n, x[n])
     I / 2
 end
 
 """
-    simpson(x, y)
+Integrate a function represented by the nodal points and function values
+given by the arrays `x`, `y`. Note the order (`y` comes first).
+"""
+trapezoidal(y::AbstractArray, x::AbstractArray) = trapezoidal((i, xi) -> y[i], x)
 
-Integrate y(x) over x using Simpson's method quadrature.
+"""
+Integrate the `integrand` function using the nodal points `x` using Simpson's rule.
+The function will be called as `integrand(i, x[i])` for each integrand
+point `i` (not necessarily in order).
 """
 simpson
-@inbounds function simpson(x::AbstractVector, y::AbstractVector)
+@inbounds function simpson(integrand, x::AbstractVector)
     n = length(x)
-    n == length(y) || error("vectors `x` and `y` must have the same number of elements")
-    n == 1 && return zero(promote_type(eltype(x), eltype(y)))
-    n <= 4 && return trapezoidal(x, y)
-    (x[2] - x[1]) ≈ (x[3] - x[2]) && return simpson_uniform(x, y)
-    return simpson_nonuniform(x, y)
+    n <= 4 && return trapezoidal(integrand, x)
+    if (x[2] - x[1]) ≈ (x[3] - x[2])
+        simpson_uniform(integrand, x)
+    else
+        simpson_nonuniform(integrand, x)
+    end
 end
 
-@inbounds function simpson_uniform(x::AbstractVector, y::AbstractVector)
+"""
+Integrate a function represented by the nodal points and function values
+given by the arrays `x`, `y`. Note the order (`y` comes first).
+"""
+simpson(y::AbstractArray, x::AbstractArray) = simpson((i, xi) -> y[i], x)
+
+@inbounds function simpson_uniform(integrand, x::AbstractVector)
     dx = x[2] - x[1]
     n = length(x)
     n_intervals = n - 1
 
     istop = isodd(n_intervals) ? n - 1 : n - 2
 
-    I = 1 / 3 * dx * y[1]
-    @turbo for i = 2:2:istop
-        I += 4 / 3 * dx * y[i]
+    I = 1 / 3 * dx * integrand(1, x[1])
+    # Note: We used @turbo here before, but actually the allocation overhead
+    #       needed to get all the data into an array is worse than what one gains
+    #       with LoopVectorization
+    @fastmath @simd for i = 2:2:istop
+        I += @inline 4 / 3 * dx * integrand(i, x[i])
     end
-    @turbo for i = 3:2:istop
-        I += 2 / 3 * dx * y[i]
+    @fastmath @simd for i = 3:2:istop
+        I += @inline 2 / 3 * dx * integrand(i, x[i])
     end
 
     if isodd(n_intervals)
-        I += 5 / 6 * dx * y[n - 1]
-        I += 1 / 2 * dx * y[n]
+        I += 5 / 6 * dx * integrand(n-1, x[n-1])
+        I += 1 / 2 * dx * integrand(n, x[n])
     else
-        I += 1 / 3 * dx * y[n]
+        I += 1 / 3 * dx * integrand(n, x[n])
     end
     return I
 end
 
-@inbounds function simpson_nonuniform(x::AbstractVector, y::AbstractVector)
+@inbounds function simpson_nonuniform(integrand, x::AbstractVector)
     n = length(x)
-    n_intervals = n - 1
+    n_intervals = n-1
 
-    istop = isodd(n_intervals) ? n - 3 : n - 2
+    istop = isodd(n_intervals) ? n-3 : n-2
 
-    I = zero(promote_type(eltype(x), eltype(y)))
-    # This breaks when @turbo'd
-    @simd for i = 1:2:istop
+    Tint = eltype(integrand(1, x[1]))
+    I = zero(promote_type(eltype(x), Tint))
+    @fastmath @simd for i = 1:2:istop
         dx0 = x[i + 1] - x[i]
-        dx1 = x[i + 2] - x[i + 1]
+        dx1 = x[i+2] - x[i+1]
         c = (dx0 + dx1) / 6
-        I += c * (2 - dx1 / dx0) * y[i]
-        I += c * (dx0 + dx1)^2 / (dx0 * dx1) * y[i + 1]
-        I += c * (2 - dx0 / dx1) * y[i + 2]
+        @inline begin
+            I += c * (2 - dx1 / dx0) * integrand(i, x[i])
+            I += c * (dx0 + dx1)^2 / (dx0 * dx1) * integrand(i+1, x[i+1])
+            I += c * (2 - dx0 / dx1) * integrand(i+2, x[i+2])
+        end
+
     end
 
     if isodd(n_intervals)
-        dxn = x[end] - x[end - 1]
-        dxnm1 = x[end - 1] - x[end - 2]
-        I += (2 * dxn^2 + 3 * dxn * dxnm1) / (6 * (dxnm1 + dxn)) * y[end]
-        I += (dxn^2 + 3 * dxn * dxnm1) / (6 * dxnm1) * y[end - 1]
-        I -= dxn^3 / (6 * dxnm1 * (dxnm1 + dxn)) * y[end - 2]
+        dxn = x[end] - x[end-1]
+        dxnm1 = x[end - 1] - x[end-2]
+        I += (2 * dxn^2 + 3 * dxn * dxnm1) / (6 * (dxnm1 + dxn)) * integrand(n, x[n])
+        I += (dxn^2 + 3 * dxn * dxnm1) / (6 * dxnm1) * integrand(n-1, x[n-1])
+        I -= dxn^3 / (6 * dxnm1 * (dxnm1 + dxn)) * integrand(n-2, x[n-2])
     end
 
     return I

diff --git a/src/common/threading.jl b/src/common/threading.jl
@@ -10,13 +10,12 @@ By default, use 1 FFT thread, and `Threads.nthreads()` BLAS and DFTK threads
 Changing `n_DFTK` requires a restart of Julia.
 """
 function setup_threading(; n_fft=1, n_blas=Threads.nthreads(), n_DFTK=nothing)
-    if n_DFTK != nothing
+    if !isnothing(n_DFTK)
         set_DFTK_threads!(n_DFTK)
     end
     n_DFTK = @load_preference("DFTK_threads", Threads.nthreads())
     FFTW.set_num_threads(n_fft)
     BLAS.set_num_threads(n_blas)
-    njulia = Threads.nthreads()
     mpi_master() && @info "Threading setup: " Threads.nthreads() n_DFTK n_fft n_blas
 end
 
@@ -30,10 +29,12 @@ end
 
 function set_DFTK_threads!(n)
     if n > Threads.nthreads()
-        error("You're asking for $n_DFTK DFTK threads, but only ran julia with $(Threads.nthreads()) threads.")
+        error("You set your preference for DFTK threads using set_DFTK_threads!, " *
+              "but only ran julia with $(Threads.nthreads()) threads.")
     end
     if @load_preference("DFTK_threads", nothing) != n
-        @info "DFTK_threads preference changed. This is a permanent change, restart julia to see the effect."
+        @info("DFTK_threads preference changed. This is a permanent change, " *
+              "restart julia to see the effect.")
     end
     @set_preferences!("DFTK_threads" => n)
 end
@@ -46,7 +47,8 @@ function get_DFTK_threads()
     if isnothing(nthreads)
         nthreads = Threads.nthreads()
     elseif nthreads > Threads.nthreads()
-        error("You set your preference for $n_DFTK DFTK threads, but only ran julia with $(Threads.nthreads()) threads.")
+        error("You set your preference for DFTK threads using set_DFTK_threads!, " *
+              "but only ran julia with $(Threads.nthreads()) threads.")
     end
     nthreads
 end
@@ -68,7 +70,7 @@ end
 # private interface to be called
 function parallel_loop_over_range(fun, range, storages)
     nthreads = get_DFTK_threads()
-    storages != nothing && @assert length(storages) >= nthreads
+    !isnothing(storages) && @assert length(storages) >= nthreads
     chunk_length = cld(length(range), nthreads)
 
     # this tensorized if is ugly, but this is potentially

diff --git a/src/densities.jl b/src/densities.jl
@@ -10,9 +10,13 @@ grid `basis`, where the individual k-points are occupied according to `occupatio
 It is possible to ask only for occupations higher than a certain level to be computed by
 using an optional `occupation_threshold`. By default all occupation numbers are considered.
 """
-@views @timing function compute_density(basis::PlaneWaveBasis{T}, ψ, occupation;
-                                        occupation_threshold=zero(T)) where {T}
-    Tρ = promote_type(T, real(eltype(ψ[1])))
+@views @timing function compute_density(basis::PlaneWaveBasis{T,VT}, ψ, occupation;
+                                        occupation_threshold=zero(T)) where {T,VT}
+    Tρ = promote_type(T,  real(eltype(ψ[1])))
+    Tψ = promote_type(VT, real(eltype(ψ[1])))
+    # Note, that we special-case Tψ, since when T is Dual and eltype(ψ[1]) is not
+    # (e.g. stress calculation), then only the normalisation factor introduces
+    # dual numbers, but not yet the FFT
 
     # Occupation should be on the CPU as we are going to be doing scalar indexing.
     occupation = [to_cpu(oc) for oc in occupation]
@@ -21,18 +25,18 @@ using an optional `occupation_threshold`. By default all occupation numbers are
 
     function allocate_local_storage()
         (; ρ=zeros_like(basis.G_vectors, Tρ, basis.fft_size..., basis.model.n_spin_components),
-         ψnk_real=zeros_like(basis.G_vectors, complex(Tρ), basis.fft_size...))
+         ψnk_real=zeros_like(basis.G_vectors, complex(Tψ), basis.fft_size...))
     end
     # We split the total iteration range (ik, n) in chunks, and parallelize over them.
     range = [(ik, n) for ik = 1:length(basis.kpoints) for n = mask_occ[ik]]
 
     storages = parallel_loop_over_range(range; allocate_local_storage) do kn, storage
         (ik, n) = kn
         kpt = basis.kpoints[ik]
-
-        ifft!(storage.ψnk_real, basis, kpt, ψ[ik][:, n])
+        ifft!(storage.ψnk_real, basis, kpt, ψ[ik][:, n]; normalize=false)
         storage.ρ[:, :, :, kpt.spin] .+= (occupation[ik][n] .* basis.kweights[ik]
-                                              .* abs2.(storage.ψnk_real))
+                                          .* (basis.ifft_normalization)^2
+                                          .* abs2.(storage.ψnk_real))
 
         synchronize_device(basis.architecture)
     end

diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
@@ -306,7 +306,7 @@ function final_retval(X, AX, BX, resid_history, niter, n_matvec)
         p = sortperm(λ)
         λ = λ[p]
         residuals = residuals[:, p]
-        X = X[:, p]
+        X  = X[:, p]
         AX = AX[:, p]
         BX = BX[:, p]
         resid_history = resid_history[p, :]

diff --git a/src/external/atomsbase.jl b/src/external/atomsbase.jl
@@ -45,7 +45,7 @@ function parse_system(system::AbstractSystem{D}) where {D}
         magnetic_moments = normalize_magnetic_moment.(magnetic_moments)
     end
 
-    sum_atomic_charge = sum(atom -> get(atom, :charge, 0.0u"e_au"), system)
+    sum_atomic_charge = sum(atom -> get(atom, :charge, 0.0u"e_au"), system; init=0.0u"e_au")
     if abs(sum_atomic_charge) > 1e-6u"e_au"
         error("Charged systems not yet supported in DFTK.")
     end

diff --git a/src/external/spglib.jl b/src/external/spglib.jl
@@ -5,6 +5,7 @@
 import Spglib
 
 function spglib_cell(lattice, atom_groups, positions, magnetic_moments)
+    @assert !isempty(atom_groups)  # Otherwise spglib cannot work properly
     magnetic_moments = normalize_magnetic_moment.(magnetic_moments)
 
     spg_atoms     = Int[]

diff --git a/src/fft.jl b/src/fft.jl
@@ -33,8 +33,7 @@ function ifft!(f_real::AbstractArray3, basis::PlaneWaveBasis,
     fill!(f_real, 0)
     f_real[kpt.mapping] = f_fourier
 
-    # Perform an IFFT
-    mul!(f_real, basis.ipBFFT, f_real)
+    mul!(f_real, basis.ipBFFT, f_real)  # perform IFFT
     normalize && (f_real .*= basis.ifft_normalization)
     f_real
 end

diff --git a/src/input_output.jl b/src/input_output.jl
@@ -109,6 +109,7 @@ function Base.show(io::IO, ::MIME"text/plain", basis::PlaneWaveBasis)
     showfieldln(io, "architecture", basis.architecture)
     showfieldln(io, "num. mpi processes", mpi_nprocs(basis.comm_kpts))
     showfieldln(io, "num. julia threads", Threads.nthreads())
+    showfieldln(io, "num. DFTK  threads", get_DFTK_threads())
     showfieldln(io, "num. blas  threads", BLAS.get_num_threads())
     showfieldln(io, "num. fft   threads", FFTW.get_num_threads())
     println(io)