Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 69 additions & 58 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,70 +163,81 @@
if checkGpu and (overrideGpuCheck == nil) then
local eessi_version = os.getenv('EESSI_VERSION') or ""
local eessi_eprefix = os.getenv("EESSI_EPREFIX") or ""
if eessi_eprefix == nil or eessi_version == nil then
if eessi_eprefix == "" or eessi_version == "" then
LmodError("EESSI_VERSION and EESSI_EPREFIX must be defined for GPU driver check to work\\n")
end
local cudaDriverDir = nil
if eessi_version == "2023.06" then
cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib"
else
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
end
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. "
advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n"
advice = advice .. "You can override this check by setting the environment variable "
advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
-- CUDA driver exists, now we check its version to see if an update is needed
if cudaDriverExists then
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
if not cudaVersion or cudaVersion == "" then
local eessi_prefix = os.getenv("EESSI_PREFIX")
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
source_sh("bash", script)
end
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
-- Account for the fact that the script sourced above was designed to never return a non-zero exit code,
-- even if it fails to set EESSI_CUDA_DRIVER_VERSION
-- Essentially, we handle that case here by raising an error, which can be suppressed
if not cudaVersion or cudaVersion == "" then
-- Having EESSICUDAVERSION set means we have an NVIDIA accelerator
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using this for forward thinking for when we have to deal with something similar for AMD

local cudaVersion_req = os.getenv("EESSICUDAVERSION")
if cudaVersion_req then
local cudaDriverDir = nil
if eessi_version == "2023.06" then
cudaDriverDir = string.gsub(eessi_eprefix, 'versions', 'host_injections') .. "/lib"
else
cudaDriverDir = eessi_eprefix .. "/lib/nvidia"
end
local cudaDriverFile = cudaDriverDir .. "/libcuda.so"
local cudaDriverExists = isFile(cudaDriverFile)
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
if not (cudaDriverExists or singularityCudaExists) then
local advice = "which relies on the CUDA runtime environment and driver libraries. "
advice = advice .. "In order to be able to use the module, you will need "
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. "
advice = advice .. "The file being checked for on your system is \\n" .. cudaDriverFile .. "\\n"
advice = advice .. "You can override this check by setting the environment variable "
advice = advice .. "EESSI_OVERRIDE_GPU_CHECK but "
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
else
-- CUDA driver exists, now we check its version to see if an update is needed
if cudaDriverExists then
local cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
if not cudaVersion or cudaVersion == "" then
local eessi_prefix = os.getenv("EESSI_PREFIX")
local script = pathJoin(eessi_prefix, 'scripts', 'gpu_support', 'nvidia', 'get_cuda_driver_version.sh')
source_sh("bash", script)
end
cudaVersion = os.getenv("EESSI_CUDA_DRIVER_VERSION")
-- Account for the fact that the script sourced above was designed to never return a non-zero exit code,
-- even if it fails to set EESSI_CUDA_DRIVER_VERSION
-- Essentially, we handle that case here by raising an error, which can be suppressed
local suppress_var = "EESSI_CUDA_DRIVER_VERSION_SUPPRESS_WARNING"
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
warn = warn .. "as expected. Export " .. suppress_var .. "=1"
local suppress_warn = os.getenv(suppress_var)
if not suppress_warn or suppress_warn == 1 then
LmodWarning(warn)
end
else
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if tonumber(major) < tonumber(major_req) then
driver_libs_need_update = true
elseif tonumber(major) == tonumber(major_req) then
if tonumber(minor) < tonumber(minor_req) then
if not cudaVersion or cudaVersion == "" then
local warn = "Environment variable EESSI_CUDA_DRIVER_VERSION not found. "
warn = warn .. "Cannot ensure that driver version is new enough for CUDA toolkit version: '"
warn = warn .. cudaVersion_req .. "'. This module will still be loaded, but may not function "
warn = warn .. "as expected. Export " .. suppress_var .. "=1"
if not suppress_warn or suppress_warn == "1" then
LmodWarning(warn)
end
else
-- driver CUDA versions don't give a patch version for CUDA
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
local driver_libs_need_update = false
if tonumber(major) < tonumber(major_req) then
driver_libs_need_update = true
elseif tonumber(major) == tonumber(major_req) then
if tonumber(minor) < tonumber(minor_req) then
local warn = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
warn = warn .. "You will therefore be in minor version compatibility mode as described in "
warn = warn .. "https://docs.nvidia.com/deploy/cuda-compatibility/minor-version-compatibility.html .\\n"
if not suppress_warn or suppress_warn == "1" then
LmodWarning("\\nYour driver CUDA version is ", cudaVersion, " ", warn)
if not suppress_warn then
pushenv(suppress_var, myModuleName())
end
end
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
if driver_libs_need_update == true then
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
advice = advice .. "Please update your CUDA driver libraries and then "
advice = advice .. "let EESSI know about the update.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
end
end
end
Expand Down
Loading