"""
This script creates a sequence of easystack files that may be used to replicate the software installed
in a reference software subdirectory.

The script
 - Determines all software that was installed in the reference prefix
 - Sorts it in order of installation. For software that was later rebuilt, the original installation time is used.
 - In the installation order, easyconfig names are added to easystack files
 - A new easystack file is started when either the EasyBuild version to be used changes, or when the maximum build
   time is exceeded (build times of the software in the reference software subdir are used to estimate this)

By sticking to the original order in which software was installed, using the robot should not be needed. Since nothing
is installed by the robot, one is able to guarantee that the same easyconfigs and easyblocks are used that were
used during original installation time.

If an argument is provided for --eb-override-version, installations of EasyBuild itself are performed before
anything else, with the EasyBuild version provided as argument.

Example:

    python3 eessi_software_reproduce_stack.py --reference-software-subdir=x86_64/amd/zen2 --eessi-version 2025.06

will create easystacks that allow you to replicate the software installed in
/cvmfs/software.eessi.io/versions/2025.06/software/linux/x86_64/amd/zen2, provided the logs of these
installations were backed up to
/cvmfs/software.eessi.io/versions/2025.06/software/linux/x86_64/amd/zen2/reprod
(which was standard practice starting with EESSI version 2025.06).
"""
import argparse
import bz2
import functools
import glob
import os
import pathlib
import pprint
import re
from datetime import datetime
from multiprocessing import Pool
from typing import Union

# Format of the per-build directory names below <reprod>/<name>/<version>/.
# Names in this format sort lexicographically in chronological order.
DATESTAMP_FORMAT = "%Y%m%d_%H%M%SUTC"

# Matches the timestamp of an EasyBuild log line such as
# "== 2025-10-30 12:59:09,573 easyblock.py:371 ..."
LOG_TIMESTAMP_PATTERN = re.compile(r"==\s+([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]+)")
LOG_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S,%f"

# Matches the EasyBuild version announced on the first line of an EasyBuild log
EB_VERSION_PATTERN = re.compile(r"This is EasyBuild ([0-9]+\.[0-9]+\.[0-9]+)")

# Artificial build time given to EasyBuild itself when its version is overridden,
# so that it sorts before every real installation
EPOCH = datetime.strptime("19700101_000000UTC", DATESTAMP_FORMAT)

PathLike = Union[str, pathlib.Path]


def parse_args(argv=None):
    """Parse the command line (``argv`` defaults to ``sys.argv[1:]``) and return the argparse namespace."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the line breaks and the bullet list of the module docstring in --help
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-m', '--max-build-time', type=int, default=240,
                        help='Maximum build time in minutes for each easystack file')
    parser.add_argument('-o', '--eb-override-version', type=str, default=None,
                        help='EasyBuild version used to install other EasyBuild versions. The default (None) means '
                             'it will attempt to use the EasyBuild that was used in the reference-software-subdir, '
                             'but if this was a bootstrapped build (e.g. EB-5.1.1 building EB-5.1.1) in practice the '
                             'latest EB will be used by the EESSI build scripts - creating a false suggestion about '
                             'which version was used to install EasyBuild.')
    parser.add_argument('-r', '--reference-software-subdir', type=str, required=True,
                        help='Reference software subdirectory, e.g. x86_64/amd/zen4')
    parser.add_argument('-e', '--eessi-version', type=str, required=True, help='EESSI version')
    parser.add_argument('-d', '--debug', action='store_true', help="Print debugging output")
    return parser.parse_args(argv)


def get_build_duration(file: PathLike, encoding: str = "utf-8") -> float:
    """
    Returns the total build duration (in minutes) by comparing the timestamps on the first and the last line of a
    bz2-compressed EasyBuild log file.

    Raises ValueError if the log is empty, or if its first or last line does not carry an EasyBuild timestamp.
    """
    # Since this is a compressed file, we cannot seek, and have to read line-by-line to find the first and last line
    first_line = None
    last_line = None
    with bz2.open(file, mode="rt", encoding=encoding, errors="replace") as log:
        for line in log:
            if first_line is None:
                first_line = line
            # Continuously overwrite the last line
            last_line = line

    if first_line is None:
        raise ValueError(f"EasyBuild log file {file} is empty")

    timestamps = []
    for line in (first_line, last_line):
        line = line.rstrip("\n")
        match = LOG_TIMESTAMP_PATTERN.search(line)
        if match is None:
            raise ValueError(f"Failed to find pattern {LOG_TIMESTAMP_PATTERN.pattern} in line {line}")
        timestamps.append(datetime.strptime(match.group(1), LOG_TIMESTAMP_FORMAT))

    start_time, end_time = timestamps
    return (end_time - start_time).total_seconds() / 60


def get_easybuild_version(file: PathLike, encoding: str = "utf-8") -> str:
    """
    Returns the EasyBuild version that was used to build this software, obtained from the first line of the
    bz2-compressed EasyBuild log file.

    Raises ValueError if the first line does not announce an EasyBuild version.
    """
    with bz2.open(file, mode="rt", encoding=encoding, errors="replace") as log:
        first_line = log.readline()

    match = EB_VERSION_PATTERN.search(first_line)
    if match is None:
        raise ValueError(
            f"Failed to find pattern {EB_VERSION_PATTERN.pattern} in first line of {file}: {first_line.rstrip()}"
        )
    return match.group(1)


def write_software_info(local_software_info, easystack_file, build_duration, debug=False):
    """
    Write one easystack file listing the easyconfigs in ``local_software_info`` (in iteration order).

    local_software_info: dict mapping "<name>-<version>" to an info dict with at least the keys
        "easyconfig_path", "easyblock_path" and "build_duration"
    easystack_file: path of the file to write; an existing file is overwritten, so that rerunning the script
        does not append a second 'easyconfigs:' document to a stale file
    build_duration: total build duration (in minutes) of all entries, recorded in a header comment
    debug: when True, print one line per easyconfig that is added
    """
    lines = [
        f"# {easystack_file}: total build duration = {build_duration:.0f} minutes\n",
        "easyconfigs:\n",
    ]
    for software_name, info in local_software_info.items():
        if debug:
            print(f'Adding {software_name} with build duration {info["build_duration"]:.0f} '
                  f'to easystack {easystack_file}.')
        # Each list item is a single-key mapping <easyconfig>: {options: {...}}, so 'options' has to be
        # indented deeper than the easyconfig path itself
        lines.append(f'  - {info["easyconfig_path"]}:\n')
        lines.append('      options:\n')
        lines.append(f'        include-easyblocks: {info["easyblock_path"]}\n')

    with open(easystack_file, "w") as easystack_file_handle:
        easystack_file_handle.writelines(lines)


def _find_single_build_log(datestamp_dir, software_name):
    """Return the one EasyBuild log in ``datestamp_dir``/easybuild for ``software_name``; ValueError otherwise."""
    # glob.escape keeps glob metacharacters in directory or software names from being read as wildcards
    build_log_path_glob = os.path.join(
        glob.escape(os.path.join(datestamp_dir, "easybuild")),
        f"easybuild-{glob.escape(software_name)}-*.log.bz2",
    )
    # We use a wildcard, but check only one file matches
    matching_files = glob.glob(build_log_path_glob)
    if len(matching_files) != 1:
        raise ValueError(f"Expected only one file to match {build_log_path_glob}. Instead got: {matching_files}")
    return matching_files[0]


def inner_loop(software_name, root_dir, eb_override_version=None):
    """
    Gather the build information for every installed version of one software package. This is the unit of work
    that is parallelized over.

    software_name: name of a directory directly below ``root_dir``
    root_dir: the 'reprod' directory of the reference software subdirectory
    eb_override_version: if set, installations of EasyBuild itself are reported as built with this EasyBuild
        version, and get an initial build time of 1970-01-01 so that they sort before everything else

    Returns a dict mapping "<name>-<version>" to a dict with the keys "initial_build_time" (datetime),
    "build_duration" (minutes, float), "easybuild_version", "easyblock_path" and "easyconfig_path". The dict is
    empty if ``software_name`` is not a directory.
    """
    software_info = {}
    software_dir = os.path.join(root_dir, software_name)
    if not os.path.isdir(software_dir):
        return software_info

    # Determine if this is about EasyBuild itself, and if its version should be overridden
    override_easybuild_version = bool(software_name == "EasyBuild" and eb_override_version)

    for software_version in sorted(os.listdir(software_dir)):
        software_version_dir = os.path.join(software_dir, software_version)
        if not os.path.isdir(software_version_dir):
            continue

        # os.listdir returns entries in arbitrary order; the datestamp format sorts chronologically, so after
        # sorting the first entry is the initial build and the last entry is the most recent (re)build
        datestamps = sorted(os.listdir(software_version_dir))
        if not datestamps:
            raise ValueError(f"No timestamped build directories found in {software_version_dir}")
        datestamp_dir_first_build = os.path.join(software_version_dir, datestamps[0])
        datestamp_dir_last_build = os.path.join(software_version_dir, datestamps[-1])

        # Extract the date/time of the initial software build
        initial_build_time = datetime.strptime(datestamps[0], DATESTAMP_FORMAT)

        # Extract the total build time from the build log of the first build
        build_duration = get_build_duration(_find_single_build_log(datestamp_dir_first_build, software_name))

        if override_easybuild_version:
            # Set the build time such that EasyBuild appears first in the easystack files
            initial_build_time = EPOCH
            easybuild_version = eb_override_version
        else:
            # Extract the EasyBuild version from the build log of the last build
            easybuild_version = get_easybuild_version(
                _find_single_build_log(datestamp_dir_last_build, software_name)
            )

        # The easyblocks and easyconfig used for the last installation
        easyblock_path = os.path.join(datestamp_dir_last_build, "easybuild", "reprod", "easyblocks", "*.py")
        easyconfig_path = os.path.join(
            datestamp_dir_last_build, "easybuild", f"{software_name}-{software_version}.eb"
        )

        software_info[f"{software_name}-{software_version}"] = {
            "initial_build_time": initial_build_time,
            "build_duration": build_duration,
            "easybuild_version": easybuild_version,
            "easyblock_path": easyblock_path,
            "easyconfig_path": easyconfig_path,
        }

    return software_info


def group_into_easystacks(software_info, max_build_time):
    """
    Split ``software_info`` (already in installation order) into consecutive groups, one per easystack file.

    A new group is started when the EasyBuild version differs from that of the previous entry, or when adding
    the entry would push the accumulated build duration of the group over ``max_build_time`` (minutes). An entry
    that exceeds ``max_build_time`` on its own still gets a group of its own.

    Returns a list of (easybuild_version, build_duration, entries) tuples; the list is empty for empty input.
    """
    groups = []
    current = {}
    current_duration = 0
    previous_eb_ver = None
    for software_name, info in software_info.items():
        if current and (
            info["easybuild_version"] != previous_eb_ver
            or current_duration + info["build_duration"] > max_build_time
        ):
            groups.append((previous_eb_ver, current_duration, current))
            current = {}
            current_duration = 0

        current[software_name] = info
        current_duration += info["build_duration"]
        previous_eb_ver = info["easybuild_version"]

    # Flush the last group, if any
    if current:
        groups.append((previous_eb_ver, current_duration, current))
    return groups


def main(argv=None):
    """Crawl the reprod directory, and write the easystack files to the current working directory."""
    args = parse_args(argv)

    # Define the directory to crawl
    root_dir = (
        f"/cvmfs/software.eessi.io/versions/{args.eessi_version}"
        f"/software/linux/{args.reference_software_subdir}/reprod"
    )
    if not os.path.isdir(root_dir):
        raise SystemExit(f"ERROR: reproduction directory {root_dir} does not exist")

    # Sorted, so that the result does not depend on the order in which the filesystem lists entries
    software_list = sorted(os.listdir(root_dir))
    print(f"Software list: {len(software_list)} items")
    if args.debug:
        print(f"{software_list}")

    # Use as many workers as we have cores in our cgroup (sched_getaffinity is not available on all platforms)
    if hasattr(os, "sched_getaffinity"):
        n_workers = len(os.sched_getaffinity(0))
    else:
        n_workers = os.cpu_count() or 1

    # Parallelize work over each dir present in the root_dir. The settings are bound explicitly instead of read
    # from module globals, so that this also works when worker processes are spawned rather than forked.
    print("Gathering information from the installation logs, this may take a while...")
    worker = functools.partial(inner_loop, root_dir=root_dir, eb_override_version=args.eb_override_version)
    with Pool(processes=n_workers) as pool:
        software_info_list = pool.map(worker, software_list)

    # Each worker creates its own software info dict, so the result of the map is a list of dicts. Merge these
    # into one. The keys are unique, so there is no risk of clashes (if there were, the last dict would
    # determine the value).
    software_info = {}
    for partial_info in software_info_list:
        software_info.update(partial_info)
    print(f"Gathered information for {len(software_info)} software installations (including versions) in {root_dir}")
    if args.debug:
        pprint.pprint(software_info)

    # Order the software chronologically; the name is used as tie-breaker to keep the order deterministic
    software_info = dict(
        sorted(software_info.items(), key=lambda item: (item[1]["initial_build_time"], item[0]))
    )

    print("Writing software build information to easystack files...")
    groups = group_into_easystacks(software_info, args.max_build_time)
    for sequence_number, (eb_version, build_duration, entries) in enumerate(groups, start=1):
        easystack_file = f'easystack-{sequence_number}-eb-{eb_version}.yml'
        write_software_info(entries, easystack_file, build_duration, debug=args.debug)

    total_build_duration = sum(build_duration for _, build_duration, _ in groups)
    print(f"Total of {len(groups)} easystacks with a total build time of {total_build_duration:.0f} minutes")


if __name__ == "__main__":
    main()