Changes from all commits
34 commits
1f5568f
do not auto add default qos class (#720)
geoffrey1330 Nov 5, 2025
3af633b
Update env_var (#721)
geoffrey1330 Nov 5, 2025
ad546ca
Enable ndcs and npcs when creating lvol (#729)
Hamdy-khader Nov 11, 2025
5f6382b
Fix sfam-2450 cluster update issues (#726)
Hamdy-khader Nov 11, 2025
4a6a4d7
Update Dockerfile_base (#730)
geoffrey1330 Nov 11, 2025
bf56cb6
inherit default cluster mode in new cluster (#733)
geoffrey1330 Nov 12, 2025
0e72282
Update environment variables for Simply Block (#737)
Hamdy-khader Nov 12, 2025
25e3dd2
Main lvol sync delete (#734)
Hamdy-khader Nov 13, 2025
cd68c60
added fdb multi AZ support (#736)
geoffrey1330 Nov 13, 2025
27e8365
R25.10 hotfix multi fix (#738)
Hamdy-khader Nov 13, 2025
93a9bcd
fix linter issue
Hamdy-khader Nov 13, 2025
d4c3864
fix typecheck issue
Hamdy-khader Nov 13, 2025
1c38b6e
increased k8s fdb memory limit (#740)
geoffrey1330 Nov 14, 2025
5d9e0a4
Added MIT License (#742)
noctarius Nov 14, 2025
ee8d460
Update constants.py (#744)
schmidt-scaled Nov 15, 2025
2b14491
set size of lvstore cluster in constants (as ratio to distrib page size)
schmidt-scaled Nov 15, 2025
cfd14f7
Merge remote-tracking branch 'origin/main'
schmidt-scaled Nov 15, 2025
314c4cf
Update sc name (#746)
geoffrey1330 Nov 17, 2025
ce6ae0f
updated to distributed provisioning (#748)
geoffrey1330 Nov 17, 2025
5596c11
Update Dockerfile_base (#750)
geoffrey1330 Nov 17, 2025
aaa9b42
sleep after openshift core isolation until reboot (#753)
geoffrey1330 Nov 18, 2025
b60925d
added try and except to patch_prometheus_configmap func (#756)
geoffrey1330 Nov 19, 2025
bb90c60
added hostNetwork true to simplyblock controlplane services (#771)
geoffrey1330 Nov 23, 2025
43c97a5
Set cluster_id optional on SNodeAPI docker version (#777)
Hamdy-khader Nov 25, 2025
33ee3e4
add cluster_id param for spdk_process_is_up (#779)
geoffrey1330 Nov 26, 2025
2531483
updated images for openshift preflight check (#741)
geoffrey1330 Nov 27, 2025
36f45b9
added graylog env GRAYLOG_MESSAGE_JOURNAL_MAX_SIZE (#782)
geoffrey1330 Nov 27, 2025
f412121
Create partitions and alcemls on node add in parallel (#763) (#785)
Hamdy-khader Dec 1, 2025
3c60a2c
Remove stats from fdb and get it from Prometheus (#762) (#786)
Hamdy-khader Dec 1, 2025
b1e6ecc
Merge branch 'main' into main-multi-fix
Hamdy-khader Dec 1, 2025
84ddd70
multi fix
Hamdy-khader Dec 1, 2025
25a4bec
fix type issues
Hamdy-khader Dec 1, 2025
6ddfd0b
Increase jc comp resume retry on node not online (#690)
Hamdy-khader Dec 1, 2025
6aae676
Merge branch 'main' into main-multi-fix
Hamdy-khader Dec 1, 2025
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023-2025 simplyblock GmbH

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
23 changes: 22 additions & 1 deletion docker/Dockerfile
@@ -1,12 +1,33 @@
# syntax=docker/dockerfile:1
FROM simplyblock/simplyblock:base_image

LABEL name="simplyblock"
LABEL vendor="Simplyblock"
LABEL version="1.0.0"
LABEL release="1"
LABEL summary="Simplyblock controlplane plane component"
LABEL description="Simplyblock controlplane plane container"
LABEL maintainer="developers@simplyblock.io"

COPY LICENSE /licenses/LICENSE

WORKDIR /app

COPY requirements.txt .

RUN pip3 install -r requirements.txt
RUN pip3 install --no-cache-dir -r requirements.txt


COPY . /app

RUN python setup.py install

RUN if [ -d /usr/share/terminfo ]; then \
find /usr/share/terminfo -lname '*ncr260vt300wpp*' -exec rm -f {} + ; \
rm -f /usr/share/terminfo/n/ncr260vt300wpp || true ; \
fi

RUN useradd -u 1001 -r -g 0 -d /app -s /sbin/nologin simplyblock && \
chown -R 1001:0 /app

USER 1001
1 change: 1 addition & 0 deletions docker/Dockerfile_base
@@ -38,3 +38,4 @@ RUN pip3 install setuptools --upgrade
COPY requirements.txt requirements.txt

RUN pip3 install -r requirements.txt

14 changes: 0 additions & 14 deletions docs/talos.md
@@ -19,26 +19,12 @@ kubectl label namespace simplyblock \
--overwrite
```


Patch the host machine so that OpenEBS can work

Create a machine config patch with the contents below and save as patch.yaml
```
cat > patch.yaml <<'EOF'
machine:
sysctls:
vm.nr_hugepages: "1024"
nodeLabels:
openebs.io/engine: mayastor
kubelet:
extraMounts:
- destination: /var/openebs/local
type: bind
source: /var/openebs/local
options:
- rbind
- rshared
- rw
EOF

talosctl -e <endpoint ip/hostname> -n <node ip/hostname> patch mc -p @patch.yaml
1 change: 1 addition & 0 deletions requirements.txt
@@ -24,3 +24,4 @@ flask-openapi3
jsonschema
fastapi
uvicorn
prometheus_api_client
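
Reviewer note: `prometheus_api_client` backs the Prometheus-based stats reads introduced in this PR (see commit "Remove stats from fdb and get it from Prometheus (#762) (#786)"). A minimal sketch of the library's query API; the Prometheus URL is a placeholder and the metric name is illustrative, not the real exporter schema:

```python
from datetime import datetime, timedelta

from prometheus_api_client import PrometheusConnect

# Placeholder URL; the real endpoint depends on the cluster deployment.
prom = PrometheusConnect(url="http://localhost:9090", disable_ssl=True)

# Instant query: current value of a metric.
current = prom.custom_query(query="up")

# Range query: one sample per minute over the last 20 minutes.
end = datetime.now()
start = end - timedelta(minutes=20)
series = prom.custom_query_range(
    query="up", start_time=start, end_time=end, step="60s",
)
```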
72 changes: 42 additions & 30 deletions simplyblock_core/cluster_ops.py
@@ -25,6 +25,7 @@
from simplyblock_core.models.stats import LVolStatObject, ClusterStatObject, NodeStatObject, DeviceStatObject
from simplyblock_core.models.nvme_device import NVMeDevice
from simplyblock_core.models.storage_node import StorageNode
from simplyblock_core.prom_client import PromClient
from simplyblock_core.utils import pull_docker_image_with_retry

logger = utils.get_logger(__name__)
@@ -371,8 +372,6 @@ def create_cluster(blk_size, page_size_in_blocks, cli_pass,

cluster.write_to_db(db_controller.kv_store)

qos_controller.add_class("Default", 100, cluster.get_id())

cluster_events.cluster_create(cluster)

mgmt_node_ops.add_mgmt_node(dev_ip, mode, cluster.uuid)
@@ -1002,16 +1001,11 @@ def list_all_info(cluster_id) -> str:


def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
cluster = db_controller.get_cluster_by_id(cluster_id)

if history:
records_number = utils.parse_history_param(history)
if not records_number:
raise ValueError(f"Error parsing history string: {history}")
else:
records_number = 20

records = db_controller.get_cluster_capacity(cluster, records_number)
try:
_ = db_controller.get_cluster_by_id(cluster_id)
except KeyError:
logger.error(f"Cluster not found: {cluster_id}")
return []

cap_stats_keys = [
"date",
@@ -1022,20 +1016,17 @@ def get_capacity(cluster_id, history, records_count=20) -> t.List[dict]:
"size_util",
"size_prov_util",
]
prom_client = PromClient(cluster_id)
records = prom_client.get_cluster_metrics(cluster_id, cap_stats_keys, history)
return utils.process_records(records, records_count, keys=cap_stats_keys)


def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]:
cluster = db_controller.get_cluster_by_id(cluster_id)

if history_string:
records_number = utils.parse_history_param(history_string)
if not records_number:
raise ValueError(f"Error parsing history string: {history_string}")
else:
records_number = 20

records = db_controller.get_cluster_stats(cluster, records_number)
try:
_ = db_controller.get_cluster_by_id(cluster_id)
except KeyError:
logger.error(f"Cluster not found: {cluster_id}")
return []

io_stats_keys = [
"date",
@@ -1073,6 +1064,9 @@ def get_iostats_history(cluster_id, history_string, records_count=20, with_sizes=False) -> t.List[dict]:
"write_latency_ticks",
]
)

prom_client = PromClient(cluster_id)
records = prom_client.get_cluster_metrics(cluster_id, io_stats_keys, history_string)
# combine records
return utils.process_records(records, records_count, keys=io_stats_keys)
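
Reviewer note: `simplyblock_core/prom_client.py` itself is not part of this diff, so the internals of `PromClient.get_cluster_metrics` are unknown. The sketch below is a hypothetical shape for such a wrapper on top of `prometheus_api_client`; the URL, metric/label names, return shape, and history handling are all assumptions for illustration only:

```python
# Hypothetical sketch only: the real PromClient is not shown in this diff.
from datetime import datetime, timedelta

from prometheus_api_client import PrometheusConnect


class PromClient:
    def __init__(self, cluster_id, url="http://localhost:9090"):  # placeholder URL
        self.cluster_id = cluster_id
        self.prom = PrometheusConnect(url=url, disable_ssl=True)

    def get_cluster_metrics(self, cluster_id, keys, history=None):
        # "history" strings like "10m" presumably select a lookback window;
        # default to 20 minutes when unset (assumed behavior).
        end = datetime.now()
        start = end - timedelta(minutes=20)
        samples = {}
        for key in keys:
            if key == "date":
                continue
            # Metric and label names are guesses, not the real exporter schema.
            samples[key] = self.prom.custom_query_range(
                query=f'{key}{{cluster_id="{cluster_id}"}}',
                start_time=start,
                end_time=end,
                step="60s",
            )
        return samples
```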

@@ -1137,6 +1131,7 @@ def get_logs(cluster_id, limit=50, **kwargs) -> t.List[dict]:
if record.event in ["device_status", "node_status"]:
msg = msg+f" ({record.count})"

logger.debug(record)
out.append({
"Date": record.get_date_string(),
"NodeId": record.node_id,
@@ -1159,10 +1154,6 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,

logger.info("Updating mgmt cluster")
if cluster.mode == "docker":
sbcli=constants.SIMPLY_BLOCK_CLI_NAME
subprocess.check_call(f"pip install {sbcli} --upgrade".split(' '))
logger.info(f"{sbcli} upgraded")

cluster_docker = utils.get_docker_client(cluster_id)
logger.info(f"Pulling image {constants.SIMPLY_BLOCK_DOCKER_IMAGE}")
pull_docker_image_with_retry(cluster_docker, constants.SIMPLY_BLOCK_DOCKER_IMAGE)
@@ -1176,9 +1167,13 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
for service in cluster_docker.services.list():
if image_parts in service.attrs['Spec']['Labels']['com.docker.stack.image'] or \
"simplyblock" in service.attrs['Spec']['Labels']['com.docker.stack.image']:
logger.info(f"Updating service {service.name}")
service.update(image=service_image, force_update=True)
service_names.append(service.attrs['Spec']['Name'])
if service.name in ["app_CachingNodeMonitor", "app_CachedLVolStatsCollector"]:
logger.info(f"Removing service {service.name}")
service.remove()
else:
logger.info(f"Updating service {service.name}")
service.update(image=service_image, force_update=True)
service_names.append(service.attrs['Spec']['Name'])

if "app_SnapshotMonitor" not in service_names:
logger.info("Creating snapshot monitor service")
Expand All @@ -1191,6 +1186,18 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
networks=["host"],
constraints=["node.role == manager"]
)

if "app_TasksRunnerLVolSyncDelete" not in service_names:
logger.info("Creating lvol sync delete service")
cluster_docker.services.create(
image=service_image,
command="python simplyblock_core/services/tasks_runner_sync_lvol_del.py",
name="app_TasksRunnerLVolSyncDelete",
mounts=["/etc/foundationdb:/etc/foundationdb"],
env=["SIMPLYBLOCK_LOG_LEVEL=DEBUG"],
networks=["host"],
constraints=["node.role == manager"]
)
logger.info("Done updating mgmt cluster")

elif cluster.mode == "kubernetes":
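
Reviewer note: the docker-mode branch above now removes retired monitor services, force-updates the rest, and creates any that are missing. A standalone sketch of the same reconcile pattern using the Docker SDK calls seen in the hunk; `app_ExampleRunner` and its command are hypothetical names, not services from this PR:

```python
import docker

RETIRED = {"app_CachingNodeMonitor", "app_CachedLVolStatsCollector"}


def reconcile_services(image):
    client = docker.from_env()  # assumes a swarm manager socket
    names = []
    for service in client.services.list():
        if service.name in RETIRED:
            service.remove()  # retired services are deleted outright
        else:
            service.update(image=image, force_update=True)
            names.append(service.name)

    # Create-if-missing, mirroring the snapshot-monitor / lvol-sync blocks above.
    if "app_ExampleRunner" not in names:  # hypothetical service name
        client.services.create(
            image=image,
            command="python -m example_runner",  # illustrative command
            name="app_ExampleRunner",
            networks=["host"],
            constraints=["node.role == manager"],
        )
```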
@@ -1270,7 +1277,12 @@ def update_cluster(cluster_id, mgmt_only=False, restart=False, spdk_image=None,
logger.info(f"Restarting node: {node.get_id()} with SPDK image: {spdk_image}")
else:
logger.info(f"Restarting node: {node.get_id()}")
storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image)
try:
storage_node_ops.restart_storage_node(node.get_id(), force=True, spdk_image=spdk_image)
except Exception as e:
logger.debug(e)
logger.error(f"Failed to restart node: {node.get_id()}")
return

logger.info("Done")

7 changes: 4 additions & 3 deletions simplyblock_core/constants.py
@@ -133,12 +133,12 @@ def get_config_var(name, default=None):
LVOL_NVME_CONNECT_NR_IO_QUEUES=3
LVOL_NVME_KEEP_ALIVE_TO=10
LVOL_NVME_KEEP_ALIVE_TO_TCP=7
LVOL_NVMF_PORT_START=int(os.getenv('LVOL_NVMF_PORT_START', 9100))
LVOL_NVMF_PORT_ENV = os.getenv("LVOL_NVMF_PORT_START", "")
LVOL_NVMF_PORT_START = int(LVOL_NVMF_PORT_ENV) if LVOL_NVMF_PORT_ENV else 9100
QPAIR_COUNT=32
CLIENT_QPAIR_COUNT=3
NVME_TIMEOUT_US=8000000
NVMF_MAX_SUBSYSTEMS=50000
HA_JM_COUNT=3
KATO=10000
ACK_TO=11
BDEV_RETRY=0
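
Reviewer note: the two-step parse of `LVOL_NVMF_PORT_START` fixes a subtle failure in the old one-liner: `int(os.getenv(...))` raises `ValueError` when the variable is set but empty, because the default only applies when the variable is absent. A quick demonstration:

```python
import os

os.environ["LVOL_NVMF_PORT_START"] = ""  # set, but empty

# Old form: int("") raises ValueError, since "" counts as a present value.
try:
    int(os.getenv("LVOL_NVMF_PORT_START", 9100))
except ValueError:
    print("old form crashes on an empty value")

# New form: an empty string falls back to the default.
env = os.getenv("LVOL_NVMF_PORT_START", "")
port = int(env) if env else 9100
print(port)  # 9100
```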
@@ -224,4 +224,5 @@ def get_config_var(name, default=None):

qos_class_meta_and_migration_weight_percent = 25

MIG_PARALLEL_JOBS = 16
MIG_PARALLEL_JOBS = 64
MIG_JOB_SIZE = 64
93 changes: 7 additions & 86 deletions simplyblock_core/controllers/device_controller.py
@@ -6,6 +6,7 @@
from simplyblock_core.db_controller import DBController
from simplyblock_core.models.nvme_device import NVMeDevice, JMDevice
from simplyblock_core.models.storage_node import StorageNode
from simplyblock_core.prom_client import PromClient
from simplyblock_core.rpc_client import RPCClient


@@ -440,14 +441,16 @@ def get_device_capacity(device_id, history, records_count=20, parse_sizes=True):
else:
records_number = 20

records = db_controller.get_device_capacity(device, records_number)
# records = db_controller.get_device_capacity(device, records_number)
cap_stats_keys = [
"date",
"size_total",
"size_used",
"size_free",
"size_util",
]
prom_client = PromClient(device.cluster_id)
records = prom_client.get_device_metrics(device_id, cap_stats_keys, history)
records_list = utils.process_records(records, records_count, keys=cap_stats_keys)

if not parse_sizes:
@@ -474,15 +477,6 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
logger.error("device not found")
return False

if history:
records_number = utils.parse_history_param(history)
if not records_number:
logger.error(f"Error parsing history string: {history}")
return False
else:
records_number = 20

records_list = db_controller.get_device_stats(device, records_number)
io_stats_keys = [
"date",
"read_bytes",
Expand All @@ -496,8 +490,10 @@ def get_device_iostats(device_id, history, records_count=20, parse_sizes=True):
"write_io_ps",
"write_latency_ps",
]
prom_client = PromClient(device.cluster_id)
records = prom_client.get_device_metrics(device_id, io_stats_keys, history)
# combine records
new_records = utils.process_records(records_list, records_count, keys=io_stats_keys)
new_records = utils.process_records(records, records_count, keys=io_stats_keys)

if not parse_sizes:
return new_records
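
Reviewer note: a usage sketch of the reworked getter, based only on the signature visible in this diff; the device id and history window are placeholders:

```python
from simplyblock_core.controllers import device_controller

records = device_controller.get_device_iostats(
    "00000000-0000-0000-0000-000000000000",  # placeholder device id
    history="10m",        # window forwarded to PromClient (assumed format)
    records_count=20,
    parse_sizes=False,    # keep raw numbers instead of human-readable sizes
)
# get_device_iostats returns False when the device is not found, so guard it.
for rec in records or []:
    print(rec.get("date"), rec.get("read_bytes"), rec.get("write_io_ps"))
```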
@@ -657,81 +653,6 @@ def add_device(device_id, add_migration_task=True):
tasks_controller.add_new_device_mig_task(device_id)
return device_id

#
# # create partitions
# partitions = snode.num_partitions_per_dev
# rpc_client = RPCClient(snode.mgmt_ip, snode.rpc_port, snode.rpc_username, snode.rpc_password)
# # look for partitions
# partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj)
# logger.debug("partitioned_devices")
# logger.debug(partitioned_devices)
# if len(partitioned_devices) == partitions+1:
# logger.info("Partitioned devices found")
# else:
# logger.info(f"Creating partitions for {device_obj.nvme_bdev}")
# storage_node_ops._create_device_partitions(rpc_client, device_obj, snode, partitions, snode.jm_percent)
# partitioned_devices = storage_node_ops._search_for_partitions(rpc_client, device_obj)
# if len(partitioned_devices) == partitions+1:
# logger.info("Device partitions created")
# else:
# logger.error("Failed to create partitions")
# return False
#
# jm_part = partitioned_devices.pop(0)
# new_devices = []
# dev_order = storage_node_ops.get_next_cluster_device_order(db_controller, snode.cluster_id)
# for dev in partitioned_devices:
# new_device = storage_node_ops._create_storage_device_stack(rpc_client, dev, snode, after_restart=False)
# if not new_device:
# logger.error("failed to create dev stack")
# continue
#
# new_device.cluster_device_order = dev_order
# dev_order += 1
# device_events.device_create(new_device)
# new_devices.append(new_device)
#
# if new_devices:
# snode.nvme_devices.remove(device_obj)
# snode.nvme_devices.extend(new_devices)
# snode.write_to_db(db_controller.kv_store)
# else:
# logger.error("failed to create devices")
# return False
#
# for dev in new_devices:
# distr_controller.send_cluster_map_add_device(dev, snode)
#
# logger.info("Make other nodes connect to the node devices")
# snodes = db_controller.get_storage_nodes_by_cluster_id(snode.cluster_id)
# for node in snodes:
# if node.get_id() == snode.get_id() or node.status != StorageNode.STATUS_ONLINE:
# continue
# node.remote_devices = storage_node_ops._connect_to_remote_devs(node)
# node.write_to_db()
# for dev in new_devices:
# distr_controller.send_cluster_map_add_device(dev, node)
#
# for dev in new_devices:
# tasks_controller.add_new_device_mig_task(dev.get_id())
#
# # add to jm raid
# if snode.jm_device and snode.jm_device.raid_bdev and jm_part:
# # looking for jm partition
# jm_dev_part = jm_part.nvme_bdev
# ret = rpc_client.get_bdevs(jm_dev_part)
# if ret:
# logger.info(f"JM part found: {jm_dev_part}")
# if snode.jm_device.status in [JMDevice.STATUS_UNAVAILABLE, JMDevice.STATUS_REMOVED]:
# restart_jm_device(snode.jm_device.get_id(), force=True, format_alceml=True)
#
# if snode.jm_device.status == JMDevice.STATUS_ONLINE and \
# jm_dev_part not in snode.jm_device.jm_nvme_bdev_list:
# remove_jm_device(snode.jm_device.get_id(), force=True)
# restart_jm_device(snode.jm_device.get_id(), force=True)
#
# return "Done"


def device_set_failed_and_migrated(device_id):
db_controller = DBController()