Compare commits

...

2 Commits

Author SHA1 Message Date
Florian Paul Azim Hoberg
23eff5069e Merge branch 'main' into feature/auto-node-upgrade 2024-08-07 13:12:24 +02:00
Florian Paul Azim Hoberg
e3fdc506f9 feature: Add node auto-update support / rolling updates 2024-08-07 13:05:51 +02:00
15 changed files with 2824 additions and 30 deletions

View File

@@ -0,0 +1,12 @@
#!/bin/bash
# Build DEB and RPM packages via CMake/CPack and collect them in ./packages.
#
# Fixes over the previous version:
#  * set -euo pipefail: abort on the first failing command instead of
#    copying stale or missing artifacts afterwards.
#  * mkdir -p: re-running the script no longer fails when the
#    'packages' or 'build' directories already exist.
set -euo pipefail

mkdir -p packages
mkdir -p build
cd build

# Configure once, then generate both package formats from the same build tree.
cmake ..
cpack -G DEB .
cpack -G RPM .

cp *.deb ../packages
cp *.rpm ../packages

cd ..
rm -rf build
echo "Packages created. Packages can be found in directory: packages"

View File

@@ -0,0 +1,37 @@
cmake_minimum_required(VERSION 3.16)
project(proxmox-rebalancing-service VERSION 1.0.0)

# Ship the patched Proxmox API endpoint file that adds the upgrade/reboot
# commands used by ProxLB's rolling-update feature.
install(FILES perl5/PVE/API2/Nodes.pm DESTINATION /usr/share/perl5/PVE/API2/)

# General
set(CPACK_PACKAGE_NAME "proxlb-additions")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE")
set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/../README.md")
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Florian Paul Azim <gyptazy> Hoberg <gyptazy@gyptazy.ch>")
set(CPACK_PACKAGE_CONTACT "Florian Paul Azim Hoberg <gyptazy@gyptazy.ch>")
set(CPACK_PACKAGE_VENDOR "gyptazy")

# RPM packaging
set(CPACK_PACKAGE_VERSION "${CMAKE_PROJECT_VERSION}")
set(CPACK_GENERATOR "RPM")
# Fix: RPM spells the 64-bit x86 architecture "x86_64"; "amd64" is the
# Debian naming and would produce a wrongly tagged RPM.
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_SUMMARY "ProxLB Additions - Additional optional patched libraries for ProxLB.")
set(CPACK_RPM_PACKAGE_DESCRIPTION "ProxLB Additions - Additional optional patched libraries for ProxLB.")
set(CPACK_RPM_CHANGELOG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/changelog_redhat")
set(CPACK_PACKAGE_RELEASE 1)
set(CPACK_RPM_PACKAGE_LICENSE "GPL 3.0")
# NOTE(review): on current RHEL-family distros the interpreter package is
# usually "python3"; "python >= 3.2.0" may be unresolvable — verify against
# the target distributions.
set(CPACK_RPM_PACKAGE_REQUIRES "python >= 3.2.0")

# DEB packaging
set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT)
set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE "amd64")
set(CPACK_DEBIAN_PACKAGE_SUMMARY "ProxLB Additions - Additional optional patched libraries for ProxLB.")
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ProxLB Additions - Additional optional patched libraries for ProxLB.")
# NOTE(review): CONTROL_EXTRA installs extra files into the .deb control
# archive (postinst/conffiles etc.); a Debian changelog is normally shipped
# compressed under /usr/share/doc/<pkg>/ — confirm this is intended.
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/changelog_debian")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "python3")
set(CPACK_DEBIAN_PACKAGE_LICENSE "GPL 3.0")

# Install
set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
include(CPack)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
## Build packages
Building the packages requires cmake, deb and rpm.
To build the packages, simply run the following commands:
```
mkdir build
cd build
cmake ..
cpack -G RPM .
cpack -G DEB .
```
When running on Debian/Ubuntu you can directly call `01_package.sh`
to create your own packages.

View File

@@ -0,0 +1,11 @@
proxlb (1.0.0) unstable; urgency=low
* Initial release of ProxLB.
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.ch> Thu, 01 Aug 2024 17:04:12 +0200
proxlb (0.9.0) unstable; urgency=low
* Initial development release of ProxLB as a tech preview.
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.ch> Sun, 07 Jul 2024 05:38:41 +0200

View File

@@ -0,0 +1,5 @@
* Thu Aug 01 2024 Florian Paul Azim Hoberg <gyptazy@gyptazy.ch>
- Initial release of ProxLB.
* Sun Jul 07 2024 Florian Paul Azim Hoberg <gyptazy@gyptazy.ch>
- Initial development release of ProxLB as a tech preview.

169
proxlb
View File

@@ -96,7 +96,7 @@ def pre_validations(config_path):
logging.info(f'{info_prefix} All pre-validations done.')
def post_validations():
def post_validations(api_object, node_requires_reboot):
""" Run post-validations as sanity checks. """
error_prefix = 'Error: [post-validations]:'
info_prefix = 'Info: [post-validations]:'
@@ -105,6 +105,8 @@ def post_validations():
logging.critical(f'{error_prefix} Not all post-validations succeeded. Please validate!')
else:
logging.info(f'{info_prefix} All post-validations succeeded.')
# Reboot node if necessary and all validations were performed.
run_node_reboot(api_object, node_requires_reboot)
def validate_daemon(daemon, schedule):
@@ -270,6 +272,97 @@ def validate_cluster_master(cluster_master):
return True
<<<<<<< HEAD
def get_node_update_status(api_object):
    """ Get the current update status of the current executing host node in the cluster.

    Parameters:
        api_object: proxmoxer-style API connection used to query the node's
                    pending APT updates.

    Returns:
        bool: True when at least one update is pending, False otherwise.

    Exits the process with status 2 when the executing host is unknown to
    the cluster.
    """
    info_prefix = 'Info: [node-update-status-getter]:'
    error_prefix = 'Error: [node-update-status-getter]:'
    node_executor_hostname = socket.gethostname()

    logging.info(f'{info_prefix} Get update status for node: {node_executor_hostname}.')
    try:
        update_status_object = api_object.nodes(node_executor_hostname).apt().update.get()
    except proxmoxer.core.ResourceException:
        # Bug fix: this is a critical error and previously used info_prefix,
        # leaving error_prefix unused and mislabelling the log line.
        logging.critical(f'{error_prefix} Unknown node in cluster: {node_executor_hostname}.')
        sys.exit(2)

    if len(update_status_object) > 0:
        logging.info(f'{info_prefix} Updates available for node: {node_executor_hostname}.')
        return True

    logging.info(f'{info_prefix} No updates available for node: {node_executor_hostname}.')
    return False
def run_node_update(api_object, node_requires_updates):
    """ Run the update execution on node.

    Parameters:
        api_object: proxmoxer-style API connection used to trigger the
                    upgrade command on the executing node.
        node_requires_updates (bool): result of get_node_update_status();
                    when False this function is a no-op.

    Exits the process with status 2 when the patched API endpoint
    (package proxlb-additions) is missing on the node.
    """
    info_prefix = 'Info: [node-update-executor]:'
    error_prefix = 'Error: [node-update-executor]:'
    node_executor_hostname = socket.gethostname()

    if node_requires_updates:
        logging.info(f'{info_prefix} Execute updates on node: {node_executor_hostname}.')
        try:
            # Return value was previously bound to an unused local; the call
            # is made purely for its side effect.
            api_object.nodes(node_executor_hostname).status().post(command='upgrade')
        except proxmoxer.core.ResourceException:
            logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
            sys.exit(2)
        # Bug fix: corrected "Sucessfully" typo in the log message.
        logging.info(f'{info_prefix} Successfully integrated updates to node: {node_executor_hostname}.')
def extend_ignore_node_list(ignore_nodes):
    """ Extend the node ignore list by this node.

    Parameters:
        ignore_nodes (str): comma separated list of node names to ignore.

    Returns:
        str: the list with the executing node's hostname appended.
    """
    info_prefix = 'Info: [node-ignore-list-adder]:'
    node_executor_hostname = socket.gethostname()

    logging.info(f'{info_prefix} Adding node {node_executor_hostname} to ignore list.')
    # Robustness: avoid a leading comma (and a resulting empty list entry
    # after split(',')) when the incoming list is empty.
    if ignore_nodes:
        ignore_nodes = ignore_nodes + f',{node_executor_hostname}'
    else:
        ignore_nodes = node_executor_hostname
    logging.info(f'{info_prefix} Ignored nodes are now: {ignore_nodes}.')
    return ignore_nodes
def get_node_reboot_status():
    """ Get the current reboot status of the current executing host node in the cluster.

    Checks for the Debian-style /var/run/reboot-required marker file.

    Returns:
        bool: True when the marker file exists (reboot pending), False otherwise.
    """
    info_prefix = 'Info: [node-reboot-status-getter]:'
    # Debian/Ubuntu convention: present when installed updates need a reboot.
    reboot_status_file = '/var/run/reboot-required'
    node_executor_hostname = socket.gethostname()

    logging.info(f'{info_prefix} Get reboot status for node: {node_executor_hostname}.')
    if os.path.exists(reboot_status_file):
        logging.info(f'{info_prefix} Reboot required for node: {node_executor_hostname}.')
        return True

    logging.info(f'{info_prefix} No reboot required for node: {node_executor_hostname}.')
    return False
def run_node_reboot(api_object, node_requires_reboot):
    """ Run the reboot execution on node.

    (Docstring fix: previously copy-pasted from run_node_update and
    described an update instead of a reboot.)

    Parameters:
        api_object: proxmoxer-style API connection used to trigger the
                    reboot command on the executing node.
        node_requires_reboot (bool): result of get_node_reboot_status();
                    when False this function is a no-op.

    Exits the process with status 2 when the patched API endpoint
    (package proxlb-additions) is missing on the node.
    """
    info_prefix = 'Info: [node-reboot-executor]:'
    error_prefix = 'Error: [node-reboot-executor]:'
    node_executor_hostname = socket.gethostname()

    if node_requires_reboot:
        logging.info(f'{info_prefix} Execute reboot on node: {node_executor_hostname}.')
        try:
            # Return value was previously bound to an unused local; the call
            # is made purely for its side effect.
            api_object.nodes(node_executor_hostname).status().post(command='reboot')
        except proxmoxer.core.ResourceException:
            logging.critical(f'{error_prefix} Missing API endpoint on node: {node_executor_hostname}. Please make sure to have the package proxlb-additions installed.')
            sys.exit(2)
        logging.info(f'{info_prefix} Rebooting node now: {node_executor_hostname}.')
=======
>>>>>>> main
def get_node_statistics(api_object, ignore_nodes):
""" Get statistics of cpu, memory and disk for each node in the cluster. """
info_prefix = 'Info: [node-statistics]:'
@@ -404,27 +497,29 @@ def get_vm_statistics(api_object, ignore_vms, balancing_type):
return vm_statistics
def update_node_statistics(node_statistics, vm_statistics):
def update_node_statistics(node_statistics, vm_statistics, ignore_nodes):
""" Update node statistics by VMs statistics. """
info_prefix = 'Info: [node-update-statistics]:'
warn_prefix = 'Warning: [node-update-statistics]:'
info_prefix = 'Info: [node-update-statistics]:'
warn_prefix = 'Warning: [node-update-statistics]:'
ignore_nodes_list = ignore_nodes.split(',')
for vm, vm_value in vm_statistics.items():
node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100
if not vm_value['node_parent'] in ignore_nodes_list:
node_statistics[vm_value['node_parent']]['cpu_assigned'] = node_statistics[vm_value['node_parent']]['cpu_assigned'] + int(vm_value['cpu_total'])
node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] = (node_statistics[vm_value['node_parent']]['cpu_assigned'] / node_statistics[vm_value['node_parent']]['cpu_total']) * 100
node_statistics[vm_value['node_parent']]['memory_assigned'] = node_statistics[vm_value['node_parent']]['memory_assigned'] + int(vm_value['memory_total'])
node_statistics[vm_value['node_parent']]['memory_assigned_percent'] = (node_statistics[vm_value['node_parent']]['memory_assigned'] / node_statistics[vm_value['node_parent']]['memory_total']) * 100
node_statistics[vm_value['node_parent']]['disk_assigned'] = node_statistics[vm_value['node_parent']]['disk_assigned'] + int(vm_value['disk_total'])
node_statistics[vm_value['node_parent']]['disk_assigned_percent'] = (node_statistics[vm_value['node_parent']]['disk_assigned'] / node_statistics[vm_value['node_parent']]['disk_total']) * 100
if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['cpu_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for CPU by {int(node_statistics[vm_value["node_parent"]]["cpu_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['memory_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for memory by {int(node_statistics[vm_value["node_parent"]]["memory_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')
if node_statistics[vm_value['node_parent']]['disk_assigned_percent'] > 99:
logging.warning(f'{warn_prefix} Node {vm_value["node_parent"]} is overprovisioned for disk by {int(node_statistics[vm_value["node_parent"]]["disk_assigned_percent"])}%.')
logging.info(f'{info_prefix} Updated node resource assignments by all VMs.')
logging.debug('node_statistics')
@@ -484,7 +579,7 @@ def __get_proxlb_groups(vm_tags):
return group_include, group_exclude, vm_ignore
def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms):
def balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms):
""" Calculate re-balancing of VMs on present nodes across the cluster. """
info_prefix = 'Info: [rebalancing-calculator]:'
@@ -501,14 +596,14 @@ def balancing_calculations(balancing_method, balancing_mode, balancing_mode_opti
# Update resource statistics for VMs and nodes.
node_statistics, vm_statistics = __update_resource_statistics(resources_vm_most_used, resources_node_most_free,
vm_statistics, node_statistics, balancing_method, balancing_mode)
vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
# Start recursion until we do not have any needs to rebalance anymore.
balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, rebalance, processed_vms)
balancing_calculations(balancing_method, balancing_mode, balancing_mode_option, node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance, processed_vms)
# Honour groupings for include and exclude groups for rebalancing VMs.
node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
node_statistics, vm_statistics = __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
# Remove VMs that are not being relocated.
vms_to_remove = [vm_name for vm_name, vm_info in vm_statistics.items() if 'node_rebalance' in vm_info and vm_info['node_rebalance'] == vm_info.get('node_parent')]
@@ -632,11 +727,12 @@ def __get_most_free_resources_node(balancing_method, balancing_mode, balancing_m
return node
def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode):
def __update_resource_statistics(resource_highest_used_resources_vm, resource_highest_free_resources_node, vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Update VM and node resource statistics. """
info_prefix = 'Info: [rebalancing-resource-statistics-update]:'
ignore_nodes_list = ignore_nodes.split(',')
if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0]:
if resource_highest_used_resources_vm[1]['node_parent'] != resource_highest_free_resources_node[0] and resource_highest_used_resources_vm[1]['node_parent'] not in ignore_nodes_list:
vm_name = resource_highest_used_resources_vm[0]
vm_node_parent = resource_highest_used_resources_vm[1]['node_parent']
vm_node_rebalance = resource_highest_free_resources_node[0]
@@ -668,7 +764,7 @@ def __update_resource_statistics(resource_highest_used_resources_vm, resource_hi
return node_statistics, vm_statistics
def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Get VMs tags for include groups. """
info_prefix = 'Info: [rebalancing-tags-group-include]:'
tags_include_vms = {}
@@ -697,13 +793,13 @@ def __get_vm_tags_include_groups(vm_statistics, node_statistics, balancing_metho
vm_node_rebalance = vm_statistics[vm_name]['node_rebalance']
else:
_mocked_vm_object = (vm_name, vm_statistics[vm_name])
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [vm_node_rebalance], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
processed_vm.append(vm_name)
return node_statistics, vm_statistics
def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode):
def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes):
""" Get VMs tags for exclude groups. """
info_prefix = 'Info: [rebalancing-tags-group-exclude]:'
tags_exclude_vms = {}
@@ -736,7 +832,7 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho
random_node = random.choice(list(node_statistics.keys()))
else:
_mocked_vm_object = (vm_name, vm_statistics[vm_name])
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode)
node_statistics, vm_statistics = __update_resource_statistics(_mocked_vm_object, [random_node], vm_statistics, node_statistics, balancing_method, balancing_mode, ignore_nodes)
processed_vm.append(vm_name)
return node_statistics, vm_statistics
@@ -891,20 +987,33 @@ def main():
validate_daemon(daemon, schedule)
continue
<<<<<<< HEAD
# Validate for node auto update in cluster for rolling updates.
# Note: This requires proxlb-additions with a patched Proxmox API!
#rolling_updates = 1
if bool(int(rolling_updates)):
node_requires_updates = get_node_update_status(api_object)
run_node_update(api_object, node_requires_updates)
node_requires_reboot = get_node_reboot_status()
if node_requires_reboot:
ignore_nodes = extend_ignore_node_list(ignore_nodes)
=======
>>>>>>> main
# Get metric & statistics for vms and nodes.
node_statistics = get_node_statistics(api_object, ignore_nodes)
vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type)
node_statistics = update_node_statistics(node_statistics, vm_statistics)
node_statistics = update_node_statistics(node_statistics, vm_statistics, ignore_nodes)
# Calculate rebalancing of vms.
node_statistics_rebalanced, vm_statistics_rebalanced = balancing_calculations(balancing_method, balancing_mode, balancing_mode_option,
node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[])
node_statistics, vm_statistics, balanciness, ignore_nodes, rebalance=False, processed_vms=[])
# Rebalance vms to new nodes within the cluster.
run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations)
# Validate for any errors.
post_validations()
post_validations(api_object, node_requires_reboot)
# Validate daemon service.
validate_daemon(daemon, schedule)