From 28be8b814675cf56fcbb02b8baa4e74d6f1a9cca Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Sat, 3 Aug 2024 20:09:07 +0200 Subject: [PATCH 1/3] feature: Add option to run migrations in parallel or sequentially Fixes: #41 --- ...n_run_migration_parallel_or_sequential.yml | 2 + .changelogs/1.1.0/release_meta.yml | 1 + README.md | 4 ++ proxlb | 49 ++++++++++++++++--- 4 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 .changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml create mode 100644 .changelogs/1.1.0/release_meta.yml diff --git a/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml b/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml new file mode 100644 index 0000000..f3417e2 --- /dev/null +++ b/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml @@ -0,0 +1,2 @@ +added: + - Add option to run migrations in parallel or sequentially. [#41] diff --git a/.changelogs/1.1.0/release_meta.yml b/.changelogs/1.1.0/release_meta.yml new file mode 100644 index 0000000..c19765d --- /dev/null +++ b/.changelogs/1.1.0/release_meta.yml @@ -0,0 +1 @@ +date: TBD diff --git a/README.md b/README.md index 8034628..14f142b 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ The following options can be set in the `proxlb.conf` file: | mode_option | byte | Rebalance by node's resources in `bytes` or `percent`. (default: bytes) | | type | vm | Rebalance only `vm` (virtual machines), `ct` (containers) or `all` (virtual machines & containers). (default: vm)| | balanciness | 10 | Value of the percentage of lowest and highest resource consumption on nodes may differ before rebalancing. (default: 10) | +| parallel_migrations | 1 | Defines if migrations should be done parallely or sequentially. (default: 1) | | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. 
| | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) | | daemon | 1 | Run as a daemon (1) or one-shot (0). (default: 1) | @@ -133,6 +134,9 @@ type: vm # Rebalancing: node01: 41% memory consumption :: node02: 52% consumption # No rebalancing: node01: 43% memory consumption :: node02: 50% consumption balanciness: 10 +# Enable parallel migrations. If set to 0 it will wait for completed migrations +# before starting next migration. +parallel_migrations: 1 ignore_nodes: dummynode01,dummynode02 ignore_vms: testvm01,testvm02 [service] diff --git a/proxlb b/proxlb index bf0ba39..a18c9db 100755 --- a/proxlb +++ b/proxlb @@ -183,6 +183,7 @@ def initialize_config_options(config_path): balancing_mode_option = config['balancing'].get('mode_option', 'bytes') balancing_type = config['balancing'].get('type', 'vm') balanciness = config['balancing'].get('balanciness', 10) + parallel_migrations = config['balancing'].get('parallel_migrations', 1) ignore_nodes = config['balancing'].get('ignore_nodes', None) ignore_vms = config['balancing'].get('ignore_vms', None) # Service @@ -201,7 +202,7 @@ def initialize_config_options(config_path): logging.info(f'{info_prefix} Configuration file loaded.') return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \ - balancing_mode_option, balancing_type, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity + balancing_mode_option, balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v): @@ -703,7 +704,31 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho return node_statistics, vm_statistics -def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): +def __wait_job_finalized(api_object, node_name, job_id, 
counter): + """ Wait for a job to be finalized. """ + error_prefix = 'Error: [job-status-getter]:' + info_prefix = 'Info: [job-status-getter]:' + + logging.info(f'{info_prefix} Getting job status for job {job_id}.') + task = api_object.nodes(node_name).tasks(job_id).status().get() + logging.info(f'{info_prefix} {task}') + + if task['status'] == 'running': + logging.info(f'{info_prefix} Validating job {job_id} for the {counter} run.') + + # Do not let this recursion run forever; fail when reaching the limit. + if counter == 300: + logging.critical(f'{error_prefix} The job {job_id} on node {node_name} did not finish in time for migration.') + + time.sleep(5) + counter = counter + 1 + logging.info(f'{info_prefix} Revalidating job {job_id} in the next run.') + __wait_job_finalized(api_object, node_name, job_id, counter) + + logging.info(f'{info_prefix} Job {job_id} for migration from {node_name} terminated successfully.') + + +def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations): """ Run & execute the VM rebalancing via API. """ error_prefix = 'Error: [rebalancing-executor]:' info_prefix = 'Info: [rebalancing-executor]:' @@ -715,15 +740,23 @@ def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): # Migrate type VM (live migration). if value['type'] == 'vm': logging.info(f'{info_prefix} Rebalancing VM {vm} from node {value["node_parent"]} to node {value["node_rebalance"]}.') - api_object.nodes(value['node_parent']).qemu(value['vmid']).migrate().post(target=value['node_rebalance'],online=1) + job_id = api_object.nodes(value['node_parent']).qemu(value['vmid']).migrate().post(target=value['node_rebalance'],online=1) # Migrate type CT (requires restart of container). 
if value['type'] == 'ct': logging.info(f'{info_prefix} Rebalancing CT {vm} from node {value["node_parent"]} to node {value["node_rebalance"]}.') - api_object.nodes(value['node_parent']).lxc(value['vmid']).migrate().post(target=value['node_rebalance'],restart=1) + job_id = api_object.nodes(value['node_parent']).lxc(value['vmid']).migrate().post(target=value['node_rebalance'],restart=1) except proxmoxer.core.ResourceException as error_resource: logging.critical(f'{error_prefix} {error_resource}') + + # Wait for migration to be finished unless running parallel migrations. + if not bool(int(parallel_migrations)): + logging.info(f'{info_prefix} Rebalancing will be performed sequentially.') + __wait_job_finalized(api_object, value['node_parent'], job_id, counter=1) + else: + logging.info(f'{info_prefix} Rebalancing will be performed in parallel.') + else: logging.info(f'{info_prefix} No rebalancing needed.') @@ -784,9 +817,9 @@ def __print_table_cli(table, dry_run=False): logging.info(f'{info_prefix} {row_format.format(*row)}') -def run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): +def run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations): """ Run rebalancing of vms to new nodes in cluster. """ - __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args) + __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations) __create_json_output(vm_statistics_rebalanced, app_args) __create_cli_output(vm_statistics_rebalanced, app_args) @@ -801,7 +834,7 @@ def main(): # Parse global config. 
proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, balancing_type, \ - balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) + balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) # Overwrite logging handler with user defined log verbosity. initialize_logger(log_verbosity, update_log_verbosity=True) @@ -820,7 +853,7 @@ def main(): node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[]) # Rebalance vms to new nodes within the cluster. - run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args) + run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations) # Validate for any errors. post_validations() From 3a2c16b137f936f9b209566599a21d026e41032d Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Sun, 4 Aug 2024 09:45:45 +0200 Subject: [PATCH 2/3] feature: Add option to run ProxLB only on the Proxmox's master node in the cluster. Fixes: #40 --- ...ion_to_run_only_on_cluster_master_node.yml | 2 + README.md | 5 ++ proxlb | 56 +++++++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 .changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml diff --git a/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml b/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml new file mode 100644 index 0000000..4320350 --- /dev/null +++ b/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml @@ -0,0 +1,2 @@ +added: + - Add option to run ProxLB only on the Proxmox's master node in the cluster. 
[#40] diff --git a/README.md b/README.md index 14f142b..0dbf341 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ The following options can be set in the `proxlb.conf` file: | parallel_migrations | 1 | Defines if migrations should be done parallely or sequentially. (default: 1) | | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. | | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) | +| master_only | 0 | Defines if this should only be performed (1) on the cluster master node or not (0). (default: 0) | | daemon | 1 | Run as a daemon (1) or one-shot (0). (default: 1) | | schedule | 24 | Hours to rebalance in hours. (default: 24) | | log_verbosity | INFO | Defines the log level (default: CRITICAL) where you can use `INFO`, `WARN` or `CRITICAL` | @@ -140,6 +141,10 @@ parallel_migrations: 1 ignore_nodes: dummynode01,dummynode02 ignore_vms: testvm01,testvm02 [service] +# The master_only option might be useful if running ProxLB on all nodes in a cluster +# but only a single one should do the balancing. The master node is obtained from the Proxmox +# HA status. 
+master_only: 0 daemon: 1 ``` diff --git a/proxlb b/proxlb index a18c9db..381c60c 100755 --- a/proxlb +++ b/proxlb @@ -33,6 +33,7 @@ except ImportError: import random import re import requests +import socket import sys import time import urllib3 @@ -40,7 +41,7 @@ import urllib3 # Constants __appname__ = "ProxLB" -__version__ = "1.0.0" +__version__ = "1.1.0b" __author__ = "Florian Paul Azim Hoberg @gyptazy" __errors__ = False @@ -187,6 +188,7 @@ def initialize_config_options(config_path): ignore_nodes = config['balancing'].get('ignore_nodes', None) ignore_vms = config['balancing'].get('ignore_vms', None) # Service + master_only = config['service'].get('master_only', 0) daemon = config['service'].get('daemon', 1) schedule = config['service'].get('schedule', 24) log_verbosity = config['service'].get('log_verbosity', 'CRITICAL') @@ -201,8 +203,8 @@ def initialize_config_options(config_path): sys.exit(2) logging.info(f'{info_prefix} Configuration file loaded.') - return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \ - balancing_mode_option, balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity + return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, \ + balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, master_only, daemon, schedule, log_verbosity def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v): @@ -232,6 +234,42 @@ def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_ap return api_object +def get_cluster_master(api_object): + """ Get the current master of the Proxmox cluster. 
""" + error_prefix = 'Error: [cluster-master-getter]:' + info_prefix = 'Info: [cluster-master-getter]:' + + logging.info(f'{info_prefix} Getting master node from cluster.') + try: + ha_status_object = api_object.cluster().ha().status().manager_status().get() + logging.info(f'{info_prefix} Master node: {ha_status_object["manager_status"]["master_node"]}') + except urllib3.exceptions.NameResolutionError: + logging.critical(f'{error_prefix} Could not resolve the API.') + sys.exit(2) + except requests.exceptions.ConnectTimeout: + logging.critical(f'{error_prefix} Connection time out to API.') + sys.exit(2) + except requests.exceptions.SSLError: + logging.critical(f'{error_prefix} SSL certificate verification failed for API.') + sys.exit(2) + + return ha_status_object['manager_status']['master_node'] + + +def validate_cluster_master(cluster_master): + """ Validate if the current execution node is the cluster master. """ + info_prefix = 'Info: [cluster-master-validator]:' + + node_executor_hostname = socket.gethostname() + logging.info(f'{info_prefix} Node executor hostname is: {node_executor_hostname}') + + if node_executor_hostname != cluster_master: + logging.info(f'{info_prefix} {node_executor_hostname} is not the cluster master ({cluster_master}).') + return False + else: + return True + + def get_node_statistics(api_object, ignore_nodes): """ Get statistics of cpu, memory and disk for each node in the cluster. """ info_prefix = 'Info: [node-statistics]:' @@ -834,7 +872,7 @@ def main(): # Parse global config. 
proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, balancing_type, \ - balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) + balanciness, parallel_migrations, ignore_nodes, ignore_vms, master_only, daemon, schedule, log_verbosity = initialize_config_options(config_path) # Overwrite logging handler with user defined log verbosity. initialize_logger(log_verbosity, update_log_verbosity=True) @@ -843,6 +881,16 @@ def main(): # API Authentication. api_object = api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v) + # Get master node of cluster and ensure that ProxLB is only performed on the + # cluster master node to avoid ongoing rebalancing. + if bool(int(master_only)): + cluster_master_node = get_cluster_master(api_object) + cluster_master = validate_cluster_master(cluster_master_node) + # Validate daemon service and skip following tasks when not being the cluster master. + if not cluster_master: + validate_daemon(daemon, schedule) + continue + # Get metric & statistics for vms and nodes. node_statistics = get_node_statistics(api_object, ignore_nodes) vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type) From 37e7a601be308f8c9b059e206cc49aaf83182058 Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Tue, 6 Aug 2024 18:06:05 +0200 Subject: [PATCH 3/3] fix: Fix daemon timer to use hours instead of minutes. 
Reported by: @mater-345 Fixes: #45 --- .changelogs/1.1.0/45_fix_daemon_timer.yml | 2 ++ proxlb | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .changelogs/1.1.0/45_fix_daemon_timer.yml diff --git a/.changelogs/1.1.0/45_fix_daemon_timer.yml b/.changelogs/1.1.0/45_fix_daemon_timer.yml new file mode 100644 index 0000000..b6a4dd5 --- /dev/null +++ b/.changelogs/1.1.0/45_fix_daemon_timer.yml @@ -0,0 +1,2 @@ +changed: + - Fix daemon timer to use hours instead of minutes. [#45] diff --git a/proxlb b/proxlb index 381c60c..fb92547 100755 --- a/proxlb +++ b/proxlb @@ -113,7 +113,7 @@ def validate_daemon(daemon, schedule): if bool(int(daemon)): logging.info(f'{info_prefix} Running in daemon mode. Next run in {schedule} hours.') - time.sleep(int(schedule) * 60) + time.sleep(int(schedule) * 60 * 60) else: logging.info(f'{info_prefix} Not running in daemon mode. Quitting.') sys.exit(0)