From 28be8b814675cf56fcbb02b8baa4e74d6f1a9cca Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Sat, 3 Aug 2024 20:09:07 +0200 Subject: [PATCH 1/3] feature: Add option to run migrations in parallel or sequentially Fixes: #41 --- ...n_run_migration_parallel_or_sequential.yml | 2 + .changelogs/1.1.0/release_meta.yml | 1 + README.md | 4 ++ proxlb | 49 ++++++++++++++++--- 4 files changed, 48 insertions(+), 8 deletions(-) create mode 100644 .changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml create mode 100644 .changelogs/1.1.0/release_meta.yml diff --git a/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml b/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml new file mode 100644 index 0000000..f3417e2 --- /dev/null +++ b/.changelogs/1.1.0/41_add_option_run_migration_parallel_or_sequential.yml @@ -0,0 +1,2 @@ +added: + - Add option to run migrations in parallel or sequentially. [#41] diff --git a/.changelogs/1.1.0/release_meta.yml b/.changelogs/1.1.0/release_meta.yml new file mode 100644 index 0000000..c19765d --- /dev/null +++ b/.changelogs/1.1.0/release_meta.yml @@ -0,0 +1 @@ +date: TBD diff --git a/README.md b/README.md index 8034628..14f142b 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ The following options can be set in the `proxlb.conf` file: | mode_option | byte | Rebalance by node's resources in `bytes` or `percent`. (default: bytes) | | type | vm | Rebalance only `vm` (virtual machines), `ct` (containers) or `all` (virtual machines & containers). (default: vm)| | balanciness | 10 | Value of the percentage of lowest and highest resource consumption on nodes may differ before rebalancing. (default: 10) | +| parallel_migrations | 1 | Defines if migrations should be done parallely or sequentially. (default: 1) | | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. 
| | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) | | daemon | 1 | Run as a daemon (1) or one-shot (0). (default: 1) | @@ -133,6 +134,9 @@ type: vm # Rebalancing: node01: 41% memory consumption :: node02: 52% consumption # No rebalancing: node01: 43% memory consumption :: node02: 50% consumption balanciness: 10 +# Enable parallel migrations. If set to 0 it will wait for completed migrations +# before starting next migration. +parallel_migrations: 1 ignore_nodes: dummynode01,dummynode02 ignore_vms: testvm01,testvm02 [service] diff --git a/proxlb b/proxlb index bf0ba39..a18c9db 100755 --- a/proxlb +++ b/proxlb @@ -183,6 +183,7 @@ def initialize_config_options(config_path): balancing_mode_option = config['balancing'].get('mode_option', 'bytes') balancing_type = config['balancing'].get('type', 'vm') balanciness = config['balancing'].get('balanciness', 10) + parallel_migrations = config['balancing'].get('parallel_migrations', 1) ignore_nodes = config['balancing'].get('ignore_nodes', None) ignore_vms = config['balancing'].get('ignore_vms', None) # Service @@ -201,7 +202,7 @@ def initialize_config_options(config_path): logging.info(f'{info_prefix} Configuration file loaded.') return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \ - balancing_mode_option, balancing_type, balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity + balancing_mode_option, balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v): @@ -703,7 +704,31 @@ def __get_vm_tags_exclude_groups(vm_statistics, node_statistics, balancing_metho return node_statistics, vm_statistics -def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): +def __wait_job_finalized(api_object, node_name, job_id, 
counter): + """ Wait for a job to be finalized. """ + error_prefix = 'Error: [job-status-getter]:' + info_prefix = 'Info: [job-status-getter]:' + + logging.info(f'{info_prefix} Getting job status for job {job_id}.') + task = api_object.nodes(node_name).tasks(job_id).status().get() + logging.info(f'{info_prefix} {task}') + + if task['status'] == 'running': + logging.info(f'{info_prefix} Validating job {job_id} for the {counter} run.') + + # Do not let this recursion run forever; fail when reaching the limit. + if counter == 300: + logging.critical(f'{error_prefix} The job {job_id} on node {node_name} did not finish in time for migration.') + + time.sleep(5) + counter = counter + 1 + logging.info(f'{info_prefix} Revalidating job {job_id} in the next run.') + __wait_job_finalized(api_object, node_name, job_id, counter) + + logging.info(f'{info_prefix} Job {job_id} for migration from {node_name} terminated successfully.') + + +def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations): """ Run & execute the VM rebalancing via API. """ error_prefix = 'Error: [rebalancing-executor]:' info_prefix = 'Info: [rebalancing-executor]:' @@ -715,15 +740,23 @@ def __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): # Migrate type VM (live migration). if value['type'] == 'vm': logging.info(f'{info_prefix} Rebalancing VM {vm} from node {value["node_parent"]} to node {value["node_rebalance"]}.') - api_object.nodes(value['node_parent']).qemu(value['vmid']).migrate().post(target=value['node_rebalance'],online=1) + job_id = api_object.nodes(value['node_parent']).qemu(value['vmid']).migrate().post(target=value['node_rebalance'],online=1) # Migrate type CT (requires restart of container). 
if value['type'] == 'ct': logging.info(f'{info_prefix} Rebalancing CT {vm} from node {value["node_parent"]} to node {value["node_rebalance"]}.') - api_object.nodes(value['node_parent']).lxc(value['vmid']).migrate().post(target=value['node_rebalance'],restart=1) + job_id = api_object.nodes(value['node_parent']).lxc(value['vmid']).migrate().post(target=value['node_rebalance'],restart=1) except proxmoxer.core.ResourceException as error_resource: logging.critical(f'{error_prefix} {error_resource}') + + # Wait for migration to be finished unless running parallel migrations. + if not bool(int(parallel_migrations)): + logging.info(f'{info_prefix} Rebalancing will be performed sequentially.') + __wait_job_finalized(api_object, value['node_parent'], job_id, counter=1) + else: + logging.info(f'{info_prefix} Rebalancing will be performed in parallel.') + else: logging.info(f'{info_prefix} No rebalancing needed.') @@ -784,9 +817,9 @@ def __print_table_cli(table, dry_run=False): logging.info(f'{info_prefix} {row_format.format(*row)}') -def run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args): +def run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations): """ Run rebalancing of vms to new nodes in cluster. """ - __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args) + __run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations) __create_json_output(vm_statistics_rebalanced, app_args) __create_cli_output(vm_statistics_rebalanced, app_args) @@ -801,7 +834,7 @@ def main(): # Parse global config. 
proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, balancing_type, \ - balanciness, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) + balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) # Overwrite logging handler with user defined log verbosity. initialize_logger(log_verbosity, update_log_verbosity=True) @@ -820,7 +853,7 @@ def main(): node_statistics, vm_statistics, balanciness, rebalance=False, processed_vms=[]) # Rebalance vms to new nodes within the cluster. - run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args) + run_vm_rebalancing(api_object, vm_statistics_rebalanced, app_args, parallel_migrations) # Validate for any errors. post_validations() From 3a2c16b137f936f9b209566599a21d026e41032d Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Sun, 4 Aug 2024 09:45:45 +0200 Subject: [PATCH 2/3] feature: Add option to run ProxLB only on the Proxmox's master node in the cluster. Fixes: #40 --- ...ion_to_run_only_on_cluster_master_node.yml | 2 + README.md | 5 ++ proxlb | 56 +++++++++++++++++-- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 .changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml diff --git a/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml b/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml new file mode 100644 index 0000000..4320350 --- /dev/null +++ b/.changelogs/1.1.0/40_add_option_to_run_only_on_cluster_master_node.yml @@ -0,0 +1,2 @@ +added: + - Add option to run ProxLB only on the Proxmox's master node in the cluster. 
[#40] diff --git a/README.md b/README.md index 14f142b..0dbf341 100644 --- a/README.md +++ b/README.md @@ -112,6 +112,7 @@ The following options can be set in the `proxlb.conf` file: | parallel_migrations | 1 | Defines if migrations should be done parallely or sequentially. (default: 1) | | ignore_nodes | dummynode01,dummynode02,test* | Defines a comma separated list of nodes to exclude. | | ignore_vms | testvm01,testvm02 | Defines a comma separated list of VMs to exclude. (`*` as suffix wildcard or tags are also supported) | +| master_only | 0 | Defines if this should only be performed (1) on the cluster master node or not (0). (default: 0) | | daemon | 1 | Run as a daemon (1) or one-shot (0). (default: 1) | | schedule | 24 | Hours to rebalance in hours. (default: 24) | | log_verbosity | INFO | Defines the log level (default: CRITICAL) where you can use `INFO`, `WARN` or `CRITICAL` | @@ -140,6 +141,10 @@ parallel_migrations: 1 ignore_nodes: dummynode01,dummynode02 ignore_vms: testvm01,testvm02 [service] +# The master_only option might be useful if running ProxLB on all nodes in a cluster +# but only a single one should do the balancing. The master node is obtained from the Proxmox +# HA status. 
+master_only: 0 daemon: 1 ``` diff --git a/proxlb b/proxlb index a18c9db..381c60c 100755 --- a/proxlb +++ b/proxlb @@ -33,6 +33,7 @@ except ImportError: import random import re import requests +import socket import sys import time import urllib3 @@ -40,7 +41,7 @@ import urllib3 # Constants __appname__ = "ProxLB" -__version__ = "1.0.0" +__version__ = "1.1.0b" __author__ = "Florian Paul Azim Hoberg @gyptazy" __errors__ = False @@ -187,6 +188,7 @@ def initialize_config_options(config_path): ignore_nodes = config['balancing'].get('ignore_nodes', None) ignore_vms = config['balancing'].get('ignore_vms', None) # Service + master_only = config['service'].get('master_only', 0) daemon = config['service'].get('daemon', 1) schedule = config['service'].get('schedule', 24) log_verbosity = config['service'].get('log_verbosity', 'CRITICAL') @@ -201,8 +203,8 @@ def initialize_config_options(config_path): sys.exit(2) logging.info(f'{info_prefix} Configuration file loaded.') - return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, \ - balancing_mode_option, balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity + return proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, \ + balancing_type, balanciness, parallel_migrations, ignore_nodes, ignore_vms, master_only, daemon, schedule, log_verbosity def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v): @@ -232,6 +234,42 @@ def api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_ap return api_object +def get_cluster_master(api_object): + """ Get the current master of the Proxmox cluster. 
""" + error_prefix = 'Error: [cluster-master-getter]:' + info_prefix = 'Info: [cluster-master-getter]:' + + logging.info(f'{info_prefix} Getting master node from cluster.') + try: + ha_status_object = api_object.cluster().ha().status().manager_status().get() + logging.info(f'{info_prefix} Master node: {ha_status_object["manager_status"]["master_node"]}') + except urllib3.exceptions.NameResolutionError: + logging.critical(f'{error_prefix} Could not resolve the API.') + sys.exit(2) + except requests.exceptions.ConnectTimeout: + logging.critical(f'{error_prefix} Connection time out to API.') + sys.exit(2) + except requests.exceptions.SSLError: + logging.critical(f'{error_prefix} SSL certificate verification failed for API.') + sys.exit(2) + + return ha_status_object['manager_status']['master_node'] + + +def validate_cluster_master(cluster_master): + """ Validate if the current execution node is the cluster master. """ + info_prefix = 'Info: [cluster-master-validator]:' + + node_executor_hostname = socket.gethostname() + logging.info(f'{info_prefix} Node executor hostname is: {node_executor_hostname}') + + if node_executor_hostname != cluster_master: + logging.info(f'{info_prefix} {node_executor_hostname} is not the cluster master ({cluster_master}).') + return False + else: + return True + + def get_node_statistics(api_object, ignore_nodes): """ Get statistics of cpu, memory and disk for each node in the cluster. """ info_prefix = 'Info: [node-statistics]:' @@ -834,7 +872,7 @@ def main(): # Parse global config. 
proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v, balancing_method, balancing_mode, balancing_mode_option, balancing_type, \ - balanciness, parallel_migrations, ignore_nodes, ignore_vms, daemon, schedule, log_verbosity = initialize_config_options(config_path) + balanciness, parallel_migrations, ignore_nodes, ignore_vms, master_only, daemon, schedule, log_verbosity = initialize_config_options(config_path) # Overwrite logging handler with user defined log verbosity. initialize_logger(log_verbosity, update_log_verbosity=True) @@ -843,6 +881,16 @@ def main(): # API Authentication. api_object = api_connect(proxmox_api_host, proxmox_api_user, proxmox_api_pass, proxmox_api_ssl_v) + # Get master node of cluster and ensure that ProxLB is only performed on the + # cluster master node to avoid ongoing rebalancing. + if bool(int(master_only)): + cluster_master_node = get_cluster_master(api_object) + cluster_master = validate_cluster_master(cluster_master_node) + # Validate daemon service and skip following tasks when not being the cluster master. + if not cluster_master: + validate_daemon(daemon, schedule) + continue + # Get metric & statistics for vms and nodes. node_statistics = get_node_statistics(api_object, ignore_nodes) vm_statistics = get_vm_statistics(api_object, ignore_vms, balancing_type) From 37e7a601be308f8c9b059e206cc49aaf83182058 Mon Sep 17 00:00:00 2001 From: Florian Paul Azim Hoberg Date: Tue, 6 Aug 2024 18:06:05 +0200 Subject: [PATCH 3/3] fix: Fix daemon timer to use hours instead of minutes. 
Reported by: @mater-345 Fixes: #45 --- .changelogs/1.1.0/45_fix_daemon_timer.yml | 2 ++ proxlb | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 .changelogs/1.1.0/45_fix_daemon_timer.yml diff --git a/.changelogs/1.1.0/45_fix_daemon_timer.yml b/.changelogs/1.1.0/45_fix_daemon_timer.yml new file mode 100644 index 0000000..b6a4dd5 --- /dev/null +++ b/.changelogs/1.1.0/45_fix_daemon_timer.yml @@ -0,0 +1,2 @@ +changed: + - Fix daemon timer to use hours instead of minutes. [#45] diff --git a/proxlb b/proxlb index 381c60c..fb92547 100755 --- a/proxlb +++ b/proxlb @@ -113,7 +113,7 @@ def validate_daemon(daemon, schedule): if bool(int(daemon)): logging.info(f'{info_prefix} Running in daemon mode. Next run in {schedule} hours.') - time.sleep(int(schedule) * 60) + time.sleep(int(schedule) * 60 * 60) else: logging.info(f'{info_prefix} Not running in daemon mode. Quitting.') sys.exit(0)