mirror of
https://github.com/gyptazy/ProxLB.git
synced 2026-04-06 04:41:58 +02:00
Compare commits
51 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
71d373eedb | ||
|
|
040eeb9f13 | ||
|
|
4ef1e92aad | ||
|
|
7e5fe13dfe | ||
|
|
66c2ab6570 | ||
|
|
ba63514896 | ||
|
|
571025a8a6 | ||
|
|
dd13181cf9 | ||
|
|
37d19a6a2d | ||
|
|
fe333749ce | ||
|
|
8f9bcfcdcf | ||
|
|
ff5fd2f7f1 | ||
|
|
1f6576ecd6 | ||
|
|
46bbe01141 | ||
|
|
07ed12fcb7 | ||
|
|
546fbc7d73 | ||
|
|
15436c431f | ||
|
|
33f6ff8db0 | ||
|
|
84628f232e | ||
|
|
6a91afd405 | ||
|
|
909643a09f | ||
|
|
7de1ba366b | ||
|
|
0cb19fab34 | ||
|
|
972b10b7e5 | ||
|
|
7fa110e465 | ||
|
|
948df0316b | ||
|
|
016378e37c | ||
|
|
8a193b9891 | ||
|
|
30e3b66be9 | ||
|
|
b9be405194 | ||
|
|
ac108f2abe | ||
|
|
02b43d3ef7 | ||
|
|
581d6d480b | ||
|
|
5b395b7f15 | ||
|
|
7d94c52883 | ||
|
|
7d19788be1 | ||
|
|
0bbc5992ca | ||
|
|
a4a5d9e68a | ||
|
|
af98ee8d5b | ||
|
|
afc93f7b21 | ||
|
|
bc6d8c8509 | ||
|
|
6d50f32486 | ||
|
|
5fe49a9dc1 | ||
|
|
fca1d1211c | ||
|
|
36388d9429 | ||
|
|
3f424e9e6d | ||
|
|
44a733aed3 | ||
|
|
2f44ff48a0 | ||
|
|
7b6db9cfdd | ||
|
|
8c473b416c | ||
|
|
51c8afe5c5 |
2
.changelogs/1.1.10/335-prevalidate-affinity-matrix.yml
Normal file
2
.changelogs/1.1.10/335-prevalidate-affinity-matrix.yml
Normal file
@@ -0,0 +1,2 @@
|
||||
added:
|
||||
- Prevent redundant rebalancing by validating existing affinity enforcement before taking actions (@gyptazy). [#335]
|
||||
@@ -0,0 +1,2 @@
|
||||
added:
|
||||
- Add safety-guard for PVE 8 users when activating conntrack-aware migrations mistakenly (@gyptazy). [#359]
|
||||
@@ -0,0 +1,3 @@
|
||||
fixed:
|
||||
- Fixed the Proxmox API connection validation which returned a false-positive logging message of timeouts (@gyptazy). [#361]
|
||||
- Refactored Proxmox API connection functions
|
||||
@@ -0,0 +1,2 @@
|
||||
fixed:
|
||||
- Fixed a crash during PVE resource pool enumeration by skipping members not having a 'name' property (@stefanoettl). [#368]
|
||||
1
.changelogs/1.1.10/release_meta.yml
Normal file
1
.changelogs/1.1.10/release_meta.yml
Normal file
@@ -0,0 +1 @@
|
||||
date: 2025-11-25
|
||||
3
.changelogs/1.1.8/317_container_image_non_root.yml
Normal file
3
.changelogs/1.1.8/317_container_image_non_root.yml
Normal file
@@ -0,0 +1,3 @@
|
||||
changed:
|
||||
- Container image does not run as root anymore (@mikaelkrantz945). [#317]
|
||||
- Container image uses venv for running ProxLB (@mikaelkrantz945). [#317]
|
||||
@@ -0,0 +1,2 @@
|
||||
fixed:
|
||||
- Fix API errors when using conntrack aware migration with older PVE versions (@gyptazy). [#318]
|
||||
2
.changelogs/1.1.8/329_add_log_prefix.yml
Normal file
2
.changelogs/1.1.8/329_add_log_prefix.yml
Normal file
@@ -0,0 +1,2 @@
|
||||
fixed:
|
||||
- Add a static ProxLB prefix to the log output when used by journal handler (@gyptazy). [#329]
|
||||
1
.changelogs/1.1.8/release_meta.yml
Normal file
1
.changelogs/1.1.8/release_meta.yml
Normal file
@@ -0,0 +1 @@
|
||||
date: 2025-10-09
|
||||
5
.changelogs/1.1.9/337_add_pressure_based_balancing.yml
Normal file
5
.changelogs/1.1.9/337_add_pressure_based_balancing.yml
Normal file
@@ -0,0 +1,5 @@
|
||||
added:
|
||||
- Add pressure (PSI) based balancing for memory, cpu, disk (req. PVE9 or greater) (@gyptazy). [#337]
|
||||
- Pressure (PSI) based balancing for nodes
|
||||
- Pressure (PSI) based balancing for guests
|
||||
- Add PVE version evaluation
|
||||
2
.changelogs/1.1.9/342_add_memory_balancing_threshold.yml
Normal file
2
.changelogs/1.1.9/342_add_memory_balancing_threshold.yml
Normal file
@@ -0,0 +1,2 @@
|
||||
added:
|
||||
- Add an optional memory balancing threshold (@gyptazy). [#342]
|
||||
@@ -0,0 +1,2 @@
|
||||
added:
|
||||
- Add affinity/anti-affinity support by pools (@gyptazy). [#343]
|
||||
1
.changelogs/1.1.9/release_meta.yml
Normal file
1
.changelogs/1.1.9/release_meta.yml
Normal file
@@ -0,0 +1 @@
|
||||
date: 2025-10-30
|
||||
@@ -58,6 +58,10 @@ jobs:
|
||||
integration-test-debian:
|
||||
needs: build-package-debian
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
debian_version: [bookworm, trixie]
|
||||
name: Integration Test on Debian ${{ matrix.debian_version }}
|
||||
steps:
|
||||
- name: Download Debian package artifact
|
||||
uses: actions/download-artifact@v4
|
||||
@@ -66,13 +70,18 @@ jobs:
|
||||
path: package/
|
||||
|
||||
- name: Set up Docker with Debian image
|
||||
run: docker pull debian:latest
|
||||
run: docker pull debian:${{ matrix.debian_version }}
|
||||
|
||||
- name: Install and test Debian package in Docker container
|
||||
run: |
|
||||
docker run --rm -v $(pwd)/package:/package -w /package debian:latest bash -c "
|
||||
apt-get update && \
|
||||
apt-get install -y systemd && \
|
||||
apt-get install -y ./proxlb*.deb && \
|
||||
python3 -c 'import proxlb; print(\"OK: Debian package successfully installed.\")'
|
||||
"
|
||||
docker run --rm \
|
||||
-v "$(pwd)/package:/package" \
|
||||
-w /package \
|
||||
debian:${{ matrix.debian_version }} \
|
||||
bash -c "
|
||||
set -e
|
||||
apt-get update
|
||||
apt-get install -y python3 systemd
|
||||
apt-get install -y ./proxlb*.deb
|
||||
python3 -c 'import proxlb; print(\"OK: Debian package successfully installed on ${{ matrix.debian_version }}.\")'
|
||||
"
|
||||
|
||||
40
CHANGELOG.md
40
CHANGELOG.md
@@ -5,6 +5,46 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [1.1.10] - 2025-11-25
|
||||
|
||||
### Added
|
||||
|
||||
- Prevent redundant rebalancing by validating existing affinity enforcement before taking actions (@gyptazy). [#335]
|
||||
- Add safety-guard for PVE 8 users when activating conntrack-aware migrations mistakenly (@gyptazy). [#359]
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix the Proxmox API connection validation which returned a false-positive logging message of timeouts (@gyptazy). [#361]
|
||||
- Refactored Proxmox API connection functions (@gyptazy). [#361]
|
||||
- Fix a crash during PVE resource pool enumeration by skipping members not having a 'name' property (@stefanoettl). [#368]
|
||||
|
||||
## [1.1.9.1] - 2025-10-30
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix quoting in f-strings which may cause issues on PVE 8 / Debian Bookworm systems (@gyptazy). [#352]
|
||||
|
||||
## [1.1.9] - 2025-10-30
|
||||
|
||||
### Added
|
||||
|
||||
- Add an optional memory balancing threshold (@gyptazy). [#342]
|
||||
- Add affinity/anti-affinity support by pools (@gyptazy). [#343]
|
||||
- Add pressure (PSI) based balancing for memory, cpu, disk (req. PVE9 or greater) (@gyptazy). [#337]
|
||||
- Pressure (PSI) based balancing for nodes
|
||||
- Pressure (PSI) based balancing for guests
|
||||
- Add PVE version evaluation
|
||||
|
||||
## [1.1.8] - 2025-10-09
|
||||
|
||||
### Fixed
|
||||
|
||||
- Fix API errors when using conntrack aware migration with older PVE versions (@gyptazy). [#318]
|
||||
- Add a static ProxLB prefix to the log output when used by journal handler (@gyptazy). [#329]
|
||||
|
||||
### Changed
|
||||
- Container image does not run as root anymore (@mikaelkrantz945). [#317]
|
||||
- Container image uses venv for running ProxLB (@mikaelkrantz945). [#317]
|
||||
|
||||
## [1.1.7] - 2025-09-19
|
||||
|
||||
|
||||
35
Dockerfile
35
Dockerfile
@@ -9,20 +9,33 @@ LABEL org.label-schema.vendor="gyptazy"
|
||||
LABEL org.label-schema.url="https://proxlb.de"
|
||||
LABEL org.label-schema.vcs-url="https://github.com/gyptazy/ProxLB"
|
||||
|
||||
# Install Python3
|
||||
RUN apk add --no-cache python3 py3-pip
|
||||
# --- Step 1 (root): system deps, user, dirs ---
|
||||
RUN apk add --no-cache python3 py3-pip \
|
||||
&& addgroup -S plb \
|
||||
&& adduser -S -G plb -h /home/plb plb \
|
||||
&& mkdir -p /app/conf /opt/venv \
|
||||
&& chown -R plb:plb /app /home/plb /opt/venv
|
||||
|
||||
# Create a directory for the app
|
||||
WORKDIR /app
|
||||
|
||||
# Copy the python program from the current directory to /app
|
||||
COPY proxlb /app/proxlb
|
||||
# Copy only requirements first for better layer caching
|
||||
COPY --chown=plb:plb requirements.txt /app/requirements.txt
|
||||
|
||||
# Copy requirements to the container
|
||||
COPY requirements.txt /app/requirements.txt
|
||||
# --- Step 2 (appuser): venv + deps + code ---
|
||||
USER plb
|
||||
|
||||
# Install dependencies in the virtual environment
|
||||
RUN pip install --break-system-packages -r /app/requirements.txt
|
||||
# Create venv owned by appuser and put it on PATH
|
||||
RUN python3 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:${PATH}"
|
||||
|
||||
# Set the entry point to use the virtual environment's python
|
||||
ENTRYPOINT ["/usr/bin/python3", "/app/proxlb/main.py"]
|
||||
# Install Python dependencies into the venv (no PEP 668 issues)
|
||||
RUN pip install --no-cache-dir -r /app/requirements.txt
|
||||
|
||||
# Copy application code (owned by appuser)
|
||||
COPY --chown=plb:plb proxlb /app/proxlb
|
||||
|
||||
# Optional: placeholder config so a bind-mount can override cleanly
|
||||
RUN touch /app/conf/proxlb.yaml
|
||||
|
||||
# Run as non-root using venv Python
|
||||
ENTRYPOINT ["/opt/venv/bin/python", "/app/proxlb/main.py"]
|
||||
|
||||
153
README.md
153
README.md
@@ -8,31 +8,32 @@
|
||||
1. [Introduction](#introduction)
|
||||
2. [Features](#features)
|
||||
3. [How does it work?](#how-does-it-work)
|
||||
4. [Installation](#installation)
|
||||
4. [Documentation](#documentation)
|
||||
5. [Installation](#installation)
|
||||
1. [Requirements / Dependencies](#requirements--dependencies)
|
||||
2. [Debian Package](#debian-package)
|
||||
4. [Container / Docker](#container--docker)
|
||||
5. [Source](#source)
|
||||
5. [Usage / Configuration](#usage--configuration)
|
||||
6. [Usage / Configuration](#usage--configuration)
|
||||
1. [GUI Integration](#gui-integration)
|
||||
2. [Proxmox HA Integration](#proxmox-ha-integration)
|
||||
3. [Options](#options)
|
||||
6. [Affinity & Anti-Affinity Rules](#affinity--anti-affinity-rules)
|
||||
7. [Affinity & Anti-Affinity Rules](#affinity--anti-affinity-rules)
|
||||
1. [Affinity Rules](#affinity-rules)
|
||||
2. [Anti-Affinity Rules](#anti-affinity-rules)
|
||||
3. [Ignore VMs](#ignore-vms)
|
||||
4. [Pin VMs to Hypervisor Nodes](#pin-vms-to-hypervisor-nodes)
|
||||
7. [Maintenance](#maintenance)
|
||||
8. [Misc](#misc)
|
||||
8. [Maintenance](#maintenance)
|
||||
9. [Misc](#misc)
|
||||
1. [Bugs](#bugs)
|
||||
2. [Contributing](#contributing)
|
||||
3. [Documentation](#documentation)
|
||||
4. [Support](#support)
|
||||
9. [Author(s)](#authors)
|
||||
3. [Support](#support)
|
||||
4. [Enterprise-Support](#enterprise-support)
|
||||
10. [Author(s)](#authors)
|
||||
|
||||
|
||||
## Introduction
|
||||
ProxLB is an advanced load balancing solution specifically designed for Proxmox clusters, addressing the absence of a Dynamic Resource Scheduler (DRS) that is familiar to VMware users. As a third-party solution, ProxLB enhances the management and efficiency of Proxmox clusters by intelligently distributing workloads across available nodes. Workloads can be balanced by different times like the guest's memory, CPU or disk usage or their assignment to avoid overprovisioning and ensuring resources.
|
||||
ProxLB is an advanced load balancing solution specifically designed for Proxmox clusters, addressing the absence of an intelligent and more advanced resource scheduler. As a third-party solution, ProxLB enhances the management and efficiency of Proxmox clusters by intelligently distributing workloads across available nodes. Workloads can be balanced by different metrics, like the guest's memory, CPU or disk usage, or their assignment, to avoid overprovisioning and ensure resource availability.
|
||||
|
||||
One of the key advantages of ProxLB is that it is fully open-source and free, making it accessible for anyone to use, modify, and contribute to. This ensures transparency and fosters community-driven improvements. ProxLB supports filtering and ignoring specific nodes and guests through configuration files and API calls, providing administrators with the flexibility to tailor the load balancing behavior to their specific needs.
|
||||
|
||||
@@ -43,7 +44,7 @@ ProxLB can also return the best next node for guest placement, which can be inte
|
||||
Overall, ProxLB significantly enhances resource management by intelligently distributing workloads, reducing downtime through its maintenance mode, and providing improved flexibility with affinity and anti-affinity rules. Its seamless integration with CI/CD tools and reliance on the Proxmox API make it a robust and secure solution for optimizing Proxmox cluster performance.
|
||||
|
||||
### Video of Migration
|
||||
<img src="https://cdn.gyptazy.com/images/proxlb-rebalancing-demo.gif"/>
|
||||
<img src="https://cdn.gyptazy.com/img/proxlb-rebalancing-demo.gif"/>
|
||||
|
||||
## Features
|
||||
ProxLB's key features include enabling automatic rebalancing of VMs and CTs across a Proxmox cluster based on memory, CPU, and local disk usage while identifying optimal nodes for automation. It supports maintenance mode, affinity rules, and seamless Proxmox API integration with ACL support, offering flexible usage as a one-time operation, a daemon, or through the Proxmox Web GUI.
|
||||
@@ -53,6 +54,10 @@ ProxLB's key features are by enabling automatic rebalancing of VMs and CTs acros
|
||||
* Memory
|
||||
* Disk (only local storage)
|
||||
* CPU
|
||||
* Rebalance by different modes:
|
||||
* Used resources
|
||||
* Assigned resources
|
||||
* PSI (Pressure) of resources
|
||||
* Get best nodes for further automation
|
||||
* Supported Guest Types
|
||||
* VMs
|
||||
@@ -74,6 +79,9 @@ ProxLB is a load-balancing system designed to optimize the distribution of virtu
|
||||
|
||||
Before starting any migrations, ProxLB validates that rebalancing actions are necessary and beneficial. Depending on the selected balancing mode — such as CPU, memory, or disk — it creates a balancing matrix. This matrix sorts the VMs by their maximum used or assigned resources, identifying the VM with the highest usage. ProxLB then places this VM on the node with the most free resources in the selected balancing type. This process runs recursively until the operator-defined Balanciness is achieved. Balancing can be defined for the used or max. assigned resources of VMs/CTs.
|
||||
|
||||
## Documentation
|
||||
This `README.md` doesn't contain all information and only highlights the most important facts. Extended information, such like API permissions, creating dedicated user, best-practices in running ProxLB and much more can be found in the [docs/](https://github.com/gyptazy/ProxLB/tree/main/docs) directory. Please consult the documentation before creating issues.
|
||||
|
||||
## Installation
|
||||
|
||||
### Requirements / Dependencies
|
||||
@@ -134,7 +142,7 @@ wget -O /etc/apt/trusted.gpg.d/proxlb.asc https://repo.gyptazy.com/repository.gp
|
||||
|
||||
#### Debian Packages (.deb files)
|
||||
If you do not want to use the repository you can also find the debian packages as a .deb file on gyptazy's CDN at:
|
||||
* https://cdn.gyptazy.com/debian/
|
||||
* https://cdn.gyptazy.com/debian/proxlb/
|
||||
|
||||
Afterwards, you can simply install the package by running:
|
||||
```bash
|
||||
@@ -165,6 +173,10 @@ docker run -it --rm -v $(pwd)/proxlb.yaml:/etc/proxlb/proxlb.yaml proxlb
|
||||
| Version | Image |
|
||||
|------|:------:|
|
||||
| latest | cr.gyptazy.com/proxlb/proxlb:latest |
|
||||
| v1.1.10 | cr.gyptazy.com/proxlb/proxlb:v1.1.10 |
|
||||
| v1.1.9.1 | cr.gyptazy.com/proxlb/proxlb:v1.1.9.1 |
|
||||
| v1.1.9 | cr.gyptazy.com/proxlb/proxlb:v1.1.9 |
|
||||
| v1.1.8 | cr.gyptazy.com/proxlb/proxlb:v1.1.8 |
|
||||
| v1.1.7 | cr.gyptazy.com/proxlb/proxlb:v1.1.7 |
|
||||
| v1.1.6.1 | cr.gyptazy.com/proxlb/proxlb:v1.1.6.1 |
|
||||
| v1.1.6 | cr.gyptazy.com/proxlb/proxlb:v1.1.6 |
|
||||
@@ -225,7 +237,7 @@ docker run -it --rm -v $(pwd)/proxlb.yaml:/etc/proxlb/proxlb.yaml proxlb
|
||||
Running ProxLB is straightforward and versatile, as it only requires `Python3` and the `proxmoxer` library. This means ProxLB can be executed directly on a Proxmox node or on dedicated systems such as Debian, RedHat, or even FreeBSD, provided that the Proxmox API is accessible from the client running ProxLB. ProxLB can also run inside a Container - Docker or LXC - and is simply up to you.
|
||||
|
||||
### GUI Integration
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-GUI-integration.jpg"/> ProxLB can also be accessed through the Proxmox Web UI by installing the optional `pve-proxmoxlb-service-ui` package, which depends on the proxlb package. For full Web UI integration, this package must be installed on all nodes within the cluster. Once installed, a new menu item - `Rebalancing`, appears in the cluster level under the HA section. Once installed, it offers two key functionalities:
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/rebalance-ui.jpg"/> ProxLB can also be accessed through the Proxmox Web UI by installing the optional `pve-proxmoxlb-service-ui` package, which depends on the proxlb package. For full Web UI integration, this package must be installed on all nodes within the cluster. Once installed, a new menu item - `Rebalancing`, appears in the cluster level under the HA section. Once installed, it offers two key functionalities:
|
||||
* Rebalancing VM workloads
|
||||
* Migrate VM workloads away from a defined node (e.g. maintenance preparation)
|
||||
|
||||
@@ -272,8 +284,11 @@ The following options can be set in the configuration file `proxlb.yaml`:
|
||||
| | balance_types | | ['vm', 'ct'] | `List` | Defined the types of guests that should be honored. [values: `vm`, `ct`]|
|
||||
| | max_job_validation | | 1800 | `Int` | How long a job validation may take in seconds. (default: 1800) |
|
||||
| | balanciness | | 10 | `Int` | The maximum delta of resource usage between node with highest and lowest usage. |
|
||||
| | memory_threshold | | 75 | `Int` | The maximum threshold (in percent) that needs to be hit to perform balancing actions. (Optional) |
|
||||
| | method | | memory | `Str` | The balancing method that should be used. [values: `memory` (default), `cpu`, `disk`]|
|
||||
| | mode | | used | `Str` | The balancing mode that should be used. [values: `used` (default), `assigned`] |
|
||||
| | mode | | used | `Str` | The balancing mode that should be used. [values: `used` (default), `assigned`, `psi` (pressure)] |
|
||||
| | psi | | { nodes: { memory: { pressure_full: 0.20, pressure_some: 0.20, pressure_spikes: 1.00 }}} | `Dict` | A dict of PSI based thresholds for nodes and guests |
|
||||
| | pools | | pools: { dev: { type: affinity }, de-nbg01-db: { type: anti-affinity }} | `Dict` | A dict of pool names and their type for creating affinity/anti-affinity rules |
|
||||
| `service` | | | | | |
|
||||
| | daemon | | True | `Bool` | If daemon mode should be activated. |
|
||||
| | `schedule` | | | `Dict` | Schedule config block for rebalancing. |
|
||||
@@ -315,9 +330,47 @@ balancing:
|
||||
with_conntrack_state: True
|
||||
balance_types: ['vm', 'ct']
|
||||
max_job_validation: 1800
|
||||
memory_threshold: 75
|
||||
balanciness: 5
|
||||
method: memory
|
||||
mode: used
|
||||
# # PSI thresholds only apply when using mode 'psi'
|
||||
# # PSI based balancing is currently in beta and req. PVE >= 9
|
||||
# psi:
|
||||
# nodes:
|
||||
# memory:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# cpu:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# disk:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# guests:
|
||||
# memory:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# cpu:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# disk:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
pools:
|
||||
dev:
|
||||
type: affinity
|
||||
    de-nbg01-db:
|
||||
type: anti-affinity
|
||||
pin:
|
||||
- virt66
|
||||
- virt77
|
||||
|
||||
service:
|
||||
daemon: True
|
||||
@@ -348,19 +401,33 @@ ProxLB provides an advanced mechanism to define affinity and anti-affinity rules
|
||||
ProxLB implements affinity and anti-affinity rules through a tag-based system within the Proxmox web interface. Each guest (virtual machine or container) can be assigned specific tags, which then dictate its placement behavior. This method maintains a streamlined and secure approach to managing VM relationships while preserving Proxmox’s inherent permission model.
|
||||
|
||||
### Affinity Rules
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-affinity-rules.jpg"/> Affinity rules are used to group certain VMs together, ensuring that they run on the same host whenever possible. This can be beneficial for workloads requiring low-latency communication, such as clustered databases or application servers that frequently exchange data.
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-affinity-rules.jpg"/> Affinity rules are used to group certain VMs together, ensuring that they run on the same host whenever possible. This can be beneficial for workloads requiring low-latency communication, such as clustered databases or application servers that frequently exchange data. In general, there're two ways to manage affinity rules:
|
||||
|
||||
#### Affinity Rules by Tags
|
||||
To define an affinity rule which keeps all guests assigned to this tag together on a node, users assign a tag with the prefix `plb_affinity_$TAG`:
|
||||
|
||||
#### Example for Screenshot
|
||||
```
|
||||
plb_affinity_talos
|
||||
```
|
||||
|
||||
As a result, ProxLB will attempt to place all VMs with the `plb_affinity_talos` tag on the same host (see also the attached screenshot with the same node).
|
||||
|
||||
### Anti-Affinity Rules
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-anti-affinity-rules.jpg"/> Conversely, anti-affinity rules ensure that designated VMs do not run on the same physical host. This is particularly useful for high-availability setups, where redundancy is crucial. Ensuring that critical services are distributed across multiple hosts reduces the risk of a single point of failure.
|
||||
#### Affinity Rules by Pools
|
||||
Another approach is by using pools in Proxmox. This way, it can also easily be combined with other resources like backup jobs. However, in this approach you need to modify the ProxLB config file to your needs. Within the `balancing` section you can create a dict of pools, including the pool name and the affinity type. Please see the example for further details:
|
||||
|
||||
**Example Config**
|
||||
```
|
||||
balancing:
|
||||
[...]
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
dev: # Pool name: dev
|
||||
type: affinity # Type: affinity (keeping VMs together)
|
||||
pin: # Pin VMs to Nodes
|
||||
- virt77 # Pinning to 'virt77' which is maybe an older system for dev labs
|
||||
```
|
||||
|
||||
### Anti-Affinity Rules by Tags
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-anti-affinity-rules.jpg"/> Conversely, anti-affinity rules ensure that designated VMs do not run on the same physical host. This is particularly useful for high-availability setups, where redundancy is crucial. Ensuring that critical services are distributed across multiple hosts reduces the risk of a single point of failure. In general, there're two ways to manage anti-affinity rules:
|
||||
|
||||
To define an anti-affinity rule that ensures to not move systems within this group to the same node, users assign a tag with the prefix:
|
||||
|
||||
@@ -371,10 +438,23 @@ plb_anti_affinity_ntp
|
||||
|
||||
As a result, ProxLB will try to place the VMs with the `plb_anti_affinity_ntp` tag on different hosts (see also the attached screenshot with the different nodes).
|
||||
|
||||
#### Anti-Affinity Rules by Pools
|
||||
Another approach is by using pools in Proxmox. This way, it can also easily be combined with other resources like backup jobs. However, in this approach you need to modify the ProxLB config file to your needs. Within the `balancing` section you can create a dict of pools, including the pool name and the affinity type. Please see the example for further details:
|
||||
|
||||
**Example Config**
|
||||
```
|
||||
balancing:
|
||||
[...]
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
de-nbg01-db: # Pool name: de-nbg01-db
|
||||
type: anti-affinity # Type: anti-affinity (spreading VMs apart)
|
||||
```
|
||||
|
||||
|
||||
**Note:** While this ensures that ProxLB tries to distribute these VMs across different physical hosts within the Proxmox cluster, this may not always work. If you have more guests attached to the group than nodes in the cluster, we still need to run them anywhere. If this case occurs, the next one with the most free resources will be selected.
|
||||
|
||||
### Ignore VMs
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-ignore-vm-movement.jpg"/> Guests, such as VMs or CTs, can also be completely ignored. This means, they won't be affected by any migration (even when (anti-)affinity rules are enforced). To ensure a proper resource evaluation, these guests are still collected and evaluated but simply skipped for balancing actions. Another thing is the implementation. While ProxLB might have a very restricted configuration file including the file permissions, this file is only read- and writeable by the Proxmox administrators. However, we might have user and groups who want to define on their own that their systems shouldn't be moved. Therefore, these users can simpy set a specific tag to the guest object - just like the (anti)affinity rules.
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-ignore-vm-movement.jpg"/> Guests, such as VMs or CTs, can also be completely ignored. This means, they won't be affected by any migration (even when (anti-)affinity rules are enforced). To ensure a proper resource evaluation, these guests are still collected and evaluated but simply skipped for balancing actions. Another thing is the implementation. While ProxLB might have a very restricted configuration file including the file permissions, this file is only read- and writeable by the Proxmox administrators. However, we might have users and groups who want to define on their own that their systems shouldn't be moved. Therefore, these users can simply set a specific tag to the guest object - just like the (anti)affinity rules.
|
||||
|
||||
To define a guest to be ignored from the balancing, users assign a tag with the prefix `plb_ignore_$TAG`:
|
||||
|
||||
@@ -388,8 +468,9 @@ As a result, ProxLB will not migrate this guest with the `plb_ignore_dev` tag to
|
||||
**Note:** Ignored guests are really ignored. Even by enforcing affinity rules this guest will be ignored.
|
||||
|
||||
### Pin VMs to Specific Hypervisor Nodes
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-tag-node-pinning.jpg"/> Guests, such as VMs or CTs, can also be pinned to specific (and multiple) nodes in the cluster. This might be usefull when running applications with some special licensing requirements that are only fulfilled on certain nodes. It might also be interesting, when some physical hardware is attached to a node, that is not available in general within the cluster.
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-tag-node-pinning.jpg"/> Guests, such as VMs or CTs, can also be pinned to specific (and multiple) nodes in the cluster. This might be useful when running applications with some special licensing requirements that are only fulfilled on certain nodes. It might also be interesting when some physical hardware is attached to a node that is not available in general within the cluster.
|
||||
|
||||
#### Pinning VMs to (a) specific Hypervisor Node(s) by Tag
|
||||
To pin a guest to a specific cluster node, users assign a tag with the prefix `plb_pin_$nodename` to the desired guest:
|
||||
|
||||
#### Example for Screenshot
|
||||
@@ -399,13 +480,27 @@ plb_pin_node03
|
||||
|
||||
As a result, ProxLB will pin the guest `dev-vm01` to the node `virt03`.
|
||||
|
||||
|
||||
#### Pinning VMs to (a) specific Hypervisor Node(s) by Pools
|
||||
Beside the tag approach, you can also pin a resource group to a specific hypervisor or groups of hypervisors by defining a `pin` key of type list.
|
||||
|
||||
**Example Config**
|
||||
```
|
||||
balancing:
|
||||
[...]
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
dev: # Pool name: dev
|
||||
type: affinity # Type: affinity (keeping VMs together)
|
||||
pin: # Pin VMs to Nodes
|
||||
- virt77 # Pinning to 'virt77' which is maybe an older system for dev labs
|
||||
```
|
||||
|
||||
|
||||
You can also repeat this step multiple times for different node names to create a potential group of allowed hosts where the guest may be served. In this case, ProxLB takes the node with the lowest used resources according to the defined balancing values from this group.
|
||||
|
||||
**Note:** The given node names from the tag are validated. This means, ProxLB validates whether the given node name is really part of the cluster. In case of a wrongly defined or unavailable node name it continues to use the regular processes to make sure the guest keeps running.
|
||||
|
||||
## Maintenance
|
||||
<img src="https://cdn.gyptazy.com/images/proxlb-rebalancing-demo.gif"/>
|
||||
|
||||
The `maintenance_nodes` option allows operators to designate one or more Proxmox nodes for maintenance mode. When a node is set to maintenance, no new guest workloads will be assigned to it, and all existing workloads will be migrated to other available nodes within the cluster. This process ensures that (anti)-affinity rules and resource availability are respected, preventing disruptions while maintaining optimal performance across the infrastructure.
|
||||
|
||||
### Adding / Removing Nodes from Maintenance
|
||||
@@ -423,9 +518,6 @@ Bugs can be reported via the GitHub issue tracker [here](https://github.com/gypt
|
||||
### Contributing
|
||||
Feel free to add further documentation, to adjust already existing one or to contribute with code. Please take care about the style guide and naming conventions. You can find more in our [CONTRIBUTING.md](https://github.com/gyptazy/ProxLB/blob/main/CONTRIBUTING.md) file.
|
||||
|
||||
### Documentation
|
||||
You can also find additional and more detailed documentation within the [docs/](https://github.com/gyptazy/ProxLB/tree/main/docs) directory.
|
||||
|
||||
### Support
|
||||
If you need assistance or have any questions, we offer support through our dedicated [chat room](https://matrix.to/#/#proxlb:gyptazy.com) in Matrix or [Discord](https://discord.gg/JemGu7WbfQ). Join our community for real-time help, advice, and discussions. The Matrix and Discord room are bridged to ensure that the communication is not splitted - so simply feel free to join which fits most to you!
|
||||
|
||||
@@ -440,5 +532,18 @@ Connect with us in our dedicated chat room for immediate support and live intera
|
||||
|
||||
**Note:** Please always keep in mind that this is a one-man show project without any further help. This includes coding, testing, packaging and all the infrastructure around it to keep this project up and running.
|
||||
|
||||
### Enterprise-Support
|
||||
Running critical infrastructure in an enterprise environment often comes with requirements that go far beyond functionality alone. Enterprises typically expect predictable service levels, defined escalation paths, and guaranteed response times. In many cases, organizations also demand 24x7 support availability to ensure that their systems remain stable and resilient, even under unexpected circumstances.
|
||||
|
||||
As the creator and maintainer of ProxLB, I operate as a one-man project. While I am continuously working to improve the software, I cannot provide the type of enterprise-grade support that large organizations may require. To address this need, several companies have stepped in to offer professional services around ProxLB in Proxmox VE clusters.
|
||||
|
||||
Below is a list of organizations currently known to provide enterprise-level support for ProxLB. If your business relies on ProxLB in production and you require more than community-based support, these providers may be a good fit for your needs:
|
||||
|
||||
| Company| Country | Web |
|
||||
|------|:------:|:------:|
|
||||
| credativ | DE | [credativ.de](https://www.credativ.de/en/portfolio/support/proxmox-virtualization/) |
|
||||
|
||||
*Note: If you provide support for ProxLB, feel free to create PR with your addition.*
|
||||
|
||||
### Author(s)
|
||||
* Florian Paul Azim Hoberg @gyptazy (https://gyptazy.com)
|
||||
|
||||
@@ -26,11 +26,48 @@ balancing:
|
||||
live: True
|
||||
with_local_disks: True
|
||||
with_conntrack_state: True
|
||||
balance_types: ['vm', 'ct']
|
||||
max_job_validation: 1800
|
||||
balanciness: 5
|
||||
method: memory
|
||||
mode: used
|
||||
balance_types: ['vm', 'ct'] # 'vm' | 'ct'
|
||||
max_job_validation: 1800 # Maximum time (in seconds) a job validation may take
|
||||
memory_threshold: 75 # Optional: Maximum threshold (in percent) to trigger balancing actions
|
||||
balanciness: 5 # Maximum delta of resource usage between highest and lowest usage node
|
||||
method: memory # 'memory' | 'cpu' | 'disk'
|
||||
mode: used # 'assigned' | 'used' | 'psi'
|
||||
# # PSI thresholds only apply when using mode 'psi'
|
||||
# psi:
|
||||
# nodes:
|
||||
# memory:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# cpu:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# disk:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# guests:
|
||||
# memory:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# cpu:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
# disk:
|
||||
# pressure_full: 0.20
|
||||
# pressure_some: 0.20
|
||||
# pressure_spikes: 1.00
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
dev: # Pool name: dev
|
||||
type: affinity # Type: affinity (keeping VMs together)
|
||||
de-nbg01-db: # Pool name: de-nbg01-db
|
||||
type: anti-affinity # Type: anti-affinity (spreading VMs apart)
|
||||
pin: # Define a pinning of guests to specific node(s)
|
||||
- virt66
|
||||
- virt77
|
||||
|
||||
service:
|
||||
daemon: True
|
||||
|
||||
31
debian/changelog
vendored
31
debian/changelog
vendored
@@ -1,3 +1,34 @@
|
||||
proxlb (1.1.10) stable; urgency=medium
|
||||
|
||||
* Prevent redundant rebalancing by validating existing affinity enforcement before taking actions. (Closes: #335)
|
||||
* Add safety-guard for PVE 8 users when activating conntrack-aware migrations mistakenly. (Closes: #359)
|
||||
* Fix the Proxmox API connection validation which returned a false-positive logging message of timeouts. (Closes: #361)
|
||||
* Refactored the whole Proxmox API connection function. (Closes: #361)
|
||||
* Fix a crash during PVE resource pool enumeration by skipping members not having a 'name' property. (Closes: #368)
|
||||
|
||||
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.com> Tue, 25 Nov 2025 09:12:04 +0001
|
||||
|
||||
proxlb (1.1.9.1) stable; urgency=medium
|
||||
|
||||
* Fix quoting in f-strings which may cause issues on PVE 8 / Debian Bookworm systems. (Closes: #352)
|
||||
|
||||
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.com> Thu, 30 Oct 2025 17:41:02 +0001
|
||||
|
||||
proxlb (1.1.9) stable; urgency=medium
|
||||
|
||||
* Add pressure (PSI) based balancing for memory, cpu, disk (req. PVE9 or greater). (Closes: #339)
|
||||
* Add (memory) threshold for nodes before running balancing. (Closes: #342)
|
||||
* Add affinity/anti-affinity support by pools. (Closes: #343)
|
||||
|
||||
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.com> Thu, 30 Oct 2025 06:58:43 +0001
|
||||
|
||||
proxlb (1.1.8) stable; urgency=medium
|
||||
|
||||
* Fix API errors when using conntrack aware migration with older PVE version. (Closes: #318)
|
||||
* Add a static ProxLB prefix to the log output when used by journal handler. (Closes: #329)
|
||||
|
||||
-- Florian Paul Azim Hoberg <gyptazy@gyptazy.com> Thu, 09 Oct 2025 09:04:13 +0002
|
||||
|
||||
proxlb (1.1.7) stable; urgency=medium
|
||||
|
||||
* Add conntrack state aware migrations of VMs. (Closes: #305)
|
||||
|
||||
6
debian/control
vendored
6
debian/control
vendored
@@ -7,6 +7,6 @@ Build-Depends: debhelper-compat (= 13), dh-python, python3-all, python3-setuptoo
|
||||
|
||||
Package: proxlb
|
||||
Architecture: all
|
||||
Depends: ${python3:Depends}, ${misc:Depends}, python3-requests, python3-urllib3, python3-proxmoxer, python3-yaml
|
||||
Description: A DRS alike Load Balancer for Proxmox Clusters
|
||||
An advanced DRS alike loadbalancer for Proxmox clusters that also supports maintenance modes and affinity/anti-affinity rules.
|
||||
Depends: ${python3:Depends}, ${misc:Depends}, python3-requests, python3-urllib3, python3-packaging, python3-proxmoxer, python3-yaml
|
||||
Description: An advanced resource scheduler and load balancer for Proxmox clusters
|
||||
An advanced resource scheduler and load balancer for Proxmox clusters that also supports maintenance mode and affinity/anti-affinity rules.
|
||||
|
||||
@@ -55,7 +55,7 @@ ProxLB itself requires minimal system resources to operate. However, for managin
|
||||
|
||||
|
||||
## Where To Run?
|
||||
ProxLB can run on pretty much anything and only requires network connectivity to any of the Proxmox hosts' API (usually on tcp/8006).
|
||||
ProxLB is lightweight and flexible where it runs on nearly any environment and only needs access to your Proxmox host’s API endpoint (commonly TCP port 8006).
|
||||
|
||||
Therefore, you can simply run ProxLB on:
|
||||
* Bare-metal Systems
|
||||
|
||||
@@ -20,6 +20,10 @@
|
||||
7. [Run as a Systemd-Service](#run-as-a-systemd-service)
|
||||
8. [SSL Self-Signed Certificates](#ssl-self-signed-certificates)
|
||||
9. [Node Maintenances](#node-maintenances)
|
||||
10. [Balancing Methods](#balancing-methods)
|
||||
1. [Used Resources](#used-resources)
|
||||
2. [Assigned Resources](#assigned-resources)
|
||||
3. [Pressure (PSI) based Resources](#pressure-psi-based-resources)
|
||||
|
||||
## Authentication / User Accounts / Permissions
|
||||
### Authentication
|
||||
@@ -76,8 +80,8 @@ ProxLB provides an advanced mechanism to define affinity and anti-affinity rules
|
||||
|
||||
ProxLB implements affinity and anti-affinity rules through a tag-based system within the Proxmox web interface. Each guest (virtual machine or container) can be assigned specific tags, which then dictate its placement behavior. This method maintains a streamlined and secure approach to managing VM relationships while preserving Proxmox’s inherent permission model.
|
||||
|
||||
#### Affinity Rules
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-affinity-rules.jpg"/> Affinity rules are used to group certain VMs together, ensuring that they run on the same host whenever possible. This can be beneficial for workloads requiring low-latency communication, such as clustered databases or application servers that frequently exchange data.
|
||||
#### Affinity Rules by Tags
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-affinity-rules.jpg"/> Affinity rules are used to group certain VMs together, ensuring that they run on the same host whenever possible. This can be beneficial for workloads requiring low-latency communication, such as clustered databases or application servers that frequently exchange data.
|
||||
|
||||
To define an affinity rule which keeps all guests assigned to this tag together on a node, users assign a tag with the prefix `plb_affinity_$TAG`:
|
||||
|
||||
@@ -88,8 +92,20 @@ plb_affinity_talos
|
||||
|
||||
As a result, ProxLB will attempt to place all VMs with the `plb_affinity_web` tag on the same host (see also the attached screenshot with the same node).
|
||||
|
||||
#### Anti-Affinity Rules
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-anti-affinity-rules.jpg"/> Conversely, anti-affinity rules ensure that designated VMs do not run on the same physical host. This is particularly useful for high-availability setups, where redundancy is crucial. Ensuring that critical services are distributed across multiple hosts reduces the risk of a single point of failure.
|
||||
#### Affinity Rules by Pools
|
||||
Another approach is to use pools in Proxmox. This way, it can also easily be combined with other resources like backup jobs. However, in this approach you need to modify the ProxLB config file to your needs. Within the `balancing` section you can create a dict of pools, including the pool name and the affinity type. Please see the example for further details:
|
||||
|
||||
**Example Config**
|
||||
```
|
||||
balancing:
|
||||
[...]
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
dev: # Pool name: dev
|
||||
type: affinity # Type: affinity (keeping VMs together)
|
||||
```
|
||||
|
||||
#### Anti-Affinity Rules by Tags
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-anti-affinity-rules.jpg"/> Conversely, anti-affinity rules ensure that designated VMs do not run on the same physical host. This is particularly useful for high-availability setups, where redundancy is crucial. Ensuring that critical services are distributed across multiple hosts reduces the risk of a single point of failure.
|
||||
|
||||
To define an anti-affinity rule that ensures to not move systems within this group to the same node, users assign a tag with the prefix:
|
||||
|
||||
@@ -102,6 +118,18 @@ As a result, ProxLB will try to place the VMs with the `plb_anti_affinity_ntp` t
|
||||
|
||||
**Note:** While this ensures that ProxLB tries to distribute these VMs across different physical hosts within the Proxmox cluster, this may not always work. If you have more guests attached to the group than nodes in the cluster, we still need to run them somewhere. If this case occurs, the next one with the most free resources will be selected.
|
||||
|
||||
#### Anti-Affinity Rules by Pools
|
||||
Another approach is to use pools in Proxmox. This way, it can also easily be combined with other resources like backup jobs. However, in this approach you need to modify the ProxLB config file to your needs. Within the `balancing` section you can create a dict of pools, including the pool name and the affinity type. Please see the example for further details:
|
||||
|
||||
**Example Config**
|
||||
```
|
||||
balancing:
|
||||
[...]
|
||||
pools: # Optional: Define affinity/anti-affinity rules per pool
|
||||
de-nbg01-db: # Pool name: de-nbg01-db
|
||||
type: anti-affinity # Type: anti-affinity (spreading VMs apart)
|
||||
```
|
||||
|
||||
### Affinity / Anti-Affinity Enforcing
|
||||
When a cluster is already balanced and does not require further adjustments, enabling the enforce_affinity parameter ensures that affinity and anti-affinity rules are still respected. This parameter prioritizes the placement of guest objects according to these rules, even if it leads to slight resource imbalances or increased migration overhead. Regularly reviewing and updating these rules, along with monitoring cluster performance, helps maintain optimal performance and reliability. By carefully managing these aspects, you can create a cluster environment that meets your specific needs and maintains a good balance of resources.
|
||||
|
||||
@@ -113,7 +141,7 @@ balancing:
|
||||
*Note: This may have impacts to the cluster. Depending on the created group matrix, the result may also be an unbalanced cluster.*
|
||||
|
||||
### Ignore VMs / CTs
|
||||
<img align="left" src="https://cdn.gyptazy.com/images/proxlb-ignore-vm-movement.jpg"/> Guests, such as VMs or CTs, can also be completely ignored. This means, they won't be affected by any migration (even when (anti-)affinity rules are enforced). To ensure a proper resource evaluation, these guests are still collected and evaluated but simply skipped for balancing actions. Another thing is the implementation. While ProxLB might have a very restricted configuration file including the file permissions, this file is only read- and writeable by the Proxmox administrators. However, we might have user and groups who want to define on their own that their systems shouldn't be moved. Therefore, these users can simpy set a specific tag to the guest object - just like the (anti)affinity rules.
|
||||
<img align="left" src="https://cdn.gyptazy.com/img/proxlb-ignore-vm-movement.jpg"/> Guests, such as VMs or CTs, can also be completely ignored. This means, they won't be affected by any migration (even when (anti-)affinity rules are enforced). To ensure a proper resource evaluation, these guests are still collected and evaluated but simply skipped for balancing actions. Another thing is the implementation. While ProxLB might have a very restricted configuration file including the file permissions, this file is only read- and writeable by the Proxmox administrators. However, we might have users and groups who want to define on their own that their systems shouldn't be moved. Therefore, these users can simply set a specific tag to the guest object - just like the (anti)affinity rules.
|
||||
|
||||
To define a guest to be ignored from the balancing, users assign a tag with the prefix `plb_ignore_$TAG`:
|
||||
|
||||
@@ -235,4 +263,115 @@ The maintenance_nodes key must be defined as a list, even if it only includes a
|
||||
* No new workloads will be balanced or migrated onto it.
|
||||
* Any existing workloads currently running on the node will be migrated away in accordance with the configured balancing strategies, assuming resources on other nodes allow.
|
||||
|
||||
This feature is particularly useful during planned maintenance, upgrades, or troubleshooting, ensuring that services continue to run with minimal disruption while the specified node is being worked on.
|
||||
This feature is particularly useful during planned maintenance, upgrades, or troubleshooting, ensuring that services continue to run with minimal disruption while the specified node is being worked on.
|
||||
|
||||
## 10. Balancing Methods
|
||||
ProxLB provides multiple balancing modes that define *how* resources are evaluated and compared during cluster balancing.
|
||||
Each mode reflects a different strategy for determining load and distributing guests (VMs or containers) between nodes.
|
||||
|
||||
Depending on your environment, provisioning strategy, and performance goals, you can choose between:
|
||||
|
||||
| Mode | Description | Typical Use Case |
|
||||
|------|--------------|------------------|
|
||||
| `used` | Uses the *actual runtime resource usage* (e.g. CPU, memory, disk). | Dynamic or lab environments with frequent workload changes and tolerance for overprovisioning. |
|
||||
| `assigned` | Uses the *statically defined resource allocations* from guest configurations. | Production or SLA-driven clusters that require guaranteed resources and predictable performance. |
|
||||
| `psi` | Uses Linux *Pressure Stall Information (PSI)* metrics to evaluate real system contention and pressure. | Advanced clusters that require pressure-aware decisions for proactive rebalancing. |
|
||||
|
||||
### 10.1 Used Resources
|
||||
When **mode: `used`** is configured, ProxLB evaluates the *real usage metrics* of guest objects (VMs and CTs).
|
||||
It collects the current CPU, memory, and disk usage directly from the Proxmox API to determine the *actual consumption* of each guest and node.
|
||||
|
||||
This mode is ideal for **dynamic environments** where workloads frequently change and **overprovisioning is acceptable**. It provides the most reactive balancing behavior, since decisions are based on live usage instead of static assignment.
|
||||
|
||||
Typical scenarios include:
|
||||
- Production environments to distribute workloads across the nodes.
|
||||
- Test or development clusters with frequent VM changes.
|
||||
- Clusters where resource spikes are short-lived.
|
||||
- Environments where slight resource contention is tolerable.
|
||||
|
||||
#### Example Configuration
|
||||
```yaml
|
||||
balancing:
|
||||
mode: used
|
||||
```
|
||||
|
||||
### 10.2 Assigned Resources
|
||||
When **mode: `assigned`** is configured, ProxLB evaluates the *provisioned or allocated resources* of each guest (VM or CT) instead of their runtime usage.
|
||||
It uses data such as **CPU cores**, **memory limits**, and **disk allocations** defined in Proxmox to calculate how much of each node’s capacity is reserved.
|
||||
|
||||
This mode is ideal for **production clusters** where:
|
||||
- Overcommitment is *not allowed or only minimally tolerated*.
|
||||
- Each node’s workload is planned based on the assigned capacities.
|
||||
- Administrators want predictable resource distribution aligned with provisioning policies.
|
||||
|
||||
Unlike the `used` mode, `assigned` focuses purely on the *declared configuration* of guests and remains stable even if actual usage varies temporarily.
|
||||
|
||||
Typical scenarios include:
|
||||
- Enterprise environments with SLA or QoS requirements.
|
||||
- Clusters where workloads are sized deterministically.
|
||||
- Situations where consistent node utilization and capacity awareness are crucial.
|
||||
|
||||
#### Example Configuration
|
||||
```yaml
|
||||
balancing:
|
||||
mode: assigned
|
||||
```
|
||||
|
||||
### 10.3 Pressure (PSI) based Resources
|
||||
> [!IMPORTANT]
|
||||
> PSI based balancing is still in beta! If you find any bugs, please raise an issue including metrics of all nodes and affected guests. You can provide metrics directly from PVE or Grafana (via node_exporter or pve_exporter).
|
||||
|
||||
When **mode: `psi`** is configured, ProxLB uses the **Linux Pressure Stall Information (PSI)** interface to measure the *real-time pressure* on system resources such as **CPU**, **memory**, and **disk I/O**.
|
||||
Unlike the `used` or `assigned` modes, which rely on static or average metrics, PSI provides *direct insight into how often and how long tasks are stalled* because of insufficient resources.
|
||||
|
||||
This enables ProxLB to make **proactive balancing decisions** — moving workloads *before* performance degradation becomes visible to the user.
|
||||
|
||||
**IMPORTANT**: Predictively distributing workloads is dangerous and might not result in the expected state. Therefore, ProxLB migrates only a single instance every 60 minutes to obtain new real metrics and to validate whether further changes are required. Keep in mind that migrations are also costly and should be avoided as much as possible.
|
||||
|
||||
PSI metrics are available for both **nodes** and **guest objects**, allowing fine-grained balancing decisions:
|
||||
- **Node-level PSI:** Detects cluster nodes under systemic load or contention.
|
||||
- **Guest-level PSI:** Identifies individual guests suffering from memory, CPU, or I/O stalls.
|
||||
|
||||
### PSI Metrics Explained
|
||||
Each monitored resource defines three pressure thresholds:
|
||||
| Key | Description |
|
||||
|-----|--------------|
|
||||
| `pressure_some` | Indicates partial stall conditions where some tasks are waiting for a resource. |
|
||||
| `pressure_full` | Represents complete stall conditions where *all* tasks are blocked waiting for a resource. |
|
||||
| `pressure_spikes` | Defines short-term burst conditions that may signal saturation spikes. |
|
||||
|
||||
These thresholds are expressed in **percentages** and represent how much time the kernel reports stalls over specific averaging windows (e.g. 5s, 10s, 60s).
|
||||
|
||||
### Example Configuration
|
||||
|
||||
```yaml
|
||||
balancing:
|
||||
mode: psi
|
||||
psi:
|
||||
nodes:
|
||||
memory:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
cpu:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
disk:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
guests:
|
||||
memory:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
cpu:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
disk:
|
||||
pressure_full: 0.20
|
||||
pressure_some: 0.20
|
||||
pressure_spikes: 1.00
|
||||
```
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
apiVersion: v3
|
||||
apiVersion: v2
|
||||
name: proxlb
|
||||
description: A Helm chart for self-hosted ProxLB
|
||||
type: application
|
||||
version: "1.1.7"
|
||||
appVersion: "v1.1.7"
|
||||
version: "1.1.10"
|
||||
appVersion: "v1.1.10"
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
image:
|
||||
registry: cr.gyptazy.com
|
||||
repository: proxlb/proxlb
|
||||
tag: v1.1.7
|
||||
tag: v1.1.10
|
||||
pullPolicy: IfNotPresent
|
||||
imagePullSecrets: [ ]
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env bash
|
||||
VERSION="1.1.7"
|
||||
VERSION="1.1.9.1"
|
||||
|
||||
# ProxLB
|
||||
sed -i "s/^__version__ = .*/__version__ = \"$VERSION\"/" "proxlb/utils/version.py"
|
||||
@@ -8,5 +8,6 @@ sed -i "s/version=\"[0-9]*\.[0-9]*\.[0-9]*\"/version=\"$VERSION\"/" setup.py
|
||||
# Helm Chart
|
||||
sed -i "s/^version: .*/version: \"$VERSION\"/" helm/proxlb/Chart.yaml
|
||||
sed -i "s/^appVersion: .*/appVersion: \"v$VERSION\"/" helm/proxlb/Chart.yaml
|
||||
sed -i "s/^tag: .*/tag: \"v$VERSION\"/" helm/proxlb/values.yaml
|
||||
|
||||
echo "OK: Versions have been sucessfully set to $VERSION"
|
||||
|
||||
@@ -19,10 +19,12 @@ from utils.cli_parser import CliParser
|
||||
from utils.config_parser import ConfigParser
|
||||
from utils.proxmox_api import ProxmoxApi
|
||||
from models.nodes import Nodes
|
||||
from models.features import Features
|
||||
from models.guests import Guests
|
||||
from models.groups import Groups
|
||||
from models.calculations import Calculations
|
||||
from models.balancing import Balancing
|
||||
from models.pools import Pools
|
||||
from utils.helper import Helper
|
||||
|
||||
|
||||
@@ -71,17 +73,24 @@ def main():
|
||||
# Get all required objects from the Proxmox cluster
|
||||
meta = {"meta": proxlb_config}
|
||||
nodes = Nodes.get_nodes(proxmox_api, proxlb_config)
|
||||
guests = Guests.get_guests(proxmox_api, nodes, meta)
|
||||
pools = Pools.get_pools(proxmox_api)
|
||||
guests = Guests.get_guests(proxmox_api, pools, nodes, meta, proxlb_config)
|
||||
groups = Groups.get_groups(guests, nodes)
|
||||
|
||||
# Merge obtained objects from the Proxmox cluster for further usage
|
||||
proxlb_data = {**meta, **nodes, **guests, **groups}
|
||||
proxlb_data = {**meta, **nodes, **guests, **pools, **groups}
|
||||
Helper.log_node_metrics(proxlb_data)
|
||||
|
||||
# Validate usable features by PVE versions
|
||||
Features.validate_available_features(proxlb_data)
|
||||
|
||||
# Update the initial node resource assignments
|
||||
# by the previously created groups.
|
||||
Calculations.set_node_assignments(proxlb_data)
|
||||
Calculations.set_node_hot(proxlb_data)
|
||||
Calculations.set_guest_hot(proxlb_data)
|
||||
Calculations.get_most_free_node(proxlb_data, cli_args.best_node)
|
||||
Calculations.validate_affinity_map(proxlb_data)
|
||||
Calculations.relocate_guests_on_maintenance_nodes(proxlb_data)
|
||||
Calculations.get_balanciness(proxlb_data)
|
||||
Calculations.relocate_guests(proxlb_data)
|
||||
|
||||
@@ -156,18 +156,17 @@ class Balancing:
|
||||
else:
|
||||
with_local_disks = 0
|
||||
|
||||
if proxlb_data["meta"]["balancing"].get("with_conntrack_state", True):
|
||||
with_conntrack_state = 1
|
||||
else:
|
||||
with_conntrack_state = 0
|
||||
|
||||
migration_options = {
|
||||
'target': guest_node_target,
|
||||
'online': online_migration,
|
||||
'with-local-disks': with_local_disks,
|
||||
'with-conntrack-state': with_conntrack_state,
|
||||
}
|
||||
|
||||
# Conntrack state aware migrations are not supported in older
|
||||
# PVE versions, so we should not add it by default.
|
||||
if proxlb_data["meta"]["balancing"].get("with_conntrack_state", True):
|
||||
migration_options['with-conntrack-state'] = 1
|
||||
|
||||
try:
|
||||
logger.info(f"Balancing: Starting to migrate VM guest {guest_name} from {guest_node_current} to {guest_node_target}.")
|
||||
job_id = proxmox_api.nodes(guest_node_current).qemu(guest_id).migrate().post(**migration_options)
|
||||
|
||||
@@ -80,7 +80,7 @@ class Calculations:
|
||||
|
||||
for guest_name in group_meta["guests"]:
|
||||
guest_node_current = proxlb_data["guests"][guest_name]["node_current"]
|
||||
# Update Hardware assignments
|
||||
# Update resource assignments
|
||||
# Update assigned values for the current node
|
||||
logger.debug(f"set_node_assignment of guest {guest_name} on node {guest_node_current} with cpu_total: {proxlb_data['guests'][guest_name]['cpu_total']}, memory_total: {proxlb_data['guests'][guest_name]['memory_total']}, disk_total: {proxlb_data['guests'][guest_name]['disk_total']}.")
|
||||
proxlb_data["nodes"][guest_node_current]["cpu_assigned"] += proxlb_data["guests"][guest_name]["cpu_total"]
|
||||
@@ -93,6 +93,83 @@ class Calculations:
|
||||
|
||||
logger.debug("Finished: set_node_assignments.")
|
||||
|
||||
def set_node_hot(proxlb_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate node 'full' pressure (PSI) metrics for memory, cpu, and io
    against the configured thresholds and set <metric>_pressure_hot = True
    (plus a generic 'pressure_hot' flag) on every node considered HOT.

    Args:
        proxlb_data: Aggregated cluster state. Reads meta.balancing for the
            PSI thresholds and iterates over the nodes; node entries are
            mutated in place.

    Returns:
        The modified proxlb_data dict.
    """
    logger.debug("Starting: set_node_hot.")
    balancing_cfg = proxlb_data.get("meta", {}).get("balancing", {})
    # Prefer a flat 'psi_thresholds' mapping when present, otherwise fall
    # back to the nested 'psi.nodes' section of the config.
    thresholds = balancing_cfg.get("psi_thresholds", balancing_cfg.get("psi", {}).get("nodes", {}))
    nodes = proxlb_data.get("nodes", {})

    for node_name, node in nodes.items():

        # Nodes in maintenance or explicitly ignored never count as hot.
        if node.get("maintenance"):
            continue

        if node.get("ignore"):
            continue

        # PSI metrics are only available on Proxmox VE 9.0 and higher.
        if proxlb_data["meta"]["balancing"].get("mode", "used") == "psi":

            pve_version = proxlb_data["nodes"][node["name"]]["pve_version"]
            if tuple(map(int, pve_version.split('.'))) < (9, 0):
                # NOTE(review): this only logs; balancing is not actually
                # deactivated here - confirm intended behavior.
                logger.critical(f"Proxmox node {node['name']} runs Proxmox VE version {proxlb_data['nodes'][node['name']]['pve_version']}."
                                " PSI metrics require Proxmox VE 9.0 or higher. Balancing deactivated!")

        for metric, threshold in thresholds.items():
            # Missing metrics default to 0.0 so absent data never marks a node hot.
            pressure_full = node.get(f"{metric}_pressure_full_percent", 0.0)
            pressure_some = node.get(f"{metric}_pressure_some_percent", 0.0)
            pressure_spikes = node.get(f"{metric}_pressure_full_spikes_percent", 0.0)
            # HOT when sustained full AND some pressure exceed their thresholds,
            # or when spikes alone cross the spike threshold.
            is_hot = (pressure_full >= threshold["pressure_full"] and pressure_some >= threshold["pressure_some"]) or (pressure_spikes >= threshold["pressure_spikes"])

            if is_hot:
                logger.debug(f"Set node {node['name']} as hot based on {metric} pressure metrics.")
                proxlb_data["nodes"][node["name"]][f"{metric}_pressure_hot"] = True
                proxlb_data["nodes"][node["name"]]["pressure_hot"] = True
            else:
                logger.debug(f"Node {node['name']} is not hot based on {metric} pressure metrics.")

    logger.debug("Finished: set_node_hot.")
    return proxlb_data
|
||||
|
||||
def set_guest_hot(proxlb_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate guest 'full' pressure (PSI) metrics for memory, cpu, and io
    against the configured thresholds and set <metric>_pressure_hot = True
    (plus a generic 'pressure_hot' flag) on every guest considered HOT.

    Args:
        proxlb_data: Aggregated cluster state. Reads meta.balancing for the
            PSI thresholds and iterates over the guests; guest entries are
            mutated in place.

    Returns:
        The modified proxlb_data dict.
    """
    logger.debug("Starting: set_guest_hot.")
    balancing_cfg = proxlb_data.get("meta", {}).get("balancing", {})
    # Prefer a flat 'psi_thresholds' mapping when present, otherwise fall
    # back to the nested 'psi.guests' section of the config.
    thresholds = balancing_cfg.get("psi_thresholds", balancing_cfg.get("psi", {}).get("guests", {}))
    guests = proxlb_data.get("guests", {})

    for guest_name, guest in guests.items():
        # Guests explicitly ignored never count as hot.
        if guest.get("ignore"):
            continue

        for metric, threshold in thresholds.items():
            # Missing metrics default to 0.0 so absent data never marks a guest hot.
            pressure_full = guest.get(f"{metric}_pressure_full_percent", 0.0)
            pressure_some = guest.get(f"{metric}_pressure_some_percent", 0.0)
            pressure_spikes = guest.get(f"{metric}_pressure_full_spikes_percent", 0.0)
            # HOT when sustained full AND some pressure exceed their thresholds,
            # or when spikes alone cross the spike threshold.
            is_hot = (pressure_full >= threshold["pressure_full"] and pressure_some >= threshold["pressure_some"]) or (pressure_spikes >= threshold["pressure_spikes"])

            if is_hot:
                logger.debug(f"Set guest {guest['name']} as hot based on {metric} pressure metrics.")
                proxlb_data["guests"][guest["name"]][f"{metric}_pressure_hot"] = True
                proxlb_data["guests"][guest["name"]]["pressure_hot"] = True
            else:
                logger.debug(f"guest {guest['name']} is not hot based on {metric} pressure metrics.")

    logger.debug("Finished: set_guest_hot.")
    return proxlb_data
|
||||
|
||||
@staticmethod
|
||||
def get_balanciness(proxlb_data: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -113,7 +190,66 @@ class Calculations:
|
||||
method = proxlb_data["meta"]["balancing"].get("method", "memory")
|
||||
mode = proxlb_data["meta"]["balancing"].get("mode", "used")
|
||||
balanciness = proxlb_data["meta"]["balancing"].get("balanciness", 10)
|
||||
method_value = [node_meta[f"{method}_{mode}_percent"] for node_meta in proxlb_data["nodes"].values()]
|
||||
|
||||
if mode == "assigned":
|
||||
method_value = [node_meta[f"{method}_{mode}_percent"] for node_meta in proxlb_data["nodes"].values()]
|
||||
|
||||
if proxlb_data["meta"]["balancing"].get(f"{method}_threshold", None):
|
||||
threshold = proxlb_data["meta"]["balancing"].get(f"{method}_threshold")
|
||||
highest_usage_node = max(proxlb_data["nodes"].values(), key=lambda x: x[f"{method}_{mode}_percent"])
|
||||
highest_node_value = highest_usage_node[f"{method}_{mode}_percent"]
|
||||
|
||||
if highest_node_value >= threshold:
|
||||
logger.debug(f"Guest balancing is required. Highest {method} usage node {highest_usage_node['name']} is above the defined threshold of {threshold}% with a value of {highest_node_value}%.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = True
|
||||
else:
|
||||
logger.debug(f"Guest balancing is ok. Highest {method} usage node {highest_usage_node['name']} is below the defined threshold of {threshold}% with a value of {highest_node_value}%.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = False
|
||||
|
||||
else:
|
||||
logger.debug(f"No {method} threshold defined for balancing. Skipping threshold check.")
|
||||
|
||||
elif mode == "used":
|
||||
method_value = [node_meta[f"{method}_{mode}_percent"] for node_meta in proxlb_data["nodes"].values()]
|
||||
|
||||
if proxlb_data["meta"]["balancing"].get(f"{method}_threshold", None):
|
||||
threshold = proxlb_data["meta"]["balancing"].get(f"{method}_threshold")
|
||||
highest_usage_node = max(proxlb_data["nodes"].values(), key=lambda x: x[f"{method}_{mode}_percent"])
|
||||
highest_node_value = highest_usage_node[f"{method}_{mode}_percent"]
|
||||
|
||||
if highest_node_value >= threshold:
|
||||
logger.debug(f"Guest balancing is required. Highest {method} usage node {highest_usage_node['name']} is above the defined threshold of {threshold}% with a value of {highest_node_value}%.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = True
|
||||
else:
|
||||
logger.debug(f"Guest balancing is ok. Highest {method} usage node {highest_usage_node['name']} is below the defined threshold of {threshold}% with a value of {highest_node_value}%.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = False
|
||||
|
||||
else:
|
||||
logger.debug(f"No {method} threshold defined for balancing. Skipping threshold check.")
|
||||
|
||||
elif mode == "psi":
|
||||
method_value = [node_meta[f"{method}_pressure_full_spikes_percent"] for node_meta in proxlb_data["nodes"].values()]
|
||||
any_node_hot = any(node.get(f"{method}_pressure_hot", False) for node in proxlb_data["nodes"].values())
|
||||
any_guest_hot = any(node.get(f"{method}_pressure_hot", False) for node in proxlb_data["guests"].values())
|
||||
|
||||
if any_node_hot:
|
||||
logger.debug(f"Guest balancing is required. A node is marked as HOT based on {method} pressure metrics.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = True
|
||||
else:
|
||||
logger.debug(f"Guest balancing is ok. No node is marked as HOT based on {method} pressure metrics.")
|
||||
|
||||
if any_guest_hot:
|
||||
logger.debug(f"Guest balancing is required. A guest is marked as HOT based on {method} pressure metrics.")
|
||||
proxlb_data["meta"]["balancing"]["balance"] = True
|
||||
else:
|
||||
logger.debug(f"Guest balancing is ok. No guest is marked as HOT based on {method} pressure metrics.")
|
||||
|
||||
return proxlb_data
|
||||
|
||||
else:
|
||||
logger.critical(f"Unknown balancing mode: {mode} provided. Cannot get balanciness.")
|
||||
sys.exit(1)
|
||||
|
||||
method_value_highest = max(method_value)
|
||||
method_value_lowest = min(method_value)
|
||||
|
||||
@@ -159,7 +295,23 @@ class Calculations:
|
||||
# Filter by the defined methods and modes for balancing
|
||||
method = proxlb_data["meta"]["balancing"].get("method", "memory")
|
||||
mode = proxlb_data["meta"]["balancing"].get("mode", "used")
|
||||
lowest_usage_node = min(filtered_nodes, key=lambda x: x[f"{method}_{mode}_percent"])
|
||||
|
||||
if mode == "assigned":
|
||||
logger.debug(f"Get best node for balancing by assigned {method} resources.")
|
||||
lowest_usage_node = min(filtered_nodes, key=lambda x: x[f"{method}_{mode}_percent"])
|
||||
|
||||
elif mode == "used":
|
||||
logger.debug(f"Get best node for balancing by used {method} resources.")
|
||||
lowest_usage_node = min(filtered_nodes, key=lambda x: x[f"{method}_{mode}_percent"])
|
||||
|
||||
elif mode == "psi":
|
||||
logger.debug(f"Get best node for balancing by pressure of {method} resources.")
|
||||
lowest_usage_node = min(filtered_nodes, key=lambda x: x[f"{method}_pressure_full_spikes_percent"])
|
||||
|
||||
else:
|
||||
logger.critical(f"Unknown balancing mode: {mode} provided. Cannot get best node.")
|
||||
sys.exit(1)
|
||||
|
||||
proxlb_data["meta"]["balancing"]["balance_reason"] = 'resources'
|
||||
proxlb_data["meta"]["balancing"]["balance_next_node"] = lowest_usage_node["name"]
|
||||
|
||||
@@ -188,7 +340,7 @@ class Calculations:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
logger.debug("Starting: get_most_free_node.")
|
||||
logger.debug("Starting: relocate_guests_on_maintenance_nodes.")
|
||||
proxlb_data["meta"]["balancing"]["balance_next_guest"] = ""
|
||||
|
||||
for guest_name in proxlb_data["groups"]["maintenance"]:
|
||||
@@ -199,7 +351,7 @@ class Calculations:
|
||||
Calculations.update_node_resources(proxlb_data)
|
||||
logger.warning(f"Warning: Balancing may not be perfect because guest {guest_name} was located on a node which is in maintenance mode.")
|
||||
|
||||
logger.debug("Finished: get_most_free_node.")
|
||||
logger.debug("Finished: relocate_guests_on_maintenance_nodes.")
|
||||
|
||||
@staticmethod
|
||||
def relocate_guests(proxlb_data: Dict[str, Any]):
|
||||
@@ -233,7 +385,26 @@ class Calculations:
|
||||
Calculations.get_most_free_node(proxlb_data)
|
||||
|
||||
for guest_name in proxlb_data["groups"]["affinity"][group_name]["guests"]:
|
||||
proxlb_data["meta"]["balancing"]["balance_next_guest"] = guest_name
|
||||
mode = proxlb_data["meta"]["balancing"].get("mode", "used")
|
||||
|
||||
if mode == 'psi':
|
||||
logger.debug(f"Evaluating guest relocation based on {mode} mode.")
|
||||
method = proxlb_data["meta"]["balancing"].get("method", "memory")
|
||||
processed_guests_psi = proxlb_data["meta"]["balancing"].setdefault("processed_guests_psi", [])
|
||||
unprocessed_guests_psi = [guest for guest in proxlb_data["guests"].values() if guest["name"] not in processed_guests_psi]
|
||||
|
||||
# Filter by the defined methods and modes for balancing
|
||||
highest_usage_guest = max(unprocessed_guests_psi, key=lambda x: x[f"{method}_pressure_full_spikes_percent"])
|
||||
|
||||
# Append guest to the psi based processed list of guests
|
||||
if highest_usage_guest["name"] == guest_name and guest_name not in proxlb_data["meta"]["balancing"]["processed_guests_psi"]:
|
||||
proxlb_data["meta"]["balancing"]["processed_guests_psi"].append(guest_name)
|
||||
proxlb_data["meta"]["balancing"]["balance_next_guest"] = guest_name
|
||||
|
||||
else:
|
||||
logger.debug(f"Evaluating guest relocation based on {mode} mode.")
|
||||
proxlb_data["meta"]["balancing"]["balance_next_guest"] = guest_name
|
||||
|
||||
Calculations.val_anti_affinity(proxlb_data, guest_name)
|
||||
Calculations.val_node_relationships(proxlb_data, guest_name)
|
||||
Calculations.update_node_resources(proxlb_data)
|
||||
@@ -328,7 +499,7 @@ class Calculations:
|
||||
logger.debug("Finished: val_node_relationships.")
|
||||
|
||||
@staticmethod
|
||||
def update_node_resources(proxlb_data):
|
||||
def update_node_resources(proxlb_data: Dict[str, Any]):
|
||||
"""
|
||||
Updates the resource allocation and usage statistics for nodes when a guest
|
||||
is moved from one node to another.
|
||||
@@ -348,6 +519,11 @@ class Calculations:
|
||||
"""
|
||||
logger.debug("Starting: update_node_resources.")
|
||||
guest_name = proxlb_data["meta"]["balancing"]["balance_next_guest"]
|
||||
|
||||
if guest_name == "":
|
||||
logger.debug("No guest defined to update node resources for.")
|
||||
return
|
||||
|
||||
node_current = proxlb_data["guests"][guest_name]["node_current"]
|
||||
node_target = proxlb_data["meta"]["balancing"]["balance_next_node"]
|
||||
|
||||
@@ -392,3 +568,142 @@ class Calculations:
|
||||
logger.debug(f"Set guest {guest_name} from node {node_current} to node {node_target}.")
|
||||
|
||||
logger.debug("Finished: update_node_resources.")
|
||||
|
||||
def validate_affinity_map(proxlb_data: Dict[str, Any]):
|
||||
"""
|
||||
Validates the affinity and anti-affinity constraints for all guests in the ProxLB data structure.
|
||||
|
||||
This function iterates through each guest and checks both affinity and anti-affinity rules.
|
||||
If any guest violates these constraints, it sets the enforce_affinity flag to trigger rebalancing
|
||||
and skips further validation for efficiency.
|
||||
|
||||
Args:
|
||||
proxlb_data (Dict[str, Any]): A dictionary containing ProxLB configuration with the following structure:
|
||||
- "guests" (list): List of guest identifiers to validate
|
||||
- "meta" (dict): Metadata dictionary containing:
|
||||
- "balancing" (dict): Balancing configuration with "enforce_affinity" flag
|
||||
|
||||
Returns:
|
||||
None: Modifies proxlb_data in-place by updating the "enforce_affinity" flag in meta.balancing
|
||||
|
||||
Raises:
|
||||
None: Function handles validation gracefully and logs outcomes
|
||||
"""
|
||||
logger.debug("Starting: validate_current_affinity.")
|
||||
balancing_ok = True
|
||||
|
||||
for guest in proxlb_data["guests"]:
|
||||
|
||||
# We do not need to validate anymore if rebalancing is required
|
||||
if balancing_ok is False:
|
||||
proxlb_data["meta"]["balancing"]["enforce_affinity"] = True
|
||||
logger.debug(f"Rebalancing based on affinity/anti-affinity map is required. Skipping further validation...")
|
||||
break
|
||||
|
||||
balancing_state_affinity = Calculations.validate_current_affinity(proxlb_data, guest)
|
||||
balancing_state_anti_affinity = Calculations.validate_current_anti_affinity(proxlb_data, guest)
|
||||
logger.debug(f"Affinity for guest {guest} is {'valid' if balancing_state_affinity else 'NOT valid'}")
|
||||
logger.debug(f"Anti-affinity for guest {guest} is {'valid' if balancing_state_anti_affinity else 'NOT valid'}")
|
||||
|
||||
balancing_ok = not balancing_state_affinity or not balancing_state_anti_affinity
|
||||
|
||||
if balancing_ok:
|
||||
logger.debug(f"Rebalancing based on affinity/anti-affinity map is not required.")
|
||||
proxlb_data["meta"]["balancing"]["enforce_affinity"] = False
|
||||
|
||||
logger.debug("Finished: validate_current_affinity.")
|
||||
|
||||
@staticmethod
|
||||
def get_guest_node(proxlb_data: Dict[str, Any], guest_name: str) -> str:
|
||||
"""
|
||||
Return a currently assoicated PVE node where the guest is running on.
|
||||
|
||||
Args:
|
||||
proxlb_data (Dict[str, Any]): A dictionary containing ProxLB configuration.
|
||||
|
||||
Returns:
|
||||
node_name_current (str): The name of the current node where the guest runs on.
|
||||
|
||||
"""
|
||||
return proxlb_data["guests"][guest_name]["node_current"]
|
||||
|
||||
@staticmethod
|
||||
def validate_current_affinity(proxlb_data: Dict[str, Any], guest_name: str) -> bool:
|
||||
"""
|
||||
Validate that all guests in affinity groups containing the specified guest are on the same non-maintenance node.
|
||||
|
||||
This function checks affinity group constraints for a given guest. It ensures that:
|
||||
1. All guests within an affinity group are located on the same physical node
|
||||
2. The node hosting the affinity group is not in maintenance mode
|
||||
|
||||
Args:
|
||||
proxlb_data (Dict[str, Any]): A dictionary containing the complete ProxLB state including:
|
||||
- "groups": Dictionary with "affinity" key containing affinity group definitions
|
||||
- "guests": Dictionary with guest information
|
||||
- "nodes": Dictionary with node information including maintenance status
|
||||
guest_name (str): The name of the guest to validate affinity for
|
||||
|
||||
Returns:
|
||||
bool: True if all affinity groups containing the guest are valid (all members on same
|
||||
non-maintenance node), False otherwise
|
||||
"""
|
||||
logger.debug("Starting: validate_current_affinity.")
|
||||
for group_name, grp in proxlb_data["groups"]["affinity"].items():
|
||||
if guest_name not in grp["guests"]:
|
||||
continue
|
||||
|
||||
nodes = []
|
||||
for group in grp["guests"]:
|
||||
if group not in proxlb_data["guests"]:
|
||||
continue
|
||||
|
||||
node = Calculations.get_guest_node(proxlb_data, group)
|
||||
if proxlb_data["nodes"][node]["maintenance"]:
|
||||
logger.debug(f"Group '{group_name}' invalid: node '{node}' in maintenance.")
|
||||
return False
|
||||
nodes.append(node)
|
||||
|
||||
if len(set(nodes)) != 1:
|
||||
logger.debug(f"Group '{group_name}' invalid: guests spread across nodes {set(nodes)}.")
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@staticmethod
|
||||
def validate_current_anti_affinity(proxlb_data: Dict[str, Any], guest_name: str) -> bool:
|
||||
"""
|
||||
Validate that all guests in anti-affinity groups containing the specified guest are not on the same node.
|
||||
|
||||
This function checks anti-affinity group constraints for a given guest. It ensures that:
|
||||
1. All guests within an anti-affinity group are located on the same physical node
|
||||
2. The node hosting the anti-affinity group is not in maintenance mode
|
||||
|
||||
Args:
|
||||
proxlb_data (Dict[str, Any]): A dictionary containing the complete ProxLB state including:
|
||||
- "groups": Dictionary with "affinity" key containing affinity group definitions
|
||||
- "guests": Dictionary with guest information
|
||||
- "nodes": Dictionary with node information including maintenance status
|
||||
guest_name (str): The name of the guest to validate affinity for
|
||||
|
||||
Returns:
|
||||
bool: True if all anti-affinity groups containing the guest are valid (all members on different
|
||||
non-maintenance node), False otherwise
|
||||
"""
|
||||
logger.debug("Starting: validate_current_anti_affinity.")
|
||||
for group_name, grp in proxlb_data["groups"]["anti_affinity"].items():
|
||||
if guest_name not in grp["guests"]:
|
||||
continue
|
||||
nodes = []
|
||||
for group in grp["guests"]:
|
||||
if group not in proxlb_data["guests"]:
|
||||
continue
|
||||
|
||||
node = Calculations.get_guest_node(proxlb_data, group)
|
||||
if proxlb_data["nodes"][node]["maintenance"]:
|
||||
return False
|
||||
nodes.append(node)
|
||||
|
||||
if len(nodes) != len(set(nodes)):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
90
proxlb/models/features.py
Normal file
90
proxlb/models/features.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
ProxLB Features module for validating and adjusting feature flags
|
||||
based on Proxmox VE node versions and cluster compatibility.
|
||||
"""
|
||||
|
||||
__author__ = "Florian Paul Azim Hoberg <gyptazy>"
|
||||
__copyright__ = "Copyright (C) 2025 Florian Paul Azim Hoberg (@gyptazy)"
|
||||
__license__ = "GPL-3.0"
|
||||
|
||||
|
||||
from typing import List
|
||||
from typing import Dict, Any
|
||||
from utils.logger import SystemdLogger
|
||||
from packaging import version
|
||||
|
||||
logger = SystemdLogger()
|
||||
|
||||
|
||||
class Features:
|
||||
"""
|
||||
ProxLB Features module for validating and adjusting feature flags
|
||||
based on Proxmox VE node versions and cluster compatibility.
|
||||
|
||||
Responsibilities:
|
||||
- Validate and adjust feature flags based on Proxmox VE node versions.
|
||||
|
||||
Methods:
|
||||
__init__():
|
||||
No-op initializer.
|
||||
|
||||
validate_available_features(proxlb_data: dict) -> None:
|
||||
Static method that inspects proxlb_data["nodes"] versions and disables
|
||||
incompatible balancing features for Proxmox VE versions < 9.0.0.
|
||||
This function mutates proxlb_data in place.
|
||||
|
||||
Notes:
|
||||
- Expects proxlb_data to be a dict with "nodes" and "meta" keys.
|
||||
"""
|
||||
def __init__(self):
|
||||
"""
|
||||
Initializes the Features class.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def validate_available_features(proxlb_data: any) -> None:
|
||||
"""
|
||||
Validate and adjust feature flags in the provided proxlb_data according to Proxmox VE versions.
|
||||
|
||||
This function inspects the cluster node versions in proxlb_data and disables features
|
||||
that are incompatible with Proxmox VE versions older than 9.0.0. Concretely, if any node
|
||||
reports a 'pve_version' lower than "9.0.0":
|
||||
- If meta.balancing.with_conntrack_state is truthy, it is set to False and a warning is logged.
|
||||
- If meta.balancing.mode equals "psi", meta.balancing.enable is set to False and a warning is logged.
|
||||
|
||||
proxlb_data (dict): Cluster data structure that must contain:
|
||||
- "nodes": a mapping (e.g., dict) whose values are mappings containing a 'pve_version' string.
|
||||
- "meta": a mapping that may contain a "balancing" mapping with keys:
|
||||
- "with_conntrack_state" (bool, optional)
|
||||
- "mode" (str, optional)
|
||||
- "enable" (bool, optional)
|
||||
|
||||
None: The function mutates proxlb_data in place to disable incompatible features.
|
||||
|
||||
Side effects:
|
||||
- Mutates proxlb_data["meta"]["balancing"] when incompatible features are detected.
|
||||
- Emits debug and warning log messages.
|
||||
|
||||
Notes:
|
||||
- Unexpected or missing keys/types in proxlb_data may raise KeyError or TypeError.
|
||||
- Version comparison uses semantic version parsing; callers should provide versions as strings.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
logger.debug("Starting: validate_available_features.")
|
||||
|
||||
any_non_pve9_node = any(version.parse(n['pve_version']) < version.parse("9.0.0") for n in proxlb_data["nodes"].values())
|
||||
if any_non_pve9_node:
|
||||
|
||||
with_conntrack_state = proxlb_data["meta"].get("balancing", {}).get("with_conntrack_state", False)
|
||||
if with_conntrack_state:
|
||||
logger.warning("Non Proxmox VE 9 systems detected: Deactivating migration option 'with-conntrack-state'!")
|
||||
proxlb_data["meta"]["balancing"]["with_conntrack_state"] = False
|
||||
|
||||
psi_balancing = proxlb_data["meta"].get("balancing", {}).get("mode", None)
|
||||
if psi_balancing == "psi":
|
||||
logger.warning("Non Proxmox VE 9 systems detected: Deactivating balancing!")
|
||||
proxlb_data["meta"]["balancing"]["enable"] = False
|
||||
|
||||
logger.debug("Finished: validate_available_features.")
|
||||
@@ -10,6 +10,7 @@ __license__ = "GPL-3.0"
|
||||
|
||||
from typing import Dict, Any
|
||||
from utils.logger import SystemdLogger
|
||||
from models.pools import Pools
|
||||
from models.tags import Tags
|
||||
import time
|
||||
|
||||
@@ -35,7 +36,7 @@ class Guests:
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_guests(proxmox_api: any, nodes: Dict[str, Any], meta: Dict[str, Any]) -> Dict[str, Any]:
|
||||
def get_guests(proxmox_api: any, pools: Dict[str, Any], nodes: Dict[str, Any], meta: Dict[str, Any], proxlb_config: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Get metrics of all guests in a Proxmox cluster.
|
||||
|
||||
@@ -46,6 +47,8 @@ class Guests:
|
||||
Args:
|
||||
proxmox_api (any): The Proxmox API client instance.
|
||||
nodes (Dict[str, Any]): A dictionary containing information about the nodes in the Proxmox cluster.
|
||||
meta (Dict[str, Any]): A dictionary containing metadata information.
|
||||
proxmox_config (Dict[str, Any]): A dictionary containing the ProxLB configuration.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing metrics and information for all running guests.
|
||||
@@ -62,24 +65,40 @@ class Guests:
|
||||
# resource metrics for rebalancing to ensure that we do not overprovisiong the node.
|
||||
for guest in proxmox_api.nodes(node).qemu.get():
|
||||
if guest['status'] == 'running':
|
||||
|
||||
guests['guests'][guest['name']] = {}
|
||||
guests['guests'][guest['name']]['name'] = guest['name']
|
||||
guests['guests'][guest['name']]['cpu_total'] = int(guest['cpus'])
|
||||
guests['guests'][guest['name']]['cpu_used'] = Guests.get_guest_cpu_usage(proxmox_api, node, guest['vmid'], guest['name'])
|
||||
guests['guests'][guest['name']]['cpu_used'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', None)
|
||||
guests['guests'][guest['name']]['cpu_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'some')
|
||||
guests['guests'][guest['name']]['cpu_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'full')
|
||||
guests['guests'][guest['name']]['cpu_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['cpu_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['cpu_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['memory_total'] = guest['maxmem']
|
||||
guests['guests'][guest['name']]['memory_used'] = guest['mem']
|
||||
guests['guests'][guest['name']]['memory_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'some')
|
||||
guests['guests'][guest['name']]['memory_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'full')
|
||||
guests['guests'][guest['name']]['memory_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['memory_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['memory_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['disk_total'] = guest['maxdisk']
|
||||
guests['guests'][guest['name']]['disk_used'] = guest['disk']
|
||||
guests['guests'][guest['name']]['disk_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'some')
|
||||
guests['guests'][guest['name']]['disk_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'full')
|
||||
guests['guests'][guest['name']]['disk_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['disk_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['disk_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['id'] = guest['vmid']
|
||||
guests['guests'][guest['name']]['node_current'] = node
|
||||
guests['guests'][guest['name']]['node_target'] = node
|
||||
guests['guests'][guest['name']]['processed'] = False
|
||||
guests['guests'][guest['name']]['pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['tags'] = Tags.get_tags_from_guests(proxmox_api, node, guest['vmid'], 'vm')
|
||||
guests['guests'][guest['name']]['affinity_groups'] = Tags.get_affinity_groups(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['anti_affinity_groups'] = Tags.get_anti_affinity_groups(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['pools'] = Pools.get_pools_for_guest(guest['name'], pools)
|
||||
guests['guests'][guest['name']]['affinity_groups'] = Tags.get_affinity_groups(guests['guests'][guest['name']]['tags'], guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['anti_affinity_groups'] = Tags.get_anti_affinity_groups(guests['guests'][guest['name']]['tags'], guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['ignore'] = Tags.get_ignore(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['node_relationships'] = Tags.get_node_relationships(guests['guests'][guest['name']]['tags'], nodes)
|
||||
guests['guests'][guest['name']]['node_relationships'] = Tags.get_node_relationships(guests['guests'][guest['name']]['tags'], nodes, guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['type'] = 'vm'
|
||||
|
||||
logger.debug(f"Resources of Guest {guest['name']} (type VM) added: {guests['guests'][guest['name']]}")
|
||||
@@ -94,20 +113,37 @@ class Guests:
|
||||
guests['guests'][guest['name']] = {}
|
||||
guests['guests'][guest['name']]['name'] = guest['name']
|
||||
guests['guests'][guest['name']]['cpu_total'] = int(guest['cpus'])
|
||||
guests['guests'][guest['name']]['cpu_used'] = Guests.get_guest_cpu_usage(proxmox_api, node, guest['vmid'], guest['name'])
|
||||
guests['guests'][guest['name']]['cpu_used'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', None)
|
||||
guests['guests'][guest['name']]['cpu_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'some')
|
||||
guests['guests'][guest['name']]['cpu_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'full')
|
||||
guests['guests'][guest['name']]['cpu_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['cpu_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'cpu', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['cpu_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['memory_total'] = guest['maxmem']
|
||||
guests['guests'][guest['name']]['memory_used'] = guest['mem']
|
||||
guests['guests'][guest['name']]['memory_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'some')
|
||||
guests['guests'][guest['name']]['memory_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'full')
|
||||
guests['guests'][guest['name']]['memory_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['memory_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'memory', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['memory_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['disk_total'] = guest['maxdisk']
|
||||
guests['guests'][guest['name']]['disk_used'] = guest['disk']
|
||||
guests['guests'][guest['name']]['disk_pressure_some_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'some')
|
||||
guests['guests'][guest['name']]['disk_pressure_full_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'full')
|
||||
guests['guests'][guest['name']]['disk_pressure_some_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'some', spikes=True)
|
||||
guests['guests'][guest['name']]['disk_pressure_full_spikes_percent'] = Guests.get_guest_rrd_data(proxmox_api, node, guest['vmid'], guest['name'], 'disk', 'full', spikes=True)
|
||||
guests['guests'][guest['name']]['disk_pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['id'] = guest['vmid']
|
||||
guests['guests'][guest['name']]['node_current'] = node
|
||||
guests['guests'][guest['name']]['node_target'] = node
|
||||
guests['guests'][guest['name']]['processed'] = False
|
||||
guests['guests'][guest['name']]['pressure_hot'] = False
|
||||
guests['guests'][guest['name']]['tags'] = Tags.get_tags_from_guests(proxmox_api, node, guest['vmid'], 'ct')
|
||||
guests['guests'][guest['name']]['affinity_groups'] = Tags.get_affinity_groups(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['anti_affinity_groups'] = Tags.get_anti_affinity_groups(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['pools'] = Pools.get_pools_for_guest(guest['name'], pools)
|
||||
guests['guests'][guest['name']]['affinity_groups'] = Tags.get_affinity_groups(guests['guests'][guest['name']]['tags'], guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['anti_affinity_groups'] = Tags.get_anti_affinity_groups(guests['guests'][guest['name']]['tags'], guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['ignore'] = Tags.get_ignore(guests['guests'][guest['name']]['tags'])
|
||||
guests['guests'][guest['name']]['node_relationships'] = Tags.get_node_relationships(guests['guests'][guest['name']]['tags'], nodes)
|
||||
guests['guests'][guest['name']]['node_relationships'] = Tags.get_node_relationships(guests['guests'][guest['name']]['tags'], nodes, guests['guests'][guest['name']]['pools'], proxlb_config)
|
||||
guests['guests'][guest['name']]['type'] = 'ct'
|
||||
|
||||
logger.debug(f"Resources of Guest {guest['name']} (type CT) added: {guests['guests'][guest['name']]}")
|
||||
@@ -118,36 +154,55 @@ class Guests:
|
||||
return guests
|
||||
|
||||
@staticmethod
|
||||
def get_guest_cpu_usage(proxmox_api, node_name: str, vm_id: int, vm_name: str) -> float:
|
||||
def get_guest_rrd_data(proxmox_api, node_name: str, vm_id: int, vm_name: str, object_name: str, object_type: str, spikes=False) -> float:
|
||||
"""
|
||||
Retrieve the average CPU usage of a guest instance (VM/CT) over the past hour.
|
||||
|
||||
This method queries the Proxmox VE API for RRD (Round-Robin Database) data
|
||||
related to CPU usage of a specific guest instance and calculates the average CPU usage
|
||||
over the last hour using the "AVERAGE" consolidation function.
|
||||
Retrieves the rrd data metrics for a specific resource (CPU, memory, disk) of a guest VM or CT.
|
||||
|
||||
Args:
|
||||
proxmox_api: An instance of the Proxmox API client.
|
||||
node_name (str): The name of the Proxmox node hosting the VM.
|
||||
vm_id (int): The unique identifier of the guest instance (VM/CT).
|
||||
vm_name (str): The name of the guest instance (VM/CT).
|
||||
proxmox_api (Any): The Proxmox API client instance.
|
||||
node_name (str): The name of the node hosting the guest.
|
||||
vm_id (int): The ID of the guest VM or CT.
|
||||
vm_name (str): The name of the guest VM or CT.
|
||||
object_name (str): The resource type to query (e.g., 'cpu', 'memory', 'disk').
|
||||
object_type (str, optional): The pressure type ('some', 'full') or None for average usage.
|
||||
spikes (bool, optional): Whether to consider spikes in the calculation. Defaults to False.
|
||||
|
||||
Returns:
|
||||
float: The average CPU usage as a fraction (0.0 to 1.0) over the past hour.
|
||||
Returns 0.0 if no data is available.
|
||||
float: The calculated average usage value for the specified resource.
|
||||
"""
|
||||
logger.debug("Finished: get_guest_cpu_usage.")
|
||||
logger.debug("Starting: get_guest_rrd_data.")
|
||||
time.sleep(0.1)
|
||||
|
||||
try:
|
||||
logger.debug(f"Getting RRD dara for guest: {vm_name}.")
|
||||
guest_data_rrd = proxmox_api.nodes(node_name).qemu(vm_id).rrddata.get(timeframe="hour", cf="AVERAGE")
|
||||
if spikes:
|
||||
logger.debug(f"Getting spike RRD data for {object_name} from guest: {vm_name}.")
|
||||
guest_data_rrd = proxmox_api.nodes(node_name).qemu(vm_id).rrddata.get(timeframe="hour", cf="MAX")
|
||||
else:
|
||||
logger.debug(f"Getting average RRD data for {object_name} from guest: {vm_name}.")
|
||||
guest_data_rrd = proxmox_api.nodes(node_name).qemu(vm_id).rrddata.get(timeframe="hour", cf="AVERAGE")
|
||||
except Exception:
|
||||
logger.error(f"Failed to retrieve RRD data for guest: {vm_name} (ID: {vm_id}) on node: {node_name}. Using 0.0 as CPU usage.")
|
||||
logger.debug("Finished: get_guest_cpu_usage.")
|
||||
return 0.0
|
||||
logger.error(f"Failed to retrieve RRD data for guest: {vm_name} (ID: {vm_id}) on node: {node_name}. Using 0.0 as value.")
|
||||
logger.debug("Finished: get_guest_rrd_data.")
|
||||
return float(0.0)
|
||||
|
||||
cpu_usage = sum(entry.get("cpu", 0.0) for entry in guest_data_rrd) / len(guest_data_rrd)
|
||||
logger.debug(f"CPU RRD data for guest: {vm_name}: {cpu_usage}")
|
||||
logger.debug("Finished: get_guest_cpu_usage.")
|
||||
return cpu_usage
|
||||
if object_type:
|
||||
|
||||
lookup_key = f"pressure{object_name}{object_type}"
|
||||
if spikes:
|
||||
# RRD data is collected every minute, so we look at the last 6 entries
|
||||
# and take the maximum value to represent the spike
|
||||
logger.debug(f"Getting RRD data (spike: {spikes}) of pressure for {object_name} {object_type} from guest: {vm_name}.")
|
||||
rrd_data_value = [row.get(lookup_key) for row in guest_data_rrd if row.get(lookup_key) is not None]
|
||||
rrd_data_value = max(rrd_data_value[-6:], default=0.0)
|
||||
else:
|
||||
# Calculate the average value from the RRD data entries
|
||||
logger.debug(f"Getting RRD data (spike: {spikes}) of pressure for {object_name} {object_type} from guest: {vm_name}.")
|
||||
rrd_data_value = sum(entry.get(lookup_key, 0.0) for entry in guest_data_rrd) / len(guest_data_rrd)
|
||||
|
||||
else:
|
||||
logger.debug(f"Getting RRD data of cpu usage from guest: {vm_name}.")
|
||||
rrd_data_value = sum(entry.get("cpu", 0.0) for entry in guest_data_rrd) / len(guest_data_rrd)
|
||||
|
||||
logger.debug(f"RRD data (spike: {spikes}) for {object_name} from guest: {vm_name}: {rrd_data_value}")
|
||||
logger.debug("Finished: get_guest_rrd_data.")
|
||||
return rrd_data_value
|
||||
|
||||
@@ -21,6 +21,7 @@ __copyright__ = "Copyright (C) 2025 Florian Paul Azim Hoberg (@gyptazy)"
|
||||
__license__ = "GPL-3.0"
|
||||
|
||||
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
from utils.logger import SystemdLogger
|
||||
|
||||
@@ -47,6 +48,7 @@ class Nodes:
|
||||
|
||||
Args:
|
||||
proxmox_api (any): The Proxmox API client instance.
|
||||
proxmox_config (Dict[str, Any]): A dictionary containing the ProxLB configuration.
|
||||
nodes (Dict[str, Any]): A dictionary containing information about the nodes in the Proxmox cluster.
|
||||
|
||||
Returns:
|
||||
@@ -60,6 +62,8 @@ class Nodes:
|
||||
if node["status"] == "online" and not Nodes.set_node_ignore(proxlb_config, node["node"]):
|
||||
nodes["nodes"][node["node"]] = {}
|
||||
nodes["nodes"][node["node"]]["name"] = node["node"]
|
||||
nodes["nodes"][node["node"]]["pve_version"] = Nodes.get_node_pve_version(proxmox_api, node["node"])
|
||||
nodes["nodes"][node["node"]]["pressure_hot"] = False
|
||||
nodes["nodes"][node["node"]]["maintenance"] = False
|
||||
nodes["nodes"][node["node"]]["cpu_total"] = node["maxcpu"]
|
||||
nodes["nodes"][node["node"]]["cpu_assigned"] = 0
|
||||
@@ -68,6 +72,11 @@ class Nodes:
|
||||
nodes["nodes"][node["node"]]["cpu_assigned_percent"] = nodes["nodes"][node["node"]]["cpu_assigned"] / nodes["nodes"][node["node"]]["cpu_total"] * 100
|
||||
nodes["nodes"][node["node"]]["cpu_free_percent"] = nodes["nodes"][node["node"]]["cpu_free"] / node["maxcpu"] * 100
|
||||
nodes["nodes"][node["node"]]["cpu_used_percent"] = nodes["nodes"][node["node"]]["cpu_used"] / node["maxcpu"] * 100
|
||||
nodes["nodes"][node["node"]]["cpu_pressure_some_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "cpu", "some")
|
||||
nodes["nodes"][node["node"]]["cpu_pressure_full_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "cpu", "full")
|
||||
nodes["nodes"][node["node"]]["cpu_pressure_some_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "cpu", "some", spikes=True)
|
||||
nodes["nodes"][node["node"]]["cpu_pressure_full_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "cpu", "full", spikes=True)
|
||||
nodes["nodes"][node["node"]]["cpu_pressure_hot"] = False
|
||||
nodes["nodes"][node["node"]]["memory_total"] = node["maxmem"]
|
||||
nodes["nodes"][node["node"]]["memory_assigned"] = 0
|
||||
nodes["nodes"][node["node"]]["memory_used"] = node["mem"]
|
||||
@@ -75,6 +84,11 @@ class Nodes:
|
||||
nodes["nodes"][node["node"]]["memory_assigned_percent"] = nodes["nodes"][node["node"]]["memory_assigned"] / nodes["nodes"][node["node"]]["memory_total"] * 100
|
||||
nodes["nodes"][node["node"]]["memory_free_percent"] = nodes["nodes"][node["node"]]["memory_free"] / node["maxmem"] * 100
|
||||
nodes["nodes"][node["node"]]["memory_used_percent"] = nodes["nodes"][node["node"]]["memory_used"] / node["maxmem"] * 100
|
||||
nodes["nodes"][node["node"]]["memory_pressure_some_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "memory", "some")
|
||||
nodes["nodes"][node["node"]]["memory_pressure_full_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "memory", "full")
|
||||
nodes["nodes"][node["node"]]["memory_pressure_some_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "memory", "some", spikes=True)
|
||||
nodes["nodes"][node["node"]]["memory_pressure_full_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "memory", "full", spikes=True)
|
||||
nodes["nodes"][node["node"]]["memory_pressure_hot"] = False
|
||||
nodes["nodes"][node["node"]]["disk_total"] = node["maxdisk"]
|
||||
nodes["nodes"][node["node"]]["disk_assigned"] = 0
|
||||
nodes["nodes"][node["node"]]["disk_used"] = node["disk"]
|
||||
@@ -82,11 +96,17 @@ class Nodes:
|
||||
nodes["nodes"][node["node"]]["disk_assigned_percent"] = nodes["nodes"][node["node"]]["disk_assigned"] / nodes["nodes"][node["node"]]["disk_total"] * 100
|
||||
nodes["nodes"][node["node"]]["disk_free_percent"] = nodes["nodes"][node["node"]]["disk_free"] / node["maxdisk"] * 100
|
||||
nodes["nodes"][node["node"]]["disk_used_percent"] = nodes["nodes"][node["node"]]["disk_used"] / node["maxdisk"] * 100
|
||||
nodes["nodes"][node["node"]]["disk_pressure_some_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "disk", "some")
|
||||
nodes["nodes"][node["node"]]["disk_pressure_full_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "disk", "full")
|
||||
nodes["nodes"][node["node"]]["disk_pressure_some_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "disk", "some", spikes=True)
|
||||
nodes["nodes"][node["node"]]["disk_pressure_full_spikes_percent"] = Nodes.get_node_rrd_data(proxmox_api, node["node"], "disk", "full", spikes=True)
|
||||
nodes["nodes"][node["node"]]["disk_pressure_hot"] = False
|
||||
|
||||
# Evaluate if node should be set to maintenance mode
|
||||
if Nodes.set_node_maintenance(proxmox_api, proxlb_config, node["node"]):
|
||||
nodes["nodes"][node["node"]]["maintenance"] = True
|
||||
|
||||
logger.debug(f"Node metrics collected: {nodes}")
|
||||
logger.debug("Finished: get_nodes.")
|
||||
return nodes
|
||||
|
||||
@@ -153,3 +173,83 @@ class Nodes:
|
||||
return True
|
||||
|
||||
logger.debug("Finished: set_node_ignore.")
|
||||
|
||||
@staticmethod
|
||||
def get_node_rrd_data(proxmox_api, node_name: str, object_name: str, object_type: str, spikes=False) -> float:
|
||||
"""
|
||||
Retrieves the rrd data metrics for a specific resource (CPU, memory, disk) of a node.
|
||||
|
||||
Args:
|
||||
proxmox_api (Any): The Proxmox API client instance.
|
||||
node_name (str): The name of the node hosting the guest.
|
||||
object_name (str): The resource type to query (e.g., 'cpu', 'memory', 'disk').
|
||||
object_type (str, optional): The pressure type ('some', 'full') or None for average usage.
|
||||
spikes (bool, optional): Whether to consider spikes in the calculation. Defaults to False.
|
||||
|
||||
Returns:
|
||||
float: The calculated average usage value for the specified resource.
|
||||
"""
|
||||
logger.debug("Starting: get_node_rrd_data.")
|
||||
time.sleep(0.1)
|
||||
|
||||
try:
|
||||
if spikes:
|
||||
logger.debug(f"Getting spike RRD data for {object_name} from node: {node_name}.")
|
||||
node_data_rrd = proxmox_api.nodes(node_name).rrddata.get(timeframe="hour", cf="MAX")
|
||||
else:
|
||||
logger.debug(f"Getting average RRD data for {object_name} from node: {node_name}.")
|
||||
node_data_rrd = proxmox_api.nodes(node_name).rrddata.get(timeframe="hour", cf="AVERAGE")
|
||||
|
||||
except Exception:
|
||||
logger.error(f"Failed to retrieve RRD data for guest: {node_name}. Using 0.0 as value.")
|
||||
logger.debug("Finished: get_node_rrd_data.")
|
||||
return 0.0
|
||||
|
||||
lookup_key = f"pressure{object_name}{object_type}"
|
||||
|
||||
if spikes:
|
||||
# RRD data is collected every minute, so we look at the last 6 entries
|
||||
# and take the maximum value to represent the spike
|
||||
rrd_data_value = [row.get(lookup_key) for row in node_data_rrd if row.get(lookup_key) is not None]
|
||||
rrd_data_value = max(rrd_data_value[-6:], default=0.0)
|
||||
else:
|
||||
# Calculate the average value from the RRD data entries
|
||||
rrd_data_value = sum(entry.get(lookup_key, 0.0) for entry in node_data_rrd) / len(node_data_rrd)
|
||||
|
||||
logger.debug(f"RRD data (spike: {spikes}) for {object_name} from node: {node_name}: {rrd_data_value}")
|
||||
logger.debug("Finished: get_node_rrd_data.")
|
||||
return rrd_data_value
|
||||
|
||||
@staticmethod
|
||||
def get_node_pve_version(proxmox_api, node_name: str) -> float:
|
||||
"""
|
||||
Return the Proxmox VE (PVE) version for a given node by querying the Proxmox API.
|
||||
|
||||
This function calls proxmox_api.nodes(node_name).version.get() and extracts the
|
||||
'version' field from the returned mapping. The value is expected to be numeric
|
||||
(or convertible to float) and is returned as a float.
|
||||
|
||||
Args:
|
||||
proxmox_api (Any): The Proxmox API client instance.
|
||||
node_name (str): The name of the node hosting the guest.
|
||||
|
||||
Returns:
|
||||
float: The PVE version for the specified node as a floating point number.
|
||||
|
||||
Raises:
|
||||
Exception: If the proxmox_api call fails, returns an unexpected structure, or the
|
||||
'version' field is missing or cannot be converted to float. Callers should
|
||||
handle or propagate exceptions as appropriate.
|
||||
"""
|
||||
logger.debug("Starting: get_node_pve_version.")
|
||||
time.sleep(0.1)
|
||||
|
||||
try:
|
||||
logger.debug(f"Trying to get PVE version for node: {node_name}.")
|
||||
version = proxmox_api.nodes(node_name).version.get()
|
||||
except Exception:
|
||||
logger.error(f"Failed to get PVE version for node: {node_name}.")
|
||||
|
||||
logger.debug(f"Got version {version['version']} for node {node_name}.")
|
||||
logger.debug("Finished: get_node_pve_version.")
|
||||
return version["version"]
|
||||
|
||||
117
proxlb/models/pools.py
Normal file
117
proxlb/models/pools.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""
|
||||
The Pools class retrieves all present pools defined on a Proxmox cluster
|
||||
including the chield objects.
|
||||
"""
|
||||
|
||||
__author__ = "Florian Paul Azim Hoberg <gyptazy>"
|
||||
__copyright__ = "Copyright (C) 2025 Florian Paul Azim Hoberg (@gyptazy)"
|
||||
__license__ = "GPL-3.0"
|
||||
|
||||
|
||||
from typing import Dict, Any
|
||||
from utils.logger import SystemdLogger
|
||||
from models.tags import Tags
|
||||
import time
|
||||
|
||||
logger = SystemdLogger()
|
||||
|
||||
|
||||
class Pools:
|
||||
"""
|
||||
The Pools class retrieves all present pools defined on a Proxmox cluster
|
||||
including the chield objects.
|
||||
|
||||
Methods:
|
||||
__init__:
|
||||
Initializes the Pools class.
|
||||
|
||||
get_pools(proxmox_api: any) -> Dict[str, Any]:
|
||||
Retrieve pool definitions and membership from the Proxmox cluster.
|
||||
Returns a dict with a top-level "pools" mapping each poolid to
|
||||
{"name": <poolid>, "members": [<member_names>...]}.
|
||||
This method does not collect per-member metrics or perform node filtering.
|
||||
"""
|
||||
def __init__(self):
|
||||
"""
|
||||
Initializes the Pools class with the provided ProxLB data.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def get_pools(proxmox_api: any) -> Dict[str, Any]:
|
||||
"""
|
||||
Retrieve all pools and their members from a Proxmox cluster.
|
||||
|
||||
Queries the Proxmox API for pool definitions and returns a dictionary
|
||||
containing each pool's id/name and a list of its member VM/CT names.
|
||||
This function does not perform per-member metric collection or node
|
||||
filtering — it only gathers pool membership information.
|
||||
|
||||
Args:
|
||||
proxmox_api (any): Proxmox API client instance.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: Dictionary with a top-level "pools" key mapping poolid
|
||||
to {"name": <poolid>, "members": [<member_names>...]}.
|
||||
"""
|
||||
logger.debug("Starting: get_pools.")
|
||||
pools = {"pools": {}}
|
||||
|
||||
# Pool objects: iterate over all pools in the cluster.
|
||||
# We keep pool members even if their nodes are ignored so resource accounting
|
||||
# for rebalancing remains correct and we avoid overprovisioning nodes.
|
||||
for pool in proxmox_api.pools.get():
|
||||
logger.debug(f"Got pool: {pool['poolid']}")
|
||||
pools['pools'][pool['poolid']] = {}
|
||||
pools['pools'][pool['poolid']]['name'] = pool['poolid']
|
||||
pools['pools'][pool['poolid']]['members'] = []
|
||||
|
||||
# Fetch pool details and collect member names
|
||||
pool_details = proxmox_api.pools(pool['poolid']).get()
|
||||
for member in pool_details.get("members", []):
|
||||
|
||||
# We might also have objects without the key "name", e.g. storage pools
|
||||
if "name" not in member:
|
||||
logger.debug(f"Skipping member without name in pool: {pool['poolid']}")
|
||||
continue
|
||||
|
||||
logger.debug(f"Got member: {member['name']} for pool: {pool['poolid']}")
|
||||
pools['pools'][pool['poolid']]['members'].append(member["name"])
|
||||
|
||||
logger.debug("Finished: get_pools.")
|
||||
return pools
|
||||
|
||||
@staticmethod
|
||||
def get_pools_for_guest(guest_name: str, pools: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Return the list of pool names that include the given guest.
|
||||
|
||||
Args:
|
||||
guest_name (str): Name of the VM or CT to look up.
|
||||
pools (Dict[str, Any]): Pools structure as returned by get_pools(),
|
||||
expected to contain a top-level "pools" mapping each poolid to
|
||||
{"name": <poolid>, "members": [<member_names>...]}.
|
||||
|
||||
Returns:
|
||||
list[str]: Names of pools the guest is a member of (empty list if none).
|
||||
"""
|
||||
logger.debug("Starting: get_pools_for_guests.")
|
||||
guest_pools = []
|
||||
|
||||
for pool in pools.items():
|
||||
for pool_id, pool_data in pool[1].items():
|
||||
|
||||
if type(pool_data) is dict:
|
||||
pool_name = pool_data.get("name", "")
|
||||
pool_name_members = pool_data.get("members", [])
|
||||
|
||||
if guest_name in pool_name_members:
|
||||
logger.debug(f"Guest: {guest_name} is member of Pool: {pool_name}.")
|
||||
guest_pools.append(pool_name)
|
||||
else:
|
||||
logger.debug(f"Guest: {guest_name} is NOT member of Pool: {pool_name}.")
|
||||
|
||||
else:
|
||||
logger.debug(f"Pool data for pool_id {pool_id} is not a dict: {pool_data}")
|
||||
|
||||
logger.debug("Finished: get_pools_for_guests.")
|
||||
return guest_pools
|
||||
@@ -80,15 +80,18 @@ class Tags:
|
||||
return tags
|
||||
|
||||
@staticmethod
|
||||
def get_affinity_groups(tags: List[str]) -> List[str]:
|
||||
def get_affinity_groups(tags: List[str], pools: List[str], proxlb_config: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Get affinity tags for a guest from the Proxmox cluster by the API.
|
||||
|
||||
This method retrieves all tags for a given guest and evaluates the
|
||||
affinity tags which are required during the balancing calculations.
|
||||
This method retrieves all tags for a given guest or based on a
|
||||
membership of a pool and evaluates the affinity groups which are
|
||||
required during the balancing calculations.
|
||||
|
||||
Args:
|
||||
tags (List): A list holding all defined tags for a given guest.
|
||||
pools (List): A list holding all defined pools for a given guest.
|
||||
proxlb_config (Dict): A dict holding the ProxLB configuration.
|
||||
|
||||
Returns:
|
||||
List: A list including all affinity tags for the given guest.
|
||||
@@ -99,21 +102,36 @@ class Tags:
|
||||
if len(tags) > 0:
|
||||
for tag in tags:
|
||||
if tag.startswith("plb_affinity"):
|
||||
logger.debug(f"Adding affinity group for tag {tag}.")
|
||||
affinity_tags.append(tag)
|
||||
else:
|
||||
logger.debug(f"Skipping affinity group for tag {tag}.")
|
||||
|
||||
if len(pools) > 0:
|
||||
for pool in pools:
|
||||
if pool in (proxlb_config['balancing'].get('pools') or {}):
|
||||
if proxlb_config['balancing']['pools'][pool].get('type', None) == 'affinity':
|
||||
logger.debug(f"Adding affinity group for pool {pool}.")
|
||||
affinity_tags.append(pool)
|
||||
else:
|
||||
logger.debug(f"Skipping affinity group for pool {pool}.")
|
||||
|
||||
logger.debug("Finished: get_affinity_groups.")
|
||||
return affinity_tags
|
||||
|
||||
@staticmethod
|
||||
def get_anti_affinity_groups(tags: List[str]) -> List[str]:
|
||||
def get_anti_affinity_groups(tags: List[str], pools: List[str], proxlb_config: Dict[str, Any]) -> List[str]:
|
||||
"""
|
||||
Get anti-affinity tags for a guest from the Proxmox cluster by the API.
|
||||
|
||||
This method retrieves all tags for a given guest and evaluates the
|
||||
anti-affinity tags which are required during the balancing calculations.
|
||||
This method retrieves all tags for a given guest or based on a
|
||||
membership of a pool and evaluates the anti-affinity groups which
|
||||
are required during the balancing calculations.
|
||||
|
||||
Args:
|
||||
tags (List): A list holding all defined tags for a given guest.
|
||||
pools (List): A list holding all defined pools for a given guest.
|
||||
proxlb_config (Dict): A dict holding the ProxLB configuration.
|
||||
|
||||
Returns:
|
||||
List: A list including all anti-affinity tags for the given guest..
|
||||
@@ -124,7 +142,19 @@ class Tags:
|
||||
if len(tags) > 0:
|
||||
for tag in tags:
|
||||
if tag.startswith("plb_anti_affinity"):
|
||||
logger.debug(f"Adding anti-affinity group for tag {tag}.")
|
||||
anti_affinity_tags.append(tag)
|
||||
else:
|
||||
logger.debug(f"Skipping anti-affinity group for tag {tag}.")
|
||||
|
||||
if len(pools) > 0:
|
||||
for pool in pools:
|
||||
if pool in (proxlb_config['balancing'].get('pools') or {}):
|
||||
if proxlb_config['balancing']['pools'][pool].get('type', None) == 'anti-affinity':
|
||||
logger.debug(f"Adding anti-affinity group for pool {pool}.")
|
||||
anti_affinity_tags.append(pool)
|
||||
else:
|
||||
logger.debug(f"Skipping anti-affinity group for pool {pool}.")
|
||||
|
||||
logger.debug("Finished: get_anti_affinity_groups.")
|
||||
return anti_affinity_tags
|
||||
@@ -155,10 +185,10 @@ class Tags:
|
||||
return ignore_tag
|
||||
|
||||
@staticmethod
|
||||
def get_node_relationships(tags: List[str], nodes: Dict[str, Any]) -> str:
|
||||
def get_node_relationships(tags: List[str], nodes: Dict[str, Any], pools: List[str], proxlb_config: Dict[str, Any]) -> str:
|
||||
"""
|
||||
Get a node relationship tag for a guest from the Proxmox cluster by the API to pin
|
||||
a guest to a node.
|
||||
a guest to a node or by defined pools from ProxLB configuration.
|
||||
|
||||
This method retrieves a relationship tag between a guest and a specific
|
||||
hypervisor node to pin the guest to a specific node (e.g., for licensing reason).
|
||||
@@ -166,24 +196,44 @@ class Tags:
|
||||
Args:
|
||||
tags (List): A list holding all defined tags for a given guest.
|
||||
nodes (Dict): A dictionary holding all available nodes in the cluster.
|
||||
pools (List): A list holding all defined pools for a given guest.
|
||||
proxlb_config (Dict): A dict holding the ProxLB configuration.
|
||||
|
||||
Returns:
|
||||
Str: The related hypervisor node name.
|
||||
Str: The related hypervisor node name(s).
|
||||
"""
|
||||
logger.debug("Starting: get_node_relationships.")
|
||||
node_relationship_tags = []
|
||||
|
||||
if len(tags) > 0:
|
||||
logger.debug("Validating node pinning by tags.")
|
||||
for tag in tags:
|
||||
if tag.startswith("plb_pin"):
|
||||
node_relationship_tag = tag.replace("plb_pin_", "")
|
||||
|
||||
# Validate if the node to pin is present in the cluster
|
||||
if Helper.validate_node_presence(node_relationship_tag, nodes):
|
||||
logger.info(f"Tag {node_relationship_tag} is valid! Defined node exists in the cluster.")
|
||||
logger.debug(f"Tag {node_relationship_tag} is valid! Defined node exists in the cluster.")
|
||||
logger.debug(f"Setting node relationship because of tag {tag} to {node_relationship_tag}.")
|
||||
node_relationship_tags.append(node_relationship_tag)
|
||||
else:
|
||||
logger.warning(f"Tag {node_relationship_tag} is invalid! Defined node does not exist in the cluster. Not applying pinning.")
|
||||
|
||||
if len(pools) > 0:
|
||||
logger.debug("Validating node pinning by pools.")
|
||||
for pool in pools:
|
||||
if pool in (proxlb_config['balancing'].get('pools') or {}):
|
||||
|
||||
node = proxlb_config['balancing']['pools'][pool].get('pin', None)
|
||||
# Validate if the node to pin is present in the cluster
|
||||
if Helper.validate_node_presence(node, nodes):
|
||||
logger.debug(f"Pool pinning tag {node} is valid! Defined node exists in the cluster.")
|
||||
logger.debug(f"Setting node relationship because of pool {pool} to {node}.")
|
||||
node_relationship_tags.append(node)
|
||||
else:
|
||||
logger.warning(f"Pool pinning tag {node} is invalid! Defined node does not exist in the cluster. Not applying pinning.")
|
||||
else:
|
||||
logger.debug(f"Skipping pinning for pool {pool}. Pool is not defined in ProxLB configuration.")
|
||||
|
||||
logger.debug("Finished: get_node_relationships.")
|
||||
return node_relationship_tags
|
||||
|
||||
@@ -11,6 +11,7 @@ __license__ = "GPL-3.0"
|
||||
import json
|
||||
import uuid
|
||||
import re
|
||||
import socket
|
||||
import sys
|
||||
import time
|
||||
import utils.version
|
||||
@@ -307,3 +308,28 @@ class Helper:
|
||||
logger.warning(f"Node {node} not found in cluster. Not applying pinning!")
|
||||
logger.debug("Finished: validate_node_presence.")
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def tcp_connect_test(addr_family: int, host: str, port: int, timeout: int) -> tuple[bool, int | None]:
|
||||
"""
|
||||
Attempt a TCP connection to the specified host and port to test the reachability.
|
||||
|
||||
Args:
|
||||
addr_family (int): Address family for the socket (e.g., socket.AF_INET for IPv4, socket.AF_INET6 for IPv6).
|
||||
host (str): The hostname or IP address to connect to.
|
||||
port (int): The port number to connect to.
|
||||
timeout (int): Connection timeout in seconds.
|
||||
|
||||
Returns:
|
||||
tuple[bool, int | None]: A tuple containing:
|
||||
- bool: True if the connection was successful, False otherwise.
|
||||
- int | None: None if the connection was successful, otherwise the errno code indicating the reason for failure.
|
||||
"""
|
||||
test_socket = socket.socket(addr_family, socket.SOCK_STREAM)
|
||||
test_socket.settimeout(timeout)
|
||||
|
||||
try:
|
||||
rc = test_socket.connect_ex((host, port))
|
||||
return (rc == 0, rc if rc != 0 else None)
|
||||
finally:
|
||||
test_socket.close()
|
||||
|
||||
@@ -88,7 +88,7 @@ class SystemdLogger:
|
||||
# logging is preferred.
|
||||
if SYSTEMD_PRESENT:
|
||||
# Add a JournalHandler for systemd integration
|
||||
handler = JournalHandler()
|
||||
handler = JournalHandler(SYSLOG_IDENTIFIER="ProxLB")
|
||||
else:
|
||||
# Add a stdout handler as a fallback
|
||||
handler = logging.StreamHandler(sys.stdout)
|
||||
|
||||
@@ -13,6 +13,7 @@ __copyright__ = "Copyright (C) 2025 Florian Paul Azim Hoberg (@gyptazy)"
|
||||
__license__ = "GPL-3.0"
|
||||
|
||||
|
||||
import errno
|
||||
try:
|
||||
import proxmoxer
|
||||
PROXMOXER_PRESENT = True
|
||||
@@ -175,49 +176,40 @@ class ProxmoxApi:
|
||||
logger.debug("Starting: api_connect_get_hosts.")
|
||||
# Pre-validate the given API endpoints
|
||||
if not isinstance(proxmox_api_endpoints, list):
|
||||
logger.critical(f"The proxmox_api hosts are not defined as a list type.")
|
||||
logger.critical("The proxmox_api hosts are not defined as a list type.")
|
||||
sys.exit(1)
|
||||
|
||||
if not proxmox_api_endpoints:
|
||||
logger.critical(f"No proxmox_api hosts are defined.")
|
||||
logger.critical("No proxmox_api hosts are defined.")
|
||||
sys.exit(1)
|
||||
|
||||
if len(proxmox_api_endpoints) == 0:
|
||||
logger.critical(f"No proxmox_api hosts are defined.")
|
||||
sys.exit(1)
|
||||
validated_api_hosts: list[tuple[str, int]] = []
|
||||
|
||||
# If we have multiple Proxmox API endpoints, we need to check each one by
|
||||
# doing a connection attempt for IPv4 and IPv6. If we find a working one,
|
||||
# we return that one. This allows us to define multiple endpoints in a cluster.
|
||||
validated_api_hosts = []
|
||||
for host in proxmox_api_endpoints:
|
||||
retries = proxlb_config["proxmox_api"].get("retries", 1)
|
||||
wait_time = proxlb_config["proxmox_api"].get("wait_time", 1)
|
||||
|
||||
# Get or set a default value for a maximum of retries when connecting to
|
||||
# the Proxmox API
|
||||
api_connection_retries = proxlb_config["proxmox_api"].get("retries", 1)
|
||||
api_connection_wait_time = proxlb_config["proxmox_api"].get("wait_time", 1)
|
||||
|
||||
for api_connection_attempt in range(api_connection_retries):
|
||||
validated_api_host, api_port = self.test_api_proxmox_host(host)
|
||||
if validated_api_host:
|
||||
validated_api_hosts.append(validated_api_host)
|
||||
for attempt in range(retries):
|
||||
candidate_host, candidate_port = self.test_api_proxmox_host(host)
|
||||
if candidate_host:
|
||||
validated_api_hosts.append((candidate_host, candidate_port))
|
||||
break
|
||||
else:
|
||||
logger.warning(f"Attempt {api_connection_attempt + 1}/{api_connection_retries} failed for host {host}. Retrying in {api_connection_wait_time} seconds...")
|
||||
time.sleep(api_connection_wait_time)
|
||||
logger.warning(
|
||||
f"Attempt {attempt + 1}/{retries} failed for host {host}. "
|
||||
f"Retrying in {wait_time} seconds..."
|
||||
)
|
||||
time.sleep(wait_time)
|
||||
|
||||
if len(validated_api_hosts) > 0:
|
||||
# Choose a random host to distribute the load across the cluster
|
||||
# as a simple load balancing mechanism.
|
||||
return random.choice(validated_api_hosts), api_port
|
||||
if validated_api_hosts:
|
||||
chosen_host, chosen_port = random.choice(validated_api_hosts)
|
||||
return chosen_host, chosen_port
|
||||
|
||||
logger.critical("No valid Proxmox API hosts found.")
|
||||
print("No valid Proxmox API hosts found.")
|
||||
|
||||
logger.debug("Finished: api_connect_get_hosts.")
|
||||
sys.exit(1)
|
||||
|
||||
def test_api_proxmox_host(self, host: str) -> str:
|
||||
def test_api_proxmox_host(self, host: str) -> tuple[str, int | None, None]:
|
||||
"""
|
||||
Tests the connectivity to a Proxmox host by resolving its IP address and
|
||||
checking both IPv4 and IPv6 addresses.
|
||||
@@ -237,31 +229,36 @@ class ProxmoxApi:
|
||||
"""
|
||||
logger.debug("Starting: test_api_proxmox_host.")
|
||||
|
||||
# Validate for custom ports in API hosts which might indicate
|
||||
# that an external loadbalancer will be used.
|
||||
# Validate for custom port configurations (e.g., by given external
|
||||
# loadbalancer systems)
|
||||
host, port = Helper.get_host_port_from_string(host)
|
||||
if port is None:
|
||||
port = 8006
|
||||
|
||||
# Try resolving DNS to IP and log non-resolvable ones
|
||||
try:
|
||||
ip = socket.getaddrinfo(host, None, socket.AF_UNSPEC)
|
||||
infos = socket.getaddrinfo(host, None, socket.AF_UNSPEC)
|
||||
except socket.gaierror:
|
||||
logger.warning(f"Could not resolve {host}.")
|
||||
return False
|
||||
return (None, None)
|
||||
|
||||
# Validate if given object is IPv4 or IPv6
|
||||
for address_type in ip:
|
||||
if address_type[0] == socket.AF_INET:
|
||||
logger.debug(f"{host} is type ipv4.")
|
||||
if self.test_api_proxmox_host_ipv4(host, port):
|
||||
return host, port
|
||||
elif address_type[0] == socket.AF_INET6:
|
||||
logger.debug(f"{host} is type ipv6.")
|
||||
if self.test_api_proxmox_host_ipv6(host, port):
|
||||
return host, port
|
||||
else:
|
||||
return False
|
||||
# Check both families that are actually present
|
||||
saw_family = set()
|
||||
for family, *_rest in infos:
|
||||
saw_family.add(family)
|
||||
|
||||
logger.debug("Finished: test_api_proxmox_host.")
|
||||
if socket.AF_INET in saw_family:
|
||||
logger.debug(f"{host} has IPv4.")
|
||||
if self.test_api_proxmox_host_ipv4(host, port):
|
||||
return (host, port)
|
||||
|
||||
if socket.AF_INET6 in saw_family:
|
||||
logger.debug(f"{host} has IPv6.")
|
||||
if self.test_api_proxmox_host_ipv6(host, port):
|
||||
return (host, port)
|
||||
|
||||
logger.debug("Finished: test_api_proxmox_host (unreachable).")
|
||||
return (None, None)
|
||||
|
||||
def test_api_proxmox_host_ipv4(self, host: str, port: int = 8006, timeout: int = 1) -> bool:
|
||||
"""
|
||||
@@ -280,18 +277,16 @@ class ProxmoxApi:
|
||||
bool: True if the host is reachable on the specified port, False otherwise.
|
||||
"""
|
||||
logger.debug("Starting: test_api_proxmox_host_ipv4.")
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
sock.settimeout(timeout)
|
||||
logger.warning(f"Warning: Host {host} ran into a timeout when connecting on IPv4 for tcp/{port}.")
|
||||
result = sock.connect_ex((host, port))
|
||||
|
||||
if result == 0:
|
||||
sock.close()
|
||||
ok, rc = Helper.tcp_connect_test(socket.AF_INET, host, port, timeout)
|
||||
if ok:
|
||||
logger.debug(f"Host {host} is reachable on IPv4 for tcp/{port}.")
|
||||
logger.debug("Finished: test_api_proxmox_host_ipv4.")
|
||||
return True
|
||||
|
||||
sock.close()
|
||||
logger.warning(f"Host {host} is unreachable on IPv4 for tcp/{port}.")
|
||||
if rc == errno.ETIMEDOUT:
|
||||
logger.warning(f"Timeout connecting to {host} on IPv4 tcp/{port}.")
|
||||
else:
|
||||
logger.warning(f"Host {host} is unreachable on IPv4 for tcp/{port} (errno {rc}).")
|
||||
|
||||
logger.debug("Finished: test_api_proxmox_host_ipv4.")
|
||||
return False
|
||||
@@ -313,18 +308,16 @@ class ProxmoxApi:
|
||||
bool: True if the host is reachable on the specified port, False otherwise.
|
||||
"""
|
||||
logger.debug("Starting: test_api_proxmox_host_ipv6.")
|
||||
sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
|
||||
sock.settimeout(timeout)
|
||||
logger.warning(f"Host {host} ran into a timeout when connecting via IPv6 for tcp/{port}.")
|
||||
result = sock.connect_ex((host, port))
|
||||
|
||||
if result == 0:
|
||||
sock.close()
|
||||
ok, rc = Helper.tcp_connect_test(socket.AF_INET6, host, port, timeout)
|
||||
if ok:
|
||||
logger.debug(f"Host {host} is reachable on IPv6 for tcp/{port}.")
|
||||
logger.debug("Finished: test_api_proxmox_host_ipv6.")
|
||||
return True
|
||||
|
||||
sock.close()
|
||||
logger.warning(f"Host {host} is unreachable on IPv6 for tcp/{port}.")
|
||||
if rc == errno.ETIMEDOUT:
|
||||
logger.warning(f"Timeout connecting to {host} on IPv6 tcp/{port}.")
|
||||
else:
|
||||
logger.warning(f"Host {host} is unreachable on IPv6 for tcp/{port} (errno {rc}).")
|
||||
|
||||
logger.debug("Finished: test_api_proxmox_host_ipv6.")
|
||||
return False
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
__app_name__ = "ProxLB"
|
||||
__app_desc__ = "A DRS alike loadbalancer for Proxmox clusters."
|
||||
__app_desc__ = "An advanced resource scheduler and load balancer for Proxmox clusters."
|
||||
__author__ = "Florian Paul Azim Hoberg <gyptazy>"
|
||||
__copyright__ = "Copyright (C) 2025 Florian Paul Azim Hoberg (@gyptazy)"
|
||||
__license__ = "GPL-3.0"
|
||||
__version__ = "1.1.7"
|
||||
__version__ = "1.1.10"
|
||||
__url__ = "https://github.com/gyptazy/ProxLB"
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
packaging
|
||||
proxmoxer
|
||||
requests
|
||||
urllib3
|
||||
PyYAML
|
||||
PyYAML
|
||||
|
||||
6
setup.py
6
setup.py
@@ -2,9 +2,9 @@ from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="proxlb",
|
||||
version="1.1.7",
|
||||
description="A DRS alike loadbalancer for Proxmox clusters.",
|
||||
long_description="An advanced DRS alike loadbalancer for Proxmox clusters that also supports maintenance modes and affinity/anti-affinity rules.",
|
||||
version="1.1.10",
|
||||
description="An advanced resource scheduler and load balancer for Proxmox clusters.",
|
||||
long_description="An advanced resource scheduler and load balancer for Proxmox clusters that also supports maintenance modes and affinity/anti-affinity rules.",
|
||||
author="Florian Paul Azim Hoberg",
|
||||
author_email="gyptazy@gyptazy.com",
|
||||
maintainer="Florian Paul Azim Hoberg",
|
||||
|
||||
Reference in New Issue
Block a user