Pangolin postgres excessive CPU and Disk I/O causing VM to be inaccessible and unresponsive #1024

Closed
opened 2026-04-05 18:07:33 +02:00 by MrUnknownDE · 0 comments
Owner

Originally created by @abpai94 on 8/25/2025

Version: pangolin:postgresql-1.8.0
Deployment: docker

I found the server inaccessible, with CPU usage at 150% and disk I/O hitting 11,000 blocks/s. I am unsure of the cause of the issue, but I saw the following logs, which might help others if they encounter the same issue. I will update to the newer 1.9.0, as it was released recently, which will hopefully mitigate the issue.

The Pangolin web UI and all services connected to it were completely inaccessible. I couldn't even SSH into the machine.

useDockerSocket initialized for site ID: 3
Error: Failed query: select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2
params: cloud.example.co.uk,1
    at async SE (server/routers/badger/verifySession.ts:141:29)
  139 |
  140 |         if (!resourceData) {
> 141 |             const [result] = await db
      |                             ^
  142 |                 .select()
  143 |                 .from(resources)
  144 |                 .leftJoin( {
  query: 'select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2',
  params: [Array],
  [cause]: Error: Connection terminated due to connection timeout
      at async SE (server/routers/badger/verifySession.ts:141:29)
    139 |
    140 |         if (!resourceData) {
  > 141 |             const [result] = await db
        |                             ^
    142 |                 .select()
    143 |                 .from(resources)
    144 |                 .leftJoin( {
    [cause]: [Error: Connection terminated unexpectedly]
  }
}
2025-08-25T07:59:31.620Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T07:57:24.627Z"],"cause":{}}}
2025-08-25T07:59:32.444Z [error]: Error updating bandwidth data: Connection terminated due to connection timeout
Stack: Error: Connection terminated due to connection timeout
    at /app/node_modules/pg-pool/index.js:45:11
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async NodePgSession.transaction (file:///app/node_modules/drizzle-orm/node-postgres/session.js:181:69)
    at async gE (file:///app/dist/server.mjs:32:94004) {"cause":{}}
Error: Failed query: select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2
params: cloud.example.co.uk,1
    at async SE (server/routers/badger/verifySession.ts:141:29)
  139 |
  140 |         if (!resourceData) {
> 141 |             const [result] = await db
      |                             ^
  142 |                 .select()
  143 |                 .from(resources)
  144 |                 .leftJoin( {
  query: 'select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2',
  params: [Array],
  [cause]: Error: Connection terminated due to connection timeout
      at async SE (server/routers/badger/verifySession.ts:141:29)
    139 |
    140 |         if (!resourceData) {
  > 141 |             const [result] = await db
        |                             ^
    142 |                 .select()
    143 |                 .from(resources)
    144 |                 .leftJoin( {
    [cause]: [Error: Connection terminated unexpectedly]
  }
}
2025-08-25T08:10:09.171Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:05:43.579Z"],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}}
2025-08-25T08:10:09.594Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756109285329
Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756109285329
    at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756109285329],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}
2025-08-25T08:10:10.329Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756108931320
Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756108931320
    at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756108931320],"cause":{}}
2025-08-25T08:10:10.606Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:00:01.450Z"],"cause":{}}}
2025-08-25T08:14:57.322Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:11:40.132Z"],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}}
2025-08-25T08:15:00.049Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756109634470
Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1
params: 1756109634470
    at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
    at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756109634470],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}
2025-08-25T08:15:04.088Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:08:10.716Z"],"cause":{}}}
2025-08-25T08:16:17.614Z [error]: Failed to build Traefik config: Error: Connection terminated due to connection timeout

It's entirely possible it's an issue with the postgres instance I am running. I will add more information if/when I discover it.

*Originally created by @abpai94 on 8/25/2025* Version: `pangolin:postgresql-1.8.0` Deployment: `docker` I found the server inaccessible with CPU usuage at 150% and the disk IO hitting 11,000 blocks/s. I am unsure the cause of the issue but saw the following logs which might help other if they encounter the same issue. I will update to the newer 1.9.0 as it was released recently hopefully mitigating the issue. The pangolin web UI and all services connected to it were completely inaccessibly. I couldn't even SSH into the machine. ``` useDockerSocket initialized for site ID: 3 Error: Failed query: select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2 params: cloud.example.co.uk,1 at async SE (server/routers/badger/verifySession.ts:141:29) 139 | 140 | if (!resourceData) { > 141 | const [result] = await db | ^ 142 | .select() 143 | .from(resources) 144 | .leftJoin( { query: 'select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", 
"resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2', params: [Array], [cause]: Error: Connection terminated due to connection timeout at async SE (server/routers/badger/verifySession.ts:141:29) 139 | 140 | if (!resourceData) { > 141 | const [result] = await db | ^ 142 | .select() 143 | .from(resources) 144 | .leftJoin( { [cause]: [Error: Connection terminated unexpectedly] } } 2025-08-25T07:59:31.620Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T07:57:24.627Z"],"cause":{}}} 2025-08-25T07:59:32.444Z [error]: Error updating bandwidth data: Connection terminated due to connection timeout Stack: Error: Connection terminated due to connection timeout at /app/node_modules/pg-pool/index.js:45:11 at process.processTicksAndRejections (node:internal/process/task_queues:95:5) at async NodePgSession.transaction (file:///app/node_modules/drizzle-orm/node-postgres/session.js:181:69) at async gE (file:///app/dist/server.mjs:32:94004) {"cause":{}} Error: Failed query: select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", 
"resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 limit $2 params: cloud.example.co.uk,1 at async SE (server/routers/badger/verifySession.ts:141:29) 139 | 140 | if (!resourceData) { > 141 | const [result] = await db | ^ 142 | .select() 143 | .from(resources) 144 | .leftJoin( { query: 'select "resources"."resourceId", "resources"."siteId", "resources"."orgId", "resources"."name", "resources"."subdomain", "resources"."fullDomain", "resources"."domainId", "resources"."ssl", "resources"."blockAccess", "resources"."sso", "resources"."http", "resources"."protocol", "resources"."proxyPort", "resources"."emailWhitelistEnabled", "resources"."applyRules", "resources"."enabled", "resources"."stickySession", "resources"."tlsServerName", "resources"."setHostHeader", "resources"."enableProxy", "resourcePincode"."pincodeId", "resourcePincode"."resourceId", "resourcePincode"."pincodeHash", "resourcePincode"."digitLength", "resourcePassword"."passwordId", "resourcePassword"."resourceId", "resourcePassword"."passwordHash" from "resources" left join "resourcePincode" on "resourcePincode"."resourceId" = "resources"."resourceId" left join "resourcePassword" on "resourcePassword"."resourceId" = "resources"."resourceId" where "resources"."fullDomain" = $1 
limit $2', params: [Array], [cause]: Error: Connection terminated due to connection timeout at async SE (server/routers/badger/verifySession.ts:141:29) 139 | 140 | if (!resourceData) { > 141 | const [result] = await db | ^ 142 | .select() 143 | .from(resources) 144 | .leftJoin( { [cause]: [Error: Connection terminated unexpectedly] } } 2025-08-25T08:10:09.171Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:05:43.579Z"],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}} 2025-08-25T08:10:09.594Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756109285329 Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756109285329 at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15) at process.processTicksAndRejections (node:internal/process/task_queues:95:5) at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756109285329],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}} 2025-08-25T08:10:10.329Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756108931320 Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756108931320 at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15) at process.processTicksAndRejections (node:internal/process/task_queues:95:5) at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete 
from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756108931320],"cause":{}} 2025-08-25T08:10:10.606Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:00:01.450Z"],"cause":{}}} 2025-08-25T08:14:57.322Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:11:40.132Z"],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}}} 2025-08-25T08:15:00.049Z [error]: Failed to clean up expired security key challenges Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756109634470 Stack: Error: Failed query: delete from "webauthnChallenge" where "webauthnChallenge"."expiresAt" < $1 params: 1756109634470 at NodePgPreparedQuery.queryWithCache (file:///app/node_modules/drizzle-orm/pg-core/session.js:42:15) at process.processTicksAndRejections (node:internal/process/task_queues:95:5) at async Timeout._onTimeout (file:///app/dist/server.mjs:32:19506) {"query":"delete from \"webauthnChallenge\" where \"webauthnChallenge\".\"expiresAt\" < $1","params":[1756109634470],"cause":{"errno":-3001,"code":"EAI_AGAIN","syscall":"getaddrinfo","hostname":"postgres"}} 2025-08-25T08:15:04.088Z [error]: Error in offline checker interval {"error":{"query":"update \"clients\" set \"online\" = $1 where \"clients\".\"lastPing\" < $2","params":[false,"2025-08-25T08:08:10.716Z"],"cause":{}}} 2025-08-25T08:16:17.614Z [error]: Failed to build Traefik config: Error: Connection terminated due to connection timeout ``` Its entirely possible its an issue with the postgres instance I am running. I will add more information if/when I discover them.
Sign in to join this conversation.
1 Participants
Notifications
Due Date
No due date set.
Dependencies

No dependencies set.

Reference: github/pangolin#1024