How can I pause my GKE Autopilot cluster nightly in GCP? This cluster supports the Cloud Composer service in our Data stage area.
We know that it’s not possible to do this natively because the service does not support this option, but it is possible to reduce the number of nodes to zero. However, Composer creates an Autopilot cluster in GKE. When we reduce the number of nodes, the service automatically enables new nodes.
We scheduled a job in Cloud Scheduler, created a topic in Pub/Sub, and triggered a Cloud Function in Python to handle the process of pausing and resuming the cluster.
import os
from google.cloud import container_v1
from google.auth import default
from kubernetes import client, config
from kubernetes.client.rest import ApiException
def main(event, context):
cluster_name = "xxxxxxxxxxxxxxxxx
location = "xxxxxxxx"
project_id = "xxxxxxxxxxx"
print("Autenticando no GKE e obtendo credenciais do cluster...")
# Authenticate to GKE and get cluster credentials
credentials, project = default()
client = container_v1.ClusterManagerClient(credentials=credentials)
cluster_info = client.get_cluster(name=f"projects/{project_id}/locations/{location}/clusters/{cluster_name}")
print(f"Conectado ao cluster {cluster_name}.")
# Configure Kubernetes client
print("Configurando cliente Kubernetes...")
config.load_kube_config_from_dict({
"apiVersion": "v1",
"clusters": [{
"cluster": {
"server": "https://" + cluster_info.endpoint,
"certificate-authority-data": cluster_info.master_auth.cluster_ca_certificate,
},
"name": cluster_info.name,
}],
"contexts": [{
"context": {
"cluster": cluster_info.name,
"user": "default",
},
"name": "default",
}],
"current-context": "default",
"kind": "Config",
"preferences": {},
"users": [{
"name": "default",
"user": {
"auth-provider": {
"config": {
"access-token": credentials.token,
"cmd-args": "config config-helper --format=json",
"cmd-path": "gcloud",
"expiry-key": "{.credential.token_expiry}",
"token-key": "{.credential.access_token}",
},
"name": "gcp",
},
},
}],
})
# Scale up all deployments in the default namespace to the original number of replicas
v1 = client.AppsV1Api()
deployments = v1.list_namespaced_deployment(namespace='default')
print("Escalonando todos os deployments para três réplicas...")
for deployment in deployments.items:
# You should store the original number of replicas somewhere; here we assume 3 for all
deployment.spec.replicas = 3
try:
v1.patch_namespaced_deployment(
name=deployment.metadata.name,
namespace='default',
body=deployment
)
print(f"Deployment {deployment.metadata.name} escalonado para três réplicas")
except ApiException as e:
print(f"Exceção ao escalar deployment {deployment.metadata.name}: {e}")
print("Retomar cluster concluído.")
But it’s not working.
Please, help us!
user336879 is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.