I have created a Python script that turns off autoscaling for a GKE cluster and then stops the underlying nodes in each MIG (Managed Instance Group), zone by zone. The problem is that when the script stops the instances, they are recreated: their status changes from UNKNOWN to READY immediately after the stop, even though I turned off autoscaling before stopping the instances. This does not happen when I manually stop the instances from the console in each MIG, per zone. Why is this happening? Can anyone suggest what needs to be checked, or what else I need to add to the code?
I have verified that autoscaling is turned off without any issues. For your information, autoscaling at the MIG level is off, and health checks are also disabled at the MIG level. Autoscaling is enabled only at the node pool level.
import subprocess
import json
import argparse
import time
def get_node_pool_name(cluster_name, project_name, region_name):
    """Return the node pool name for a GKE cluster.

    Builds a ``gcloud container clusters describe`` command that requests
    JSON output for ``cluster_name`` in the given project and region.

    NOTE(review): the statements that run the command and derive
    ``node_pool_name`` are elided (``...``) in this listing; as shown,
    ``node_pool_name`` is never assigned.

    Args:
        cluster_name: Name of the cluster passed to ``describe``.
        project_name: Value passed to ``--project``.
        region_name: Value passed to ``--region``.

    Returns:
        The value bound to ``node_pool_name`` by the elided code.

    Raises:
        Exception: Any exception raised in the body is printed and then
            re-raised unchanged.
    """
    try:
        # Command only; execution and JSON parsing are part of the elided lines.
        cmd = [
            'gcloud', 'container', 'clusters', 'describe', cluster_name,
            '--project', project_name,
            '--region', region_name,
            '--format', 'json'
        ]
        ...
        ...
        return node_pool_name
    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error occurred while getting node pool name: {str(e)}")
        raise
def disable_autoscaler(cluster_name, project_name, region_name, node_pool_name):
    """Turn off autoscaling for one node pool of a GKE cluster.

    Builds a ``gcloud container node-pools update`` command carrying the
    ``--no-enable-autoscaling`` flag.

    NOTE(review): the statements that actually run the command are elided
    (``...``) in this listing.

    Args:
        cluster_name: Value passed to ``--cluster``.
        project_name: Value passed to ``--project``.
        region_name: Value passed to ``--region``.
        node_pool_name: Node pool to update.

    Raises:
        Exception: Any exception raised in the body is printed and then
            re-raised unchanged.
    """
    try:
        cmd = [
            'gcloud', 'container', 'node-pools', 'update', node_pool_name,
            '--cluster', cluster_name,
            '--project', project_name,
            '--region', region_name,
            '--no-enable-autoscaling'
        ]
        ...
        ...
    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error occurred while disabling node pool autoscaler: {str(e)}")
        raise
def get_instance_groups(cluster_name, project_name, region_name):
    """Return the instance groups that back a GKE cluster.

    Builds a ``gcloud container clusters describe`` command that requests
    JSON output for ``cluster_name`` in the given project and region.

    NOTE(review): the statements that run the command and build
    ``instance_groups`` are elided (``...``) in this listing.  The caller in
    this file unpacks each element as an ``(instance_group, zone)`` pair, so
    the elided code presumably yields such pairs -- confirm.

    Args:
        cluster_name: Name of the cluster passed to ``describe``.
        project_name: Value passed to ``--project``.
        region_name: Value passed to ``--region``.

    Returns:
        The value bound to ``instance_groups`` by the elided code.

    Raises:
        Exception: Any exception raised in the body is printed and then
            re-raised unchanged.
    """
    try:
        cmd = [
            'gcloud', 'container', 'clusters', 'describe', cluster_name,
            '--project', project_name,
            '--region', region_name,
            '--format', 'json'
        ]
        ...
        ...
        return instance_groups
    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error occurred while getting instance-groups: {str(e)}")
        raise
def get_instances(instance_group_name, project_name, zone):
    """Return the names of the instances in one instance group.

    Builds a ``gcloud compute instance-groups list-instances`` command that
    requests JSON output for ``instance_group_name`` in the given zone.

    NOTE(review): the statements that run the command and build
    ``instance_names`` are elided (``...``) in this listing.

    Args:
        instance_group_name: Instance group to list.
        project_name: Value passed to ``--project``.
        zone: Value passed to ``--zone``.

    Returns:
        The value bound to ``instance_names`` by the elided code.

    Raises:
        Exception: Any exception raised in the body is printed and then
            re-raised unchanged.
    """
    try:
        cmd = [
            'gcloud', 'compute', 'instance-groups', 'list-instances', instance_group_name,
            '--project', project_name,
            '--zone', zone,
            '--format', 'json'
        ]
        ...
        ...
        return instance_names
    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error occurred while getting instances: {str(e)}")
        raise
def stop_instances(instance_names, project_name, zone):
    """Stop each named instance with ``gcloud compute instances stop``.

    Instances are stopped one at a time, in the order given, and a
    confirmation line is printed after each successful stop.  An empty
    ``instance_names`` is a no-op.

    NOTE(review): the description accompanying this script reports that
    instances stopped this way are recreated almost immediately.  That is
    consistent with the managed instance group restoring VMs that are
    stopped outside of the group's own commands -- presumably the node pool
    (or the MIG) has to be resized to 0 instead.  Confirm against the GKE
    and managed-instance-group documentation before relying on this
    function to keep nodes down.

    Args:
        instance_names: Iterable of instance names to stop.
        project_name: Value passed to ``--project``.
        zone: Value passed to ``--zone``.

    Raises:
        Exception: Whatever was raised while stopping an instance (with
            ``check=True`` a non-zero gcloud exit status raises
            ``subprocess.CalledProcessError``).  The error is printed and
            re-raised unchanged; the remaining instances are not attempted.
    """
    try:
        for instance in instance_names:
            cmd = [
                'gcloud', 'compute', 'instances', 'stop', instance,
                '--project', project_name,
                '--zone', zone
            ]
            # check=True turns a non-zero gcloud exit status into an exception,
            # so the success message below is only printed for real successes.
            subprocess.run(cmd, check=True)
            print(f"Instance stopped: {instance} in zone {zone}")
    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Error occurred while stopping instances: {str(e)}")
        raise
def main(cluster_name, project_name, region_name,
         autoscaler_settle_seconds=60, group_pause_seconds=30):
    """Disable node-pool autoscaling, then stop the cluster's instances.

    Steps, in order:

    1. Look up the node pool name and disable its autoscaler.
    2. Sleep for ``autoscaler_settle_seconds``.
    3. For every ``(instance_group, zone)`` pair of the cluster, list the
       group's instances, stop them, and sleep for ``group_pause_seconds``.

    Args:
        cluster_name: Name of the GKE cluster.
        project_name: Google Cloud project name.
        region_name: Region name of the GKE cluster.
        autoscaler_settle_seconds: Pause after disabling the autoscaler and
            before any instance is touched.  Defaults to the previously
            hard-coded 60 seconds.
        group_pause_seconds: Pause after each instance group has been
            processed.  Defaults to the previously hard-coded 30 seconds.

    Raises:
        Exception: Propagates whatever the helper functions raise; nothing is
            caught here, so the first failure aborts the run.
    """
    node_pool_name = get_node_pool_name(cluster_name, project_name, region_name)
    disable_autoscaler(cluster_name, project_name, region_name, node_pool_name)
    # Give the autoscaler change time to take effect before stopping nodes.
    time.sleep(autoscaler_settle_seconds)

    instance_groups = get_instance_groups(cluster_name, project_name, region_name)
    for instance_group, zone in instance_groups:
        instance_names = get_instances(instance_group, project_name, zone)
        stop_instances(instance_names, project_name, zone)
        # NOTE(review): the original paste lost its indentation; this pause is
        # assumed to belong inside the loop (one pause per instance group).
        time.sleep(group_pause_seconds)
if __name__ == "__main__":
    # Command-line entry point: three required positional arguments, all
    # strings, forwarded to main() in the order they are declared.
    cli = argparse.ArgumentParser(description="Disable GKE autoscaler and stop instances.")
    positional_arguments = (
        ('cluster_name', 'Name of the GKE cluster'),
        ('project_name', 'Google Cloud project name'),
        ('region_name', 'Region name of the GKE cluster'),
    )
    for argument_name, argument_help in positional_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)
    parsed = cli.parse_args()
    main(parsed.cluster_name, parsed.project_name, parsed.region_name)