Custom gpumon Python script to collect GPU data from AWS EC2 instances

 View Only

Custom gpumon Python script to collect GPU data from AWS EC2 instances 

Mon April 25, 2022 01:43 PM

Cloudability enables users to view recommendations based on GPU data from AWS EC2 instances.

Use the following custom gpumon Python script as directed in the GPU agent installation instructions:

https://help.apptio.com/en-us/cloudability/admin/gpu-agent-install.htm

# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#  
#  or in the "license" file accompanying this file. This file is distributed 
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
#  express or implied. See the License for the specific language governing 
#  permissions and limitations under the License.

import sys  # sys.argv is read below; do not rely on `from pynvml import *` to leak it
import threading
import urllib2
import boto3
from pynvml import *
from datetime import datetime
from time import sleep
from functools import reduce
import logging
from logging.handlers import RotatingFileHandler
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from concurrent.futures import FIRST_COMPLETED

### CHOOSE REGION ####
# Region taken from the first command-line argument.
# NOTE(review): this value is overwritten below by the region derived from the
# instance's availability zone, so the argument is effectively unused (it must
# still be supplied, because sys.argv[1] is read here) - confirm the intent.
EC2_REGION = sys.argv[1]

### CHOOSE NAMESPACE PARAMETERS HERE ###
# Passed as `Namespace` to cloudwatch.put_metric_data in logResults().
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
# Passed to time.sleep() between collection cycles in main(), i.e. seconds.
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
# Sent as 'StorageResolution' with every metric in logResults().
# NOTE(review): the CloudWatch PutMetricData documentation lists only 1 and 60
# as valid values - confirm before choosing anything in between.
store_reso = 60

# Instance information, fetched once at import time over HTTP from BASE_URL.
# NOTE(review): these calls have no timeout and use unauthenticated GETs;
# presumably the script is only ever run on an EC2 instance - confirm.
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
# Drops the last character of the availability-zone string and replaces the
# command-line value assigned above (presumably 'us-east-1a' -> 'us-east-1';
# verify for zone names that do not end in a single letter).
EC2_REGION = INSTANCE_AZ[:-1]

# Local time at import, truncated to the hour; only used for TMP_FILE_SAVED.
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
# Needs to be changed according to operating system #
# Directory for the log file, taken from the second command-line argument.
TMP_FILE_PATH = sys.argv[2]
# Path handed to the RotatingFileHandler further down.
TMP_FILE = TMP_FILE_PATH+'/gpumon.log'
# NOTE(review): not referenced anywhere else in the visible file.
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP+ '.log'

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)


# Flag to push to CloudWatch
# logResults() reads this module-level value. The `PUSH_TO_CW = ...`
# assignments inside the functions below are not preceded by a `global`
# statement, so they bind function-local names and never change this flag.
PUSH_TO_CW = True

def getPowerDraw(handle):
    """Return the GPU's current power draw as a display string.

    Args:
        handle: NVML device handle (main() obtains it from
            nvmlDeviceGetHandleByIndex).

    Returns:
        str: nvmlDeviceGetPowerUsage(handle) divided by 1000 and formatted
        with two decimals; 'N/A' when NVML reports the query as not
        supported; otherwise the NVML error text.
    """
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        # The original called handleError(), which is not defined in this
        # file, so every NVML failure surfaced as a NameError instead of a
        # readable value. Describe the error inline.
        if err.value == NVML_ERROR_NOT_SUPPORTED:
            powDrawStr = 'N/A'
        else:
            powDrawStr = str(err)
        # The former `PUSH_TO_CW = False` here only bound a function-local
        # name (no `global`), so it never had any effect and was dropped.
        # Power draw is not published, so a failed read should not gate
        # the CloudWatch push.
    return powDrawStr

def getTemp(handle):
    """Return the GPU temperature as a display string.

    Args:
        handle: NVML device handle (main() obtains it from
            nvmlDeviceGetHandleByIndex).

    Returns:
        str: the NVML_TEMPERATURE_GPU sensor reading converted with str();
        'N/A' when NVML reports the query as not supported; otherwise the
        NVML error text.
    """
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        # The original called handleError(), which is not defined in this
        # file, so every NVML failure surfaced as a NameError. Describe the
        # error inline.
        if err.value == NVML_ERROR_NOT_SUPPORTED:
            temp = 'N/A'
        else:
            temp = str(err)
        # The former `PUSH_TO_CW = False` here only bound a function-local
        # name (no `global`), so it never had any effect and was dropped.
        # Temperature is not published, so a failed read should not gate
        # the CloudWatch push.
    return temp

def getUtilization(handle):
    """Read GPU and memory utilization for one device.

    Args:
        handle: NVML device handle (main() obtains it from
            nvmlDeviceGetHandleByIndex).

    Returns:
        tuple: (util, gpu_util, mem_util).
        On success, util is the object returned by
        nvmlDeviceGetUtilizationRates and the two strings are str(util.gpu)
        and str(util.memory).
        On NVMLError, util is None and both strings carry the same error
        description ('N/A' when the query is not supported, otherwise the
        NVML error text). Callers must check util for None before reading
        its attributes.
    """
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        # The original called handleError(), which is not defined in this
        # file (NameError), and left `util` unbound, so the return statement
        # could never succeed on this path.
        if err.value == NVML_ERROR_NOT_SUPPORTED:
            error = 'N/A'
        else:
            error = str(err)
        util = None
        gpu_util = error
        mem_util = error
        # The former `PUSH_TO_CW = False` only bound a function-local name
        # (no `global`) and was dropped; the None in the first slot is the
        # failure signal instead.
    return util, gpu_util, mem_util

def logResults(i, util, gpu_util, mem_util):
    """Write one utilization sample to the log file and, if enabled, CloudWatch.

    Args:
        i: Device index supplied by the caller; not used by this function.
        util: Object whose `gpu` and `memory` attributes are sent as the
            metric values.
        gpu_util: GPU utilization as a string, for the log line.
        mem_util: GPU memory utilization as a string, for the log line.

    Raises:
        Anything cloudwatch.put_metric_data raises; only the log write is
        guarded by the try/except below.
    """
    try:
        logger.setLevel(logging.INFO)
        # NOTE(review): 'GPU Utilizatio' looks like a typo, but the literal is
        # kept byte-for-byte because whatever consumes gpumon.log may match
        # on it - confirm before correcting.
        writeString = 'GPU Utilizatio' + ' : ' + gpu_util + ', GPU Memory Utilization : ' + mem_util +  '\n'
        logger.info(writeString)
    except Exception as e:
        logger.setLevel(logging.ERROR)
        # The original passed str(e) as a format argument without a
        # placeholder, which makes logging fail to format the record.
        logger.error("Error writing to file : %s", str(e))
    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
            ],
            Namespace=my_NameSpace
        )



# File logger for the monitor: records go to TMP_FILE, which rotates once it
# reaches maxBytes and keeps a single backup. No level is set here;
# logResults() and the __main__ loop call setLevel() before they log.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = RotatingFileHandler(TMP_FILE, maxBytes=10000000, backupCount=1)
handler.setFormatter(formatter)
logger = logging.getLogger('gpu_mon_logger')
logger.addHandler(handler)

def average(lst):
    """Return the sum of the items in `lst` divided by its length.

    The items are folded left-to-right with `+`, then divided with `/`, so
    the result type follows the interpreter's rules for that operator.
    An empty `lst` makes reduce() raise TypeError.
    """
    def _add(running, item):
        return running + item

    total = reduce(_add, lst)
    return total / len(lst)


def main():
    """Sample every GPU in a loop and report the per-instance average.

    Each cycle reads utilization for all devices, averages the GPU and memory
    figures across them, hands the result to logResults(), and then sleeps
    for `sleep_interval`. The loop never returns normally; it ends only when
    an exception propagates, after NVML has been shut down.

    A cycle is skipped (nothing logged as a sample, nothing pushed) when no
    device is present or when any device's utilization could not be read, so
    a partial average is never reported.
    """
    # Initialise outside the try block: if nvmlInit() itself fails there is
    # nothing to shut down, and calling nvmlShutdown() from `finally` would
    # raise a second NVML error that hides the real cause.
    nvmlInit()
    try:
        deviceCount = nvmlDeviceGetCount()
        while True:
            # The original assigned `PUSH_TO_CW = True` here; without a
            # `global` statement that only bound an unused local, so it was
            # removed.
            gpu_util_list = []
            mem_util_list = []
            failed_reads = 0
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                # Power draw and temperature are still queried as before,
                # but they are not currently logged or published.
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                if util is None:
                    # A None util marks a failed read; there is nothing
                    # numeric to average for this device.
                    failed_reads += 1
                    continue
                gpu_util_list.append(util.gpu)
                mem_util_list.append(util.memory)

            if failed_reads or not gpu_util_list:
                # average() cannot handle an empty list, and a partial
                # average would misreport the instance, so skip the cycle.
                logger.error(
                    "Skipping cycle: GPU utilization unavailable for "
                    "%d of %d device(s)", failed_reads, deviceCount)
            else:
                gpu_util = average(gpu_util_list)
                mem_util = average(mem_util_list)
                # As in the original, the last device's utilization object is
                # reused to carry the averages into logResults().
                util.gpu = gpu_util
                util.memory = mem_util
                logResults(i, util, str(gpu_util), str(mem_util))

            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    # Run main() on a single worker thread and start it again whenever it
    # raises, so one failed cycle does not end monitoring.
    #
    # Changes from the original:
    #   * a stray zero-width space (U+200B) on the last line of this block
    #     stopped the file from parsing; it is removed.
    #   * the except branch read `not_done`, which is unbound if the failure
    #     happens before wait() returns, and called future.result() on it,
    #     which could block or raise inside the handler; with one future
    #     that set is always empty, so the branch is dropped.
    #   * a failing main() was resubmitted immediately, spinning the CPU and
    #     flooding the log; the loop now waits `sleep_interval` first.
    with ThreadPoolExecutor(1) as executor:
        while True:
            try:
                future = executor.submit(main)
                wait([future], return_when=FIRST_COMPLETED)
                # result() re-raises whatever main() raised on the worker.
                result = future.result()
                print(result)
            except Exception as e:
                logger.setLevel(logging.ERROR)
                logger.error("Exception in main method :" + str(e))
                sleep(sleep_interval)




#Cloudability
#configuration
#AWS

Statistics
0 Favorited
0 Views
0 Files
0 Shares
0 Downloads