IBM Cloudability

IBM Cloudability

 View Only

Custom gpumon Python script to collect GPU data from AWS EC2 instances 

Mon April 25, 2022 01:43 PM

Cloudability enables users to view recommendations based on GPU data from AWS EC2 instances.

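Use the following custom gpumon Python script as directed. The script reads two command-line arguments: the AWS region, followed by the directory in which it should write its gpumon.log file.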

Follow the instructions here - https://help.apptio.com/en-us/cloudability/admin/gpu-agent-install.htm 

# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#  
#  or in the "license" file accompanying this file. This file is distributed 
#  on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
#  express or implied. See the License for the specific language governing 
#  permissions and limitations under the License.

import logging
import sys
import threading
import urllib2
from concurrent.futures import FIRST_COMPLETED
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from datetime import datetime
from functools import reduce
from logging.handlers import RotatingFileHandler
from time import sleep

import boto3
from pynvml import *

### CHOOSE REGION ####
# First command-line argument: the AWS region.
# NOTE(review): this value is overwritten further down by the region derived
# from the instance's availability zone, so the argument must be supplied but
# has no lasting effect -- confirm which source is intended.
EC2_REGION = sys.argv[1]

### CHOOSE NAMESPACE PARAMETERS HERE ###
# CloudWatch namespace under which the metrics are published.
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
# Pause, in seconds, between two sampling rounds of the main loop.
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
# Passed as 'StorageResolution' with every metric.
# NOTE(review): the CloudWatch PutMetricData reference lists only 1 and 60 as
# valid values -- verify before choosing anything in between.
store_reso = 60

# Instance information, read from the EC2 instance metadata endpoint at
# import time. These are plain requests that send no session token.
# NOTE(review): presumably this fails on instances that enforce IMDSv2 --
# confirm against the target instances.
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
# Used as the 'InstanceId' dimension of every metric.
INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
# IMAGE_ID and INSTANCE_TYPE are not referenced elsewhere in the visible file.
IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
# Drop the last character of the availability zone to obtain the region
# (e.g. 'us-east-1a' -> 'us-east-1'). This replaces the value taken from
# sys.argv[1] above.
EC2_REGION = INSTANCE_AZ[:-1]

# Hour-granular timestamp taken once at start-up; only used to build
# TMP_FILE_SAVED below.
TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')
# Second command-line argument: directory that receives the log file.
# Needs to be changed according to operating system #
TMP_FILE_PATH = sys.argv[2]
# File the rotating log handler writes to.
TMP_FILE = TMP_FILE_PATH+'/gpumon.log'
# NOTE(review): TMP_FILE_SAVED is not referenced elsewhere in the visible file.
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP+ '.log'

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)


# Flag to push to CloudWatch; logResults only publishes metrics while this
# module-level value is true.
PUSH_TO_CW = True

def getPowerDraw(handle):
    """Return the GPU's current power draw, in watts, as a string.

    handle -- NVML device handle (as returned by nvmlDeviceGetHandleByIndex).

    On success the NVML reading is divided by 1000.0 (NVML reports
    milliwatts) and formatted with two decimals, e.g. '41.25'.
    On NVMLError the value returned by handleError(err) is returned instead.
    """
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        # NOTE(review): handleError is not defined in the visible part of
        # this file -- confirm it is available at runtime.
        powDrawStr = handleError(err)
        # The original also assigned `PUSH_TO_CW = False` here. Without a
        # `global` declaration that only bound an unused local and never
        # touched the module-level flag, so the dead assignment is dropped;
        # a failed power reading does not affect the CloudWatch push.
    return powDrawStr

def getTemp(handle):
    """Return the GPU temperature reading as a string.

    handle -- NVML device handle (as returned by nvmlDeviceGetHandleByIndex).

    The value is whatever nvmlDeviceGetTemperature reports for the
    NVML_TEMPERATURE_GPU sensor, converted with str().
    On NVMLError the value returned by handleError(err) is returned instead.
    """
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        # NOTE(review): handleError is not defined in the visible part of
        # this file -- confirm it is available at runtime.
        temp = handleError(err)
        # The original also assigned `PUSH_TO_CW = False` here. Without a
        # `global` declaration that only bound an unused local and never
        # touched the module-level flag, so the dead assignment is dropped;
        # a failed temperature reading does not affect the CloudWatch push.
    return temp

def getUtilization(handle):
    """Read GPU and memory utilization for one device.

    handle -- NVML device handle (as returned by nvmlDeviceGetHandleByIndex).

    Returns a 3-tuple (util, gpu_util, mem_util):
      util     -- the object returned by nvmlDeviceGetUtilizationRates, or
                  None when the query raised NVMLError.
      gpu_util -- str(util.gpu), or the handleError(err) value on failure.
      mem_util -- str(util.memory), or the handleError(err) value on failure.
    """
    # Bind util up front: previously the NVMLError path reached the return
    # statement with util unassigned and raised UnboundLocalError.
    util = None
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        # NOTE(review): handleError is not defined in the visible part of
        # this file -- confirm it is available at runtime.
        error = handleError(err)
        gpu_util = error
        mem_util = error
        # The original also assigned `PUSH_TO_CW = False` here; without a
        # `global` declaration it only bound an unused local, so it is
        # dropped. Callers can detect the failure through util being None.
    return util, gpu_util, mem_util

def logResults(i, util, gpu_util, mem_util):
    """Log the utilization figures and, when enabled, push them to CloudWatch.

    i        -- device index supplied by the caller; not used here.
    util     -- object exposing numeric .gpu and .memory attributes, which
                become the CloudWatch metric values. When it is None the
                figures are logged but nothing is pushed.
    gpu_util -- GPU utilization as a string, for the log line.
    mem_util -- memory utilization as a string, for the log line.

    Metrics are pushed only while the module-level PUSH_TO_CW flag is true.
    Exceptions raised by the CloudWatch call propagate to the caller.
    """
    try:
        logger.setLevel(logging.INFO)
        writeString = 'GPU Utilizatio' + ' : ' + gpu_util + ', GPU Memory Utilization : ' + mem_util +  '\n'
        logger.info(writeString)
    except Exception as e:
        logger.setLevel(logging.ERROR)
        # The exception must go through a %s placeholder: passing it as a
        # bare extra argument makes the logging module fail while formatting
        # the record, so the message never reached the log file.
        logger.error("Error writing to file : %s", e)

    # Nothing to publish when pushing is disabled or no reading is available.
    if not PUSH_TO_CW or util is None:
        return

    MY_DIMENSIONS = [
        {
            'Name': 'InstanceId',
            'Value': INSTANCE_ID
        }
    ]
    cloudwatch.put_metric_data(
        MetricData=[
            {
                'MetricName': 'GPU Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.gpu
            },
            {
                'MetricName': 'Memory Usage',
                'Dimensions': MY_DIMENSIONS,
                'Unit': 'Percent',
                'StorageResolution': store_reso,
                'Value': util.memory
            },
        ],
        Namespace=my_NameSpace
    )



# Module-wide logger. Its level is not set here; the functions that log
# switch it between INFO and ERROR right before emitting a record.
logger = logging.getLogger('gpu_mon_logger')
# Write to TMP_FILE and roll over at roughly 10 MB (maxBytes=10000000),
# keeping a single backup file.
handler = RotatingFileHandler(TMP_FILE, maxBytes=10000000, backupCount=1)
# Record layout: timestamp - logger name - level - message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

def average(lst):
    """Return the arithmetic mean of the numbers in lst.

    lst -- non-empty sized collection of numbers.

    The division uses the interpreter's `/` operator, so under Python 2 a
    list of ints yields a floored int.

    Raises ValueError when lst is empty. The previous reduce-based version
    failed in that case with an opaque "reduce() of empty sequence"
    TypeError.
    """
    if not lst:
        raise ValueError("average() requires at least one value")
    return sum(lst) / len(lst)


class _AveragedUtilization(object):
    """Plain holder exposing the .gpu and .memory attributes logResults reads."""

    def __init__(self, gpu, memory):
        self.gpu = gpu
        self.memory = memory


def main():
    """Sample every GPU in an endless loop and report the averaged utilization.

    Each round queries all devices, averages GPU and memory utilization
    across them, hands the result to logResults, and then sleeps for
    sleep_interval seconds. The function never returns normally; it ends
    only when an exception escapes, in which case NVML is shut down first.
    """
    # Initialise before entering the try block: if nvmlInit() itself fails
    # there is nothing to shut down, and calling nvmlShutdown() from the
    # finally clause would raise a second error on top of the real one.
    nvmlInit()
    try:
        deviceCount = nvmlDeviceGetCount()
        while True:
            # The original set a local `PUSH_TO_CW = True` here; it never
            # reached the module-level flag read by logResults, so it is
            # omitted.
            gpu_util_list = []
            mem_util_list = []
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                # NOTE(review): power draw and temperature are queried as in
                # the original, but their values are not used afterwards --
                # confirm whether these calls are still wanted.
                getPowerDraw(handle)
                getTemp(handle)
                util, _, _ = getUtilization(handle)
                if util is None:
                    # No reading for this device; leave it out of the average.
                    continue
                gpu_util_list.append(util.gpu)
                mem_util_list.append(util.memory)

            if gpu_util_list:
                gpu_util = average(gpu_util_list)
                mem_util = average(mem_util_list)
                # Carry the averages in a plain object rather than writing
                # them back into the last device's NVML result, whose fields
                # may not accept a non-integer average.
                averaged = _AveragedUtilization(gpu_util, mem_util)
                # deviceCount - 1 is the last loop index, which is what the
                # original passed as the first argument.
                logResults(deviceCount - 1, averaged, str(gpu_util), str(mem_util))
            else:
                # Zero devices, or no device produced a reading this round.
                logger.setLevel(logging.ERROR)
                logger.error("No GPU utilization readings available")

            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__=='__main__':
    # Run main() on a single worker thread and submit it again whenever it
    # stops, so that one failure does not end the monitoring.
    with ThreadPoolExecutor(1) as executor:
        while True:
            try:
                futures = [executor.submit(main)]
                # With a single future, wait() returns only once that future
                # has finished, so the "not done" set is always empty. The
                # original handler branch that inspected it could never run
                # and has been removed.
                done, _ = wait(futures, return_when=FIRST_COMPLETED)
                if done:
                    # result() re-raises whatever exception main() ended
                    # with, which hands control to the handler below.
                    print(done.pop().result())
            except Exception as e:
                logger.setLevel(logging.ERROR)
                logger.error("Exception in main method :"+ str(e))
                # Pause before restarting so that a persistent failure does
                # not turn into a tight loop that floods the log file.
                sleep(sleep_interval)




#Cloudability
#configuration
#AWS

Statistics
0 Favorited
0 Views
0 Files
0 Shares
0 Downloads