# Cloudability enables users to view recommendations based on GPU data from AWS EC2 instances.
# Use the following custom gpumon Python script as directed.
# Follow the instructions here: https://help.apptio.com/en-us/cloudability/admin/gpu-agent-install.htm
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.

"""Publish average GPU and GPU-memory utilization of this EC2 instance to CloudWatch.

Usage: <script> <region> <log-directory>

The script samples every GPU through NVML once per ``sleep_interval`` seconds,
writes the averaged utilization to a rotating log file and, unless a reading
failed during that cycle, pushes the averages as CloudWatch metrics.
"""

# NOTE: the original import order is kept on purpose. ``from pynvml import *``
# comes before the explicit imports so that names such as ``sleep`` or ``wait``
# always refer to the explicitly imported objects, whatever pynvml exports.
import threading
try:
    import urllib2
except ImportError:  # Python 3: urllib2 was merged into urllib.request
    import urllib.request as urllib2
import boto3
from pynvml import *
import sys
from collections import namedtuple
from datetime import datetime
from time import sleep
from functools import reduce
import logging
from logging.handlers import RotatingFileHandler
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from concurrent.futures import FIRST_COMPLETED

### CHOOSE REGION ####
# The first command-line argument is required, but its value is replaced
# further down by the region derived from the instance's availability zone.
EC2_REGION = sys.argv[1]

### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL ####
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60

# Instance information
BASE_URL = 'http://169.254.169.254/latest/meta-data/'


def _read_metadata(path):
    """Return the instance-metadata value stored under ``path`` as text.

    The response body is decoded so that the value is a text string on both
    Python 2 and Python 3 (``read()`` returns bytes on Python 3).
    """
    response = urllib2.urlopen(BASE_URL + path)
    try:
        return response.read().decode('utf-8')
    finally:
        response.close()


INSTANCE_ID = _read_metadata('instance-id')
IMAGE_ID = _read_metadata('ami-id')
INSTANCE_TYPE = _read_metadata('instance-type')
INSTANCE_AZ = _read_metadata('placement/availability-zone')
# The region name is the availability zone without its trailing letter.
EC2_REGION = INSTANCE_AZ[:-1]

TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')

# Needs to be changed according to operating system
TMP_FILE_PATH = sys.argv[2]
TMP_FILE = TMP_FILE_PATH + '/gpumon.log'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP + '.log'

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch. Reset to True at the start of every sampling
# cycle and cleared by any reader function whose NVML call fails.
PUSH_TO_CW = True

# Plain value holder with the same ``gpu`` / ``memory`` attributes as the NVML
# utilization struct; used for the averaged values so the ctypes struct (whose
# integer fields reject floats) is never mutated.
_Utilization = namedtuple('_Utilization', ['gpu', 'memory'])


def handleError(err):
    """Return a printable placeholder for a failed NVML reading.

    Returns ``"N/A"`` when NVML reports the query as unsupported, otherwise
    the error's string form.
    """
    if getattr(err, 'value', None) == NVML_ERROR_NOT_SUPPORTED:
        return "N/A"
    return str(err)


def getPowerDraw(handle):
    """Return the power usage of the device as a string with two decimals.

    The raw NVML value is divided by 1000 (NVML documents the raw unit as
    milliwatts). On an NVML error the error placeholder is returned and
    pushing to CloudWatch is disabled for the current cycle.
    """
    global PUSH_TO_CW
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr


def getTemp(handle):
    """Return the GPU temperature reading as a string.

    On an NVML error the error placeholder is returned and pushing to
    CloudWatch is disabled for the current cycle.
    """
    global PUSH_TO_CW
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp


def getUtilization(handle):
    """Return ``(util, gpu_util, mem_util)`` for the device.

    ``util`` is the object returned by ``nvmlDeviceGetUtilizationRates``;
    ``gpu_util`` and ``mem_util`` are its ``gpu`` and ``memory`` fields as
    strings. On an NVML error ``util`` is ``None``, both strings hold the
    error placeholder, and pushing to CloudWatch is disabled for the cycle.
    """
    global PUSH_TO_CW
    util = None
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util


def logResults(i, util, gpu_util, mem_util):
    """Log the utilization strings and, if enabled, push them to CloudWatch.

    Args:
        i: Device index; accepted for compatibility and not used.
        util: Object exposing numeric ``gpu`` and ``memory`` attributes, or
            ``None`` when no reading is available (nothing is pushed then).
        gpu_util: GPU utilization as a string, written to the log.
        mem_util: GPU memory utilization as a string, written to the log.
    """
    try:
        # The message text (including its spelling) is kept as-is in case
        # anything downstream matches on it.
        writeString = 'GPU Utilizatio' + ' : ' + gpu_util + ', GPU Memory Utilization : ' + mem_util + '\n'
        logger.info(writeString)
    except Exception as e:
        logger.error("Error writing to file : %s", e)

    if PUSH_TO_CW and util is not None:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
            ],
            Namespace=my_NameSpace
        )


logger = logging.getLogger('gpu_mon_logger')
# INFO is set once here instead of being toggled before every log call.
logger.setLevel(logging.INFO)
handler = RotatingFileHandler(TMP_FILE, maxBytes=10000000, backupCount=1)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)


def average(lst):
    """Return the arithmetic mean of ``lst`` as a float.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    if not lst:
        raise ValueError("average() requires at least one value")
    # float() forces true division on Python 2 as well.
    return sum(lst) / float(len(lst))


def main():
    """Sample all GPUs forever, logging and publishing their average utilization.

    Each cycle resets the push flag, reads every device, averages the
    successful utilization readings, hands them to ``logResults`` and then
    sleeps for ``sleep_interval`` seconds. NVML is shut down when the loop
    is left through an exception.
    """
    global PUSH_TO_CW
    # Initialise outside the try block so that a failed init is reported
    # as-is instead of being masked by an error from nvmlShutdown().
    nvmlInit()
    try:
        deviceCount = nvmlDeviceGetCount()

        while True:
            PUSH_TO_CW = True
            gpu_util_list = []
            mem_util_list = []
            last_index = -1

            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                last_index = i
                handle = nvmlDeviceGetHandleByIndex(i)

                # Return values are unused here; the calls are kept because a
                # failed reading disables the CloudWatch push for this cycle.
                getPowerDraw(handle)
                getTemp(handle)

                util, gpu_util, mem_util = getUtilization(handle)
                if util is None:
                    # Reading failed; the push flag is already cleared.
                    continue
                gpu_util_list.append(util.gpu)
                mem_util_list.append(util.memory)

            if gpu_util_list:
                averaged = _Utilization(
                    gpu=average(gpu_util_list),
                    memory=average(mem_util_list),
                )
                logResults(last_index, averaged,
                           str(averaged.gpu), str(averaged.memory))
            else:
                logger.error("No GPU utilization readings available in this cycle")

            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    # Run main() on a single worker thread and restart it whenever it fails.
    with ThreadPoolExecutor(1) as executor:
        while True:
            try:
                future = executor.submit(main)
                wait([future], return_when=FIRST_COMPLETED)
                # result() re-raises whatever exception ended main().
                print(future.result())
            except Exception as e:
                logger.error("Exception in main method :%s", e)
                # Pause before restarting so a persistent failure does not
                # turn into a busy loop that floods the log.
                sleep(sleep_interval)