A place for Apptio product users to learn, connect, share, and grow together
Join/Log In
IBM TechXchange Dev Day: Virtual Agents
Join us 23 January from 11 AM - 6 PM ET as over 30 speakers from IBM and key AI industry leaders discuss the latest AI trends.
Cloudability enables users to view recommendations based on GPU data from AWS EC2 instances. Use the following custom gpumon Python script as directed. Follow the instructions here: https://help.apptio.com/en-us/cloudability/admin/gpu-agent-install.htm
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# or in the "license" file accompanying this file. This file is distributed
# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
# express or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""gpumon: sample per-GPU metrics via NVML, log instance-wide averages to a
rotating file, and push GPU/memory utilization to CloudWatch.

Usage: gpumon.py <region> <log-directory>

NOTE: this is a Python 2 script (it uses ``urllib2`` to query the EC2
instance-metadata service).
"""
import sys  # FIX: was missing, but sys.argv is read below
import threading
import urllib2
import boto3
from pynvml import *
from datetime import datetime
from time import sleep
from functools import reduce
import logging
from logging.handlers import RotatingFileHandler
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import wait
from concurrent.futures import FIRST_COMPLETED

### CHOOSE REGION ####
EC2_REGION = sys.argv[1]

### CHOOSE NAMESPACE PARAMETERS HERE ###
my_NameSpace = 'DeepLearningTrain'

### CHOOSE PUSH INTERVAL (seconds) ####
sleep_interval = 10

### CHOOSE STORAGE RESOLUTION (BETWEEN 1-60) ####
store_reso = 60

# Instance information from the EC2 instance-metadata service (IMDSv1).
BASE_URL = 'http://169.254.169.254/latest/meta-data/'
INSTANCE_ID = urllib2.urlopen(BASE_URL + 'instance-id').read()
IMAGE_ID = urllib2.urlopen(BASE_URL + 'ami-id').read()
INSTANCE_TYPE = urllib2.urlopen(BASE_URL + 'instance-type').read()
INSTANCE_AZ = urllib2.urlopen(BASE_URL + 'placement/availability-zone').read()
# NOTE: this deliberately overrides the region passed on the command line;
# stripping the trailing AZ letter yields the region ('us-east-1a' -> 'us-east-1').
EC2_REGION = INSTANCE_AZ[:-1]

TIMESTAMP = datetime.now().strftime('%Y-%m-%dT%H')

# Needs to be changed according to operating system
TMP_FILE_PATH = sys.argv[2]
TMP_FILE = TMP_FILE_PATH + '/gpumon.log'
TMP_FILE_SAVED = TMP_FILE + TIMESTAMP + '.log'

# Create CloudWatch client
cloudwatch = boto3.client('cloudwatch', region_name=EC2_REGION)

# Flag to push to CloudWatch; cleared for the current cycle when NVML reports
# an error, so bogus readings are not published.
PUSH_TO_CW = True

# Rotating file logger used by logResults().
logger = logging.getLogger('gpu_mon_logger')
handler = RotatingFileHandler(TMP_FILE, maxBytes=10000000, backupCount=1)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)


def handleError(err):
    """Map an NVMLError to a printable string.

    FIX: this helper was called in three places but never defined in the
    pasted script; restored from the upstream AWS/NVIDIA gpumon sample.
    """
    if err.value == NVML_ERROR_NOT_SUPPORTED:
        return "N/A"
    return str(err)


def getPowerDraw(handle):
    """Return the GPU power draw in watts as a '%.2f' string.

    On NVML error, returns the error text and clears PUSH_TO_CW.
    """
    global PUSH_TO_CW  # FIX: assignment below was an accidental local
    try:
        powDraw = nvmlDeviceGetPowerUsage(handle) / 1000.0
        powDrawStr = '%.2f' % powDraw
    except NVMLError as err:
        powDrawStr = handleError(err)
        PUSH_TO_CW = False
    return powDrawStr


def getTemp(handle):
    """Return the GPU core temperature (deg C) as a string.

    On NVML error, returns the error text and clears PUSH_TO_CW.
    """
    global PUSH_TO_CW  # FIX: assignment below was an accidental local
    try:
        temp = str(nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU))
    except NVMLError as err:
        temp = handleError(err)
        PUSH_TO_CW = False
    return temp


def getUtilization(handle):
    """Return (utilization struct or None, gpu%% as str, mem%% as str).

    On NVML error, returns (None, error_text, error_text) and clears
    PUSH_TO_CW.  FIX: the original returned an unbound ``util`` on the
    error path, raising UnboundLocalError and masking the real failure.
    """
    global PUSH_TO_CW  # FIX: assignment below was an accidental local
    util = None
    try:
        util = nvmlDeviceGetUtilizationRates(handle)
        gpu_util = str(util.gpu)
        mem_util = str(util.memory)
    except NVMLError as err:
        error = handleError(err)
        gpu_util = error
        mem_util = error
        PUSH_TO_CW = False
    return util, gpu_util, mem_util


def logResults(i, util, gpu_util, mem_util):
    """Write the averaged utilization to the log file and, when PUSH_TO_CW
    is still set, publish both metrics to CloudWatch."""
    try:
        logger.setLevel(logging.INFO)
        # FIX: typo 'GPU Utilizatio' in the original log message.
        writeString = 'GPU Utilization' + ' : ' + gpu_util + ', GPU Memory Utilization : ' + mem_util + '\n'
        logger.info(writeString)
    except Exception as e:
        logger.setLevel(logging.ERROR)
        # FIX: original passed str(e) as a stray positional argument with no
        # %s placeholder, so the exception text was never logged.
        logger.error("Error writing to file : %s", str(e))
    if PUSH_TO_CW:
        MY_DIMENSIONS = [
            {
                'Name': 'InstanceId',
                'Value': INSTANCE_ID
            }
        ]
        cloudwatch.put_metric_data(
            MetricData=[
                {
                    'MetricName': 'GPU Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.gpu
                },
                {
                    'MetricName': 'Memory Usage',
                    'Dimensions': MY_DIMENSIONS,
                    'Unit': 'Percent',
                    'StorageResolution': store_reso,
                    'Value': util.memory
                },
            ],
            Namespace=my_NameSpace
        )


def average(lst):
    """Arithmetic mean of a non-empty numeric list (integer division for
    int inputs under Python 2, matching the original reduce-based version)."""
    return sum(lst) / len(lst)


def main():
    """Poll NVML every sleep_interval seconds and report instance-wide
    average GPU / memory utilization.  Always shuts NVML down on exit."""
    global PUSH_TO_CW  # FIX: the per-cycle reset below was an accidental local
    try:
        nvmlInit()
        deviceCount = nvmlDeviceGetCount()
        while True:
            PUSH_TO_CW = True
            gpu_util_list = []
            mem_util_list = []
            util = None
            # Find the metrics for each GPU on instance
            for i in range(deviceCount):
                handle = nvmlDeviceGetHandleByIndex(i)
                powDrawStr = getPowerDraw(handle)
                temp = getTemp(handle)
                util, gpu_util, mem_util = getUtilization(handle)
                if util is not None:
                    gpu_util_list.append(util.gpu)
                    mem_util_list.append(util.memory)
            # FIX: guard against an all-error cycle (empty lists would divide
            # by zero; util would be None).
            if util is not None and gpu_util_list:
                gpu_util = average(gpu_util_list)
                mem_util = average(mem_util_list)
                # Reuse the last utilization struct to carry the averages into
                # logResults(), as the original did.
                util.gpu = gpu_util
                util.memory = mem_util
                logResults(i, util, str(gpu_util), str(mem_util))
            sleep(sleep_interval)
    finally:
        nvmlShutdown()


if __name__ == '__main__':
    # Run main() in a single-worker pool and restart it whenever it returns
    # or raises, logging the failure.
    with ThreadPoolExecutor(1) as executor:
        while True:
            try:
                futures = [executor.submit(main) for i in range(1)]
                done, not_done = wait(futures, return_when=FIRST_COMPLETED)
                if len(done) > 0:
                    future = done.pop()
                    result = future.result()
                    print(result)
            except Exception as e:
                # FIX: the original handler referenced 'not_done' (unbound if
                # wait() never ran) and called .result() on a not-done future
                # inside the handler, which can block or re-raise.
                logger.setLevel(logging.ERROR)
                logger.error("Exception in main method : %s", str(e))