Supplement
A ROP (retinopathy of prematurity) Screening Classifier for Retinal Fundus Photograph based on Transfer Learning
DNN Model Test Results
- AlexNet_from_scratch.txt . . . . . . 612 B . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_from_scratch_pr_curve.png . . . . . . 31 KB . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_from_scratch_roc_curve.png . . . . . . 40 KB . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_from_scratch_softmax_probs.txt . . . . . . 48 KB . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_transfer_learning.txt . . . . . . 635 B . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_transfer_learning_pr_curve.png . . . . . . 32 KB . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_transfer_learning_roc_curve.png . . . . . . 41 KB . . . . . . 5/5/2022 12:04:28 PM
- AlexNet_transfer_learning_softmax_probs.txt . . . . . . 49 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_from_scratch.txt . . . . . . 587 B . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_from_scratch_pr_curve.png . . . . . . 33 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_from_scratch_roc_curve.png . . . . . . 40 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_from_scratch_softmax_probs.txt . . . . . . 49 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_transfer_learning.txt . . . . . . 628 B . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_transfer_learning_pr_curve.png . . . . . . 25 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_transfer_learning_roc_curve.png . . . . . . 39 KB . . . . . . 5/5/2022 12:04:28 PM
- VGG-16_transfer_learning_softmax_probs.txt . . . . . . 53 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_cratch.txt . . . . . . 601 B . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_cratch_softmax_probs.txt . . . . . . 49 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_scratch.txt . . . . . . 603 B . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_scratch_pr_curve.png . . . . . . 33 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_scratch_roc_curve.png . . . . . . 41 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_from_scratch_softmax_probs.txt . . . . . . 49 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_Transfer_Learning.txt . . . . . . 610 B . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_Transfer_Learning_pr_curve.png . . . . . . 26 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_Transfer_Learning_roc_curve.png . . . . . . 39 KB . . . . . . 5/5/2022 12:04:28 PM
- GoogLeNet_transfer_learning_softmax_probs.txt . . . . . . 233 KB . . . . . . 5/5/2022 12:04:28 PM
Human Experts Performance Data
- CHEN_DESKTOP-C7O8G5I20170802183814.txt . . . . . . 68 KB . . . . . . 5/5/2022 12:04:28 PM
- TIAN_8L-HZB-220170803195744.txt . . . . . . 67 KB . . . . . . 5/5/2022 12:04:28 PM
- WU_8L-HZB-220170802173250.txt . . . . . . 67 KB . . . . . . 5/5/2022 12:04:28 PM
- ZENG_8L-HZB-220170803141821.txt . . . . . . 67 KB . . . . . . 5/5/2022 12:04:28 PM
- ZHANG_RETCAM320170803174040.txt . . . . . . 67 KB . . . . . . 5/5/2022 12:04:28 PM
- DNN_measures.txt . . . . . . 3 KB . . . . . . 5/5/2022 12:04:28 PM
- DNN_pr_curve.txt . . . . . . 8 KB . . . . . . 5/5/2022 12:04:28 PM
- DNN_softmax_probs.txt . . . . . . 53 KB . . . . . . 5/5/2022 12:04:28 PM
- DNN_vs_Human_pr_curve.png . . . . . . 26 KB . . . . . . 5/5/2022 12:04:28 PM
- DNN_vs_Human_roc_curve.png . . . . . . 37 KB . . . . . . 5/5/2022 12:04:28 PM
- Human_Performance_Summary.txt . . . . . . 7 KB . . . . . . 5/5/2022 12:04:28 PM
Source Code and Scripts for DNN Batch Inference
binaryproto2npy.py . . . . . . 395 B . . . . . . 5/5/2022 12:04:28 PM
# Convert a Caffe mean file from binaryproto format to a NumPy .npy file.
#
# Usage: python binaryproto2npy.py proto.mean out.npy
#   proto.mean -- input mean image in Caffe BlobProto (binaryproto) format
#   out.npy    -- output path for the saved NumPy array
import caffe
import numpy as np
import sys

if len(sys.argv) != 3:
    # Fixed: the original usage message referenced "convert_protomean.py",
    # which is not this script's name.
    print("Usage: python binaryproto2npy.py proto.mean out.npy")
    sys.exit()

blob = caffe.proto.caffe_pb2.BlobProto()
# Read the raw protobuf bytes; "with" guarantees the handle is closed
# (the original left the file open).
with open(sys.argv[1], 'rb') as f:
    data = f.read()
blob.ParseFromString(data)

# blobproto_to_array appears to return a batched array -- arr[0] keeps the
# single mean image (TODO confirm against the caffe.io API).
arr = np.array(caffe.io.blobproto_to_array(blob))
out = arr[0]
np.save(sys.argv[2], out)
digits_classify_folder.py . . . . . . 5 KB . . . . . . 5/5/2022 12:04:28 PM
# Using digits to classify images under a specific folder
# Copyright (c) 2017, zhangys@zjgsu.edu.cn
# requires NVidia Digits to be installed
#
# Script purpose: split a flat file list of images into fixed-size chunks,
# run DIGITS/Caffe batch inference on each chunk, and copy every image into
# TARGET_FOLDER/<predicted-class-index>/.

# Master list of images: one absolute image path per line.
SOURCE_FOLDER = "/home/zys/seagate/Extra_20170723/"
SOURCE_FILE_LIST = "/home/zys/seagate/Extra_20170723.txt"
# NOTE(review): REF_FOLDER is assigned here but never read in this script.
REF_FOLDER = '' #"/home/zys/seagate/ROP_Classified_20170603_Combined/images"+"/" # the files with the same name in this folder will not be processed
# Classified images are copied here, one subfolder per predicted class index.
TARGET_FOLDER = '/home/zys/seagate/Extra_Selected_20170723/' #"/home/zys/seagate/ROP_201707/A"+"/"
# Directory holding the trained DIGITS model/dataset jobs.
JOBS_DIR = "/home/zys/DIGITS-master/digits/jobs" # "/var/lib/digits/jobs/"

import os
import sys
import shutil

# Make the pycaffe bindings importable; CAFFE_ROOT must be set in the environment.
caffe_root = os.environ['CAFFE_ROOT']
sys.path.insert(0, caffe_root + 'python')

# Add path for DIGITS package (three directory levels above this file).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import digits.config
from digits.inference.errors import InferenceError
from digits.job import Job
from digits import utils

# To solve error "Check failed: error == cudaSuccess (10 vs. 0) invalid device ordinal"
import caffe
caffe.set_device(0)


def nv_digits_infer(input_list, output_dir, jobs_dir, model_id, epoch, batch_size, layers, gpu):
    """
    Perform inference on a list of images using the specified model.

    input_list -- text file with one image path per line
    output_dir -- unused by this function (kept for call-site compatibility)
    jobs_dir   -- DIGITS jobs directory; 'none' means use the DIGITS config value
    model_id   -- DIGITS model job id (subdirectory of jobs_dir)
    epoch      -- snapshot epoch to load; -1 selects the latest snapshot
    batch_size -- unused by this function
    layers     -- must be 'none'; anything else raises InferenceError
    gpu        -- GPU id forwarded to infer_many

    Returns the model's "softmax" output, one probability row per loaded image.
    Raises InferenceError when no matching snapshot exists or layers != 'none'.
    """
    # job directory defaults to that defined in DIGITS config
    if jobs_dir == 'none':
        jobs_dir = digits.config.config_value('jobs_dir')

    # load model job
    model_dir = os.path.join(jobs_dir, model_id)
    assert os.path.isdir(model_dir), "Model dir %s does not exist" % model_dir
    model = Job.load(model_dir)

    # load dataset job and attach it to every task of the model
    dataset_dir = os.path.join(jobs_dir, model.dataset_id)
    assert os.path.isdir(dataset_dir), "Dataset dir %s does not exist" % dataset_dir
    dataset = Job.load(dataset_dir)
    for task in model.tasks:
        task.dataset = dataset

    # retrieve snapshot file
    task = model.train_task()
    snapshot_filename = None
    epoch = float(epoch)
    if epoch == -1 and len(task.snapshots):
        # use last epoch
        epoch = task.snapshots[-1][1]
        snapshot_filename = task.snapshots[-1][0]
    else:
        for f, e in task.snapshots:
            if e == epoch:
                snapshot_filename = f
                break
    if not snapshot_filename:
        raise InferenceError("Unable to find snapshot for epoch=%s" % repr(epoch))

    # retrieve image dimensions and resize mode from the dataset job
    image_dims = dataset.get_feature_dims()
    height = image_dims[0]
    width = image_dims[1]
    channels = image_dims[2]
    resize_mode = dataset.resize_mode if hasattr(dataset, 'resize_mode') else 'squash'

    n_input_samples = 0  # number of samples we were able to load
    input_ids = []       # indices of samples within file list
    input_data = []      # sample data

    # load paths from file
    paths = None
    with open(input_list) as infile:
        paths = infile.readlines()
    # load and resize images; unreadable images are reported and skipped
    for idx, path in enumerate(paths):
        path = path.strip()
        try:
            image = utils.image.load_image(path.strip())
            image = utils.image.resize_image(
                image, height, width,
                channels=channels,
                resize_mode=resize_mode)
            input_ids.append(idx)
            input_data.append(image)
            n_input_samples = n_input_samples + 1
        except utils.errors.LoadImageError as e:
            print e

    # perform inference
    if layers != 'none':
        raise InferenceError("Layer visualization is not supported for multiple inference")
    outputs = model.train_task().infer_many(
        input_data, snapshot_epoch=epoch, gpu=gpu, resize=True)
    return outputs["softmax"]


def nv_digits_classify(filelist, model):
    """
    Run inference on every image in `filelist` with the named model and copy
    each image into TARGET_FOLDER/<argmax-class-index>/.
    """
    # NOTE(review): DICT is not defined anywhere in this script; it is defined
    # in digits_predict_folder.py -- presumably the two files were merged or
    # run in one namespace. Confirm before reusing.
    DIGITS_JOB_ID = DICT[model]
    softmax = nv_digits_infer(filelist, "/home/zys/data/C3R/test/tmp/", JOBS_DIR, DIGITS_JOB_ID, -1, 1, 'none', 0)
    # predicted class index = argmax of each softmax row
    cls = []
    for idx, p in enumerate(softmax):
        cls.append(p.argmax())
    i = 0
    with open(filelist) as f:
        for line in f:
            line = line.strip()
            c = str(cls[i])
            i += 1
            directory = TARGET_FOLDER + "/" + c;
            if not os.path.exists(directory):
                os.makedirs(directory)
            shutil.copyfile(line, directory + "/" + os.path.basename(line))


def split_filelist(filelist, MAX_LINE_PER_FILE):
    """
    Split `filelist` into chunk files of at most MAX_LINE_PER_FILE lines each,
    named <filelist>0, <filelist>1, ...  Returns the list of chunk file paths.
    """
    i = 0
    fidx = 0
    files = []
    currentfile = None
    with open(filelist) as f:
        for line in f:
            # start a new chunk file every MAX_LINE_PER_FILE lines
            if (i % MAX_LINE_PER_FILE == 0):
                if (currentfile is not None):
                    currentfile.close()
                currentfile = open(filelist + str(fidx), 'w')
                files.append(filelist + str(fidx))
                fidx += 1
            line = line.strip()
            currentfile.write("%s\n" % line)
            i += 1
    return files


# nv_digits_classify(SOURCE_FILE_LIST, "VGG-16")

# ---- driver: split the master list into 500-line chunks and classify each ----
files = split_filelist(SOURCE_FILE_LIST, 500)
i = 0
import time
caffe.set_mode_gpu()
# call in the shell: Every 3 rounds, will throw error. So use for loop to handle it.
# for((i=30;i<=132;i=i+3));do sudo python ...py $i;done
# sys.argv[1] is the chunk index to resume from; earlier chunks are skipped.
print ("------", sys.argv[1], "------")
for f in files:
    if (i >= int(sys.argv[1])):
        nv_digits_classify(f, "VGG-16")
        print(i, "*************", f, " Finished *************")
    i += 1;
    # if(i%5==0): # In GPU mode, CUDA handle is not released immediately
    #     caffe.set_mode_gpu()
    # else:
    #     caffe.set_mode_cpu()
    # i+=1
    time.sleep(0)
digits_predict_folder.py . . . . . . 12 KB . . . . . . 5/5/2022 12:04:28 PM
# Using digits to classify images in the test data set
# Copyright (c) 2017, zhangys@zjgsu.edu.cn
# requires NVidia Digits to be installed
#
# Script purpose: run each trained DNN on the held-out test set, dump the
# softmax probabilities, and produce per-model ROC / precision-recall curves,
# confusion-matrix statistics, and the misclassified (FP/FN) image folders.

# Map from human-readable model name to its DIGITS job id.
DICT = {}
DICT["VGG-16_transfer_learning"] = "20170719-171742-5c31"
DICT["VGG-16_from_scratch"] = '20170722-143332-f077'
DICT["GoogLeNet_from_scratch"] = '20170722-113606-c59c'
DICT["GoogLeNet_transfer_learning"] = '20170722-190454-1af1' # '20170719-185942-bf65'
DICT["AlexNet_transfer_learning"] = '20170803-121318-b153'#'20170802-164043-96f9'# '20170722-205930-3ac4' # '20170720-094939-b665'
DICT["AlexNet_from_scratch"]= '20170802-163356-7f3d'#'20170802-162531-3531' #'20170722-152531-da8d'

def GetDictKeys():
    """Return the model names (used by the companion shell script to index models)."""
    return DICT.keys()

# Test-set image paths and their ground-truth labels, one per line, same order.
FILE_LIST="/home/zys/Desktop/test2/data/FileList.txt"
LABEL_LIST="/home/zys/Desktop/test2/data/LabelList.txt"
JOBS_DIR = "/home/zys/DIGITS-master/digits/jobs" # "/var/lib/digits/jobs/"

import os
import sys
caffe_root = os.environ['CAFFE_ROOT'] # '/home/zys/nv-caffe-0.15' # # When use sudo, the environ var may be different than the current terminal
sys.path.insert(0,caffe_root+'python')

import numpy
import pandas
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from pandas_ml import ConfusionMatrix
import matplotlib.pyplot as plt
import shutil
import inspect

# Add path for DIGITS package (three directory levels above this file).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import digits.config
from digits.inference.errors import InferenceError
from digits.job import Job
from digits import utils

# To solve error "Check failed: error == cudaSuccess (10 vs. 0) invalid device ordinal"
import caffe
print (caffe.__file__)
caffe.set_device(0)


def nv_digits_infer(input_list, output_dir, jobs_dir, model_id, epoch, batch_size, layers, gpu):
    """
    Perform inference on a list of images using the specified model.

    input_list -- text file with one image path per line
    output_dir -- unused by this function (kept for call-site compatibility)
    jobs_dir   -- DIGITS jobs directory; 'none' means use the DIGITS config value
    model_id   -- DIGITS model job id (subdirectory of jobs_dir)
    epoch      -- snapshot epoch to load; -1 selects the latest snapshot
    batch_size -- unused by this function
    layers     -- must be 'none'; anything else raises InferenceError
    gpu        -- GPU id forwarded to infer_many

    Returns the model's "softmax" output, one probability row per loaded image.
    """
    # job directory defaults to that defined in DIGITS config
    if jobs_dir == 'none':
        jobs_dir = digits.config.config_value('jobs_dir')

    # load model job
    model_dir = os.path.join(jobs_dir, model_id)
    assert os.path.isdir(model_dir), "Model dir %s does not exist" % model_dir
    model = Job.load(model_dir)

    # load dataset job and attach it to every task of the model
    dataset_dir = os.path.join(jobs_dir, model.dataset_id)
    assert os.path.isdir(dataset_dir), "Dataset dir %s does not exist" % dataset_dir
    dataset = Job.load(dataset_dir)
    for task in model.tasks:
        task.dataset = dataset

    # retrieve snapshot file
    task = model.train_task()
    snapshot_filename = None
    epoch = float(epoch)
    if epoch == -1 and len(task.snapshots):
        # use last epoch
        epoch = task.snapshots[-1][1]
        snapshot_filename = task.snapshots[-1][0]
    else:
        for f, e in task.snapshots:
            if e == epoch:
                snapshot_filename = f
                break
    if not snapshot_filename:
        raise InferenceError("Unable to find snapshot for epoch=%s" % repr(epoch))

    # retrieve image dimensions and resize mode from the dataset job
    image_dims = dataset.get_feature_dims()
    height = image_dims[0]
    width = image_dims[1]
    channels = image_dims[2]
    resize_mode = dataset.resize_mode if hasattr(dataset, 'resize_mode') else 'squash'

    n_input_samples = 0  # number of samples we were able to load
    input_ids = []       # indices of samples within file list
    input_data = []      # sample data

    # load paths from file
    paths = None
    with open(input_list) as infile:
        paths = infile.readlines()
    # load and resize images; unreadable images are reported and skipped
    for idx, path in enumerate(paths):
        path = path.strip()
        try:
            image = utils.image.load_image(path.strip())
            image = utils.image.resize_image(
                image, height, width,
                channels=channels,
                resize_mode=resize_mode)
            input_ids.append(idx)
            input_data.append(image)
            n_input_samples = n_input_samples + 1
        except utils.errors.LoadImageError as e:
            print e

    # perform inference
    if layers != 'none':
        raise InferenceError("Layer visualization is not supported for multiple inference")
    outputs = model.train_task().infer_many(
        input_data, snapshot_epoch=epoch, gpu=gpu, resize=True)
    return outputs["softmax"]


def nv_digits_infer_one_round(threshold, softmax1, softmax2):
    """
    Evaluate one classification threshold against two softmax sets:
    softmax1 = rows for ground-truth-positive images (class 0 = disease),
    softmax2 = rows for ground-truth-negative images.
    A row is predicted positive when probs[0] > threshold.
    Prints the confusion counts and derived measures, returns (tpr, fpr).
    """
    predicts1 = []
    for idx, probs in enumerate(softmax1):
        # print paths[idx], probs[0], probs[1], probs[2]
        prediction = False
        if(probs[0]>threshold):
            prediction = True
        # predictions.append(probs.argmax())
        predicts1.append(prediction)
    predicts2 = []
    for idx, probs in enumerate(softmax2):
        # print paths[idx], probs[0], probs[1], probs[2]
        prediction = False
        if(probs[0]>threshold):
            prediction = True
        # predictions.append(probs.argmax())
        predicts2.append(prediction)
    P = len(predicts1)
    N = len(predicts2)
    tp = predicts1.count(True)
    fn = predicts1.count(False)
    tn = predicts2.count(False)
    fp = predicts2.count(True)
    precision = 1.0*tp/(tp+fp) #ppv
    recall = 1.0*tp/(tp+fn)
    f1 = 2.0*precision*recall/(precision+recall)
    # NOTE(review): count(0) / count(1) match False / True respectively
    # (0 == False in Python), so accuracy1 is the fraction predicted negative
    # among positives -- looks inverted; confirm intent.
    accuracy1 = (predicts1.count(0) * 1.0 / P)
    accuracy2 = (predicts2.count(1) * 1.0 / N)
    accuracy = (tp + tn)*1.0/(P+N)
    sensativity = 1.0*tp/(tp+fn) #tpr, recall ("sensativity" [sic])
    specificity = 1.0*tn/(fp+tn) #tnr
    # fpr is assigned three equivalent ways; only the last one survives.
    fpr = 1.0*fp/N # fp/(fp+tn) = 1.0-specificity
    fpr = 1.0*fp/(fp+tn)
    fpr = 1-specificity
    tpr = sensativity
    recall = sensativity
    print P, N, threshold, accuracy1, accuracy2, accuracy, tp, tn, fp, fn, precision, recall, f1, sensativity, specificity, fpr
    return tpr, fpr


def nv_digits_infer_test_model(model):
    """
    Run the named model over FILE_LIST, save its softmax rows to
    <model>_softmax_probs.txt, and analyze against LABEL_LIST.
    Label convention: a line containing '0' means positive (True).
    """
    print ('####### ' + model + ' #########')
    labels = []
    with open(LABEL_LIST) as f:
        for line in f:
            if('0' in line):
                labels.append(True)
            elif('1' in line):
                labels.append(False)
            else:
                print "Unrecognized label!"
    DIGITS_JOB_ID = DICT[model]
    softmax = nv_digits_infer(FILE_LIST, "/home/zys/data/C3R/test/tmp/", JOBS_DIR, DIGITS_JOB_ID, -1, 1, 'none', 0)
    lines=[]
    probs = []
    for idx, p in enumerate(softmax):
        # p[0] is used as the positive-class (disease) probability
        probs.append(p[0])
        lines.append(str(p[0])+" "+str(p[1])+" "+str(p[2]))
    with open(model+'_softmax_probs.txt', 'w') as the_file:
        for line in lines:
            the_file.write(line+'\n')
    nv_digits_infer_test_model_analyze_result(probs, labels, model)


def nv_digits_infer_test_model2(fname1, fname2, model):
    """
    Analyze results exported from the DIGITS web UI instead of live inference.
    fname1 -- DIGITS result dump; 8-field lines contain "Disease <pct>%"
    fname2 -- ground-truth label list (same convention as LABEL_LIST)
    """
    probs=[]
    with open(fname1) as f:
        for line in f:
            fields = line.split()
            if(len(fields) == 8):
                ind = fields.index("Disease")
                # percentage string like "97.3%" -> probability 0.973
                v = float(fields[ind+1].strip('%'))/100
                probs.append(v)
    labels = []
    with open(fname2) as f:
        for line in f:
            if('0' in line):
                labels.append(True)
            elif('1' in line):
                labels.append(False)
            else:
                print "Unrecognized label!"
    nv_digits_infer_test_model_analyze_result(probs, labels, model)


def nv_digits_infer_test_model_analyze_result(probs, labels, model):
    """
    Produce the full evaluation for one model: optimal-cutoff confusion matrix
    (written to <model>.txt), FP/FN image folders, and ROC / P-R curve PNGs.
    """
    # fprs = []
    # tprs = []
    # #for threshold in range(1,1000):
    #     tpr, fpr = nv_digits_infer_one_round(threshold*0.001, softmax1, softmax2)
    #     fprs.append(fpr)
    #     tprs.append(tpr)
    y_actu = labels
    y_pred = probs
    #### CONFUSION MATRIX ####
    #cm = ConfusionMatrix(y_actu, y_pred)
    #print cm
    #cm.print_stats()
    #### ROC Curve #####
    fpr, tpr, thresholds = roc_curve(y_actu, y_pred, pos_label=True)
    # The Youden index J (Youden, 1950) is defined as:
    #   J = max { sensitivity_c + specificity_c - 1 }
    # where c ranges over all possible criterion values.
    # Graphically, J is the maximum vertical distance between the ROC curve
    # and the diagonal line. The criterion value corresponding with J is the
    # optimal criterion value only when disease prevalence is 50%, equal
    # weight is given to sensitivity and specificity, and costs are ignored.
    youden = tpr - fpr # = tpr + (1-fpr) -1 # Sensitivity + Specificity - 1
    best_ind = youden.argmax()
    best_threshold = thresholds[best_ind]
    # redirect stdout so cm.print_stats() lands in <model>.txt
    orig_stdout = sys.stdout
    f = open(model + '.txt', 'w')
    sys.stdout = f
    print 'ROC curve optimal cutoff ', model, ' = ', best_threshold
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > best_threshold else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    # Get FP and FN samples: copy each misclassified image (prefixed with its
    # predicted probability) into <model>_FP / <model>_FN next to this script.
    cwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    directoryFP = cwd + '/' + model+'_FP'
    if os.path.exists(directoryFP):
        shutil.rmtree(directoryFP)
    os.mkdir(directoryFP)
    directoryFN = cwd + '/' + model+'_FN'
    if os.path.exists(directoryFN):
        shutil.rmtree(directoryFN)
    os.mkdir(directoryFN)
    i=0
    with open(FILE_LIST) as f:
        for line in f:
            line = line.strip()
            pred_label = y_pred_labels[i]
            actu_label = y_actu[i]
            basename = os.path.basename(line)
            new_file_name="{0:.3f}".format(y_pred[i]) + "_" +basename
            if(actu_label == True and pred_label == False):
                shutil.copyfile(line, directoryFN+"/"+ new_file_name)
            if(actu_label == False and pred_label == True):
                shutil.copyfile(line, directoryFP+"/"+ new_file_name)
            i+=1
    roc_auc = auc(fpr, tpr)
    print 'ROC AUC = ', roc_auc
    # Use precision recall curve for skewed / unbalanced data
    precision, recall, pr_thresholds = precision_recall_curve(y_actu, y_pred, pos_label=True)
    f1 = 2.0*precision*recall/(precision+recall)
    pr_best_ind = f1.argmax()
    pr_best_threshold = pr_thresholds[pr_best_ind]
    print 'precision_recall_curve optimal cutoff ', model, ' = ', pr_best_threshold
    sys.stdout = orig_stdout
    f.close()
    # ROC curve plot
    plt.figure()
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.3f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.plot(fpr[best_ind], tpr[best_ind], 'ro', label='optimal cutoff') # (probability = %0.3f)' % (best_threshold))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve (' + model + ') ' )
    plt.legend(loc="lower right")
    plt.savefig(model + "_roc_curve.png")
    #plt.show()
    plt.close()
    # Precision-recall curve plot
    # pr_auc=auc(precision, recall)
    plt.figure()
    plt.plot(recall, precision, lw=1, label='P-R Curve')
    # plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    # plt.plot(recall[pr_best_ind], precision[pr_best_ind], 'ro') #, label='best threshold (%0.3f)' % (best_threshold))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.title('Precision-Recall Curve (' + model + ') ' )
    plt.legend(loc="lower right")
    plt.savefig(model + "_pr_curve.png")
    #plt.show()
    plt.close()


def nv_digits_infer_test_all_models():
    """Evaluate every model in DICT, each on its own matplotlib figure."""
    i = 0
    for key in DICT:
        plt.figure(i)
        i += 1
        nv_digits_infer_test_model(key)


# nv_digits_infer_test_all_models()
# special treatment for GoogLeNet_transfer_learning. It requires nv-caffe-0.15
# nv_digits_infer_test_model2("/home/zys/Desktop/scripts/GoogLeNet_ResultsFromDigits.txt", LABEL_LIST, "GoogLeNet_Transfer_Learning")
# nv_digits_infer_test_model2("AlexNet_T1.txt","AlexNet_T2.txt", "AlexNet")
# nv_digits_infer_test_model(DICT.keys()[int(sys.argv[1])])
# nv_digits_infer_test_model2("/home/zys/Desktop/scripts/GoogLeNet_ResultsFromDigits.txt", "/home/zys/Desktop/test2/data/LabelList.txt", "GoogLeNet_Transfer_Learning")
digits_predict_folder.sh . . . . . . 1 KB . . . . . . 5/5/2022 12:04:28 PM
#!/bin/bash
# This is a linux shell script, which calls digits_predict_folder.py to batch
# process multiple DNN models. Each model runs in its own Python process.
# sudo python -c 'import digits_predict_folder; digits_predict_folder.nv_digits_infer_test_model("AlexNet_from_scratch")'
# sudo python -c 'import digits_predict_folder; digits_predict_folder.nv_digits_infer_test_model("AlexNet_transfer_learning")'

sudo python -c 'import digits_predict_folder; keys=digits_predict_folder.GetDictKeys(); digits_predict_folder.nv_digits_infer_test_model(keys[0])'
sudo python -c 'import digits_predict_folder; keys=digits_predict_folder.GetDictKeys(); digits_predict_folder.nv_digits_infer_test_model(keys[1])'
sudo python -c 'import digits_predict_folder; keys=digits_predict_folder.GetDictKeys(); digits_predict_folder.nv_digits_infer_test_model(keys[2])'
# NOTE(review): keys[3] is skipped here -- presumably it is the
# GoogLeNet_transfer_learning entry handled separately below; confirm, since
# dict key order is not guaranteed in Python 2.
sudo python -c 'import digits_predict_folder; keys=digits_predict_folder.GetDictKeys(); digits_predict_folder.nv_digits_infer_test_model(keys[4])'
sudo python -c 'import digits_predict_folder; keys=digits_predict_folder.GetDictKeys(); digits_predict_folder.nv_digits_infer_test_model(keys[5])'
# Fixed: removed a stray unmatched single quote that was here in the original;
# it swallowed the quoting of the command below and broke the script.

# special treatment for GoogLeNet_transfer_learning. It requires nv-caffe-0.15
python -c 'import digits_predict_folder; digits_predict_folder.nv_digits_infer_test_model2("/home/zys/Desktop/scripts/GoogLeNet_ResultsFromDigits.txt", "/home/zys/Desktop/test2/data/LabelList.txt", "GoogLeNet_transfer_learning")'

# The sudo runs above leave output files owned by root; hand them back.
sudo chown $USER -R .
digits_predict_folder_with_human_experts.py . . . . . . 16 KB . . . . . . 5/5/2022 12:04:28 PM
# Analyze DNN performance with human experts
# Copyright (c) 2017, zhangys@zjgsu.edu.cn
# requires NVidia Digits to be installed
#
# Script purpose: evaluate the chosen DNN on the test set and overlay the
# operating points of individual human experts (one CSV file per expert in
# human_results_dir) on the DNN's ROC and precision-recall curves.

# Map from model name to DIGITS job id. "DNN_vs_Human" reuses the same job id
# as VGG-16_transfer_learning.
DICT = {}
DICT["DNN_vs_Human"] = "20170719-171742-5c31"
DICT["VGG-16_transfer_learning"] = "20170719-171742-5c31"
DICT["VGG-16_from_scratch"] = '20170722-143332-f077'
DICT["GoogLeNet_from_scratch"] = '20170722-113606-c59c'
DICT["GoogLeNet_transfer_learning"] = '20170722-190454-1af1' # '20170719-185942-bf65'
DICT["AlexNet_transfer_learning"] = '20170803-121318-b153'#'20170802-164043-96f9'# '20170722-205930-3ac4' # '20170720-094939-b665'
DICT["AlexNet_from_scratch"]= '20170802-163356-7f3d'#'20170802-162531-3531' #'20170722-152531-da8d'

def GetDictKeys():
    """Return the model names (used by the companion shell script)."""
    return DICT.keys()

# Test-set image paths and their ground-truth labels, one per line, same order.
FILE_LIST="/home/zys/Desktop/test2/data/FileList.txt"
LABEL_LIST="/home/zys/Desktop/test2/data/LabelList.txt"
JOBS_DIR = "/home/zys/DIGITS-master/digits/jobs" # "/var/lib/digits/jobs/"

import os
import sys
caffe_root = os.environ['CAFFE_ROOT'] # '/home/zys/nv-caffe-0.15' # # When use sudo, the environ var may be different than the current terminal
sys.path.insert(0,caffe_root+'python')

import numpy
import pandas
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from pandas_ml import ConfusionMatrix
import matplotlib.pyplot as plt
import shutil
import inspect

# Add path for DIGITS package (three directory levels above this file).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
import digits.config
from digits.inference.errors import InferenceError
from digits.job import Job
from digits import utils

# To solve error "Check failed: error == cudaSuccess (10 vs. 0) invalid device ordinal"
import caffe
print (caffe.__file__)
caffe.set_device(0)


def nv_digits_infer(input_list, output_dir, jobs_dir, model_id, epoch, batch_size, layers, gpu):
    """
    Perform inference on a list of images using the specified model.
    (Same implementation as in digits_predict_folder.py: loads the DIGITS
    model and dataset jobs, picks the snapshot, loads/resizes each image,
    and returns the "softmax" output rows. output_dir and batch_size are
    unused; layers must be 'none'.)
    """
    # job directory defaults to that defined in DIGITS config
    if jobs_dir == 'none':
        jobs_dir = digits.config.config_value('jobs_dir')

    # load model job
    model_dir = os.path.join(jobs_dir, model_id)
    assert os.path.isdir(model_dir), "Model dir %s does not exist" % model_dir
    model = Job.load(model_dir)

    # load dataset job and attach it to every task of the model
    dataset_dir = os.path.join(jobs_dir, model.dataset_id)
    assert os.path.isdir(dataset_dir), "Dataset dir %s does not exist" % dataset_dir
    dataset = Job.load(dataset_dir)
    for task in model.tasks:
        task.dataset = dataset

    # retrieve snapshot file
    task = model.train_task()
    snapshot_filename = None
    epoch = float(epoch)
    if epoch == -1 and len(task.snapshots):
        # use last epoch
        epoch = task.snapshots[-1][1]
        snapshot_filename = task.snapshots[-1][0]
    else:
        for f, e in task.snapshots:
            if e == epoch:
                snapshot_filename = f
                break
    if not snapshot_filename:
        raise InferenceError("Unable to find snapshot for epoch=%s" % repr(epoch))

    # retrieve image dimensions and resize mode from the dataset job
    image_dims = dataset.get_feature_dims()
    height = image_dims[0]
    width = image_dims[1]
    channels = image_dims[2]
    resize_mode = dataset.resize_mode if hasattr(dataset, 'resize_mode') else 'squash'

    n_input_samples = 0  # number of samples we were able to load
    input_ids = []       # indices of samples within file list
    input_data = []      # sample data

    # load paths from file
    paths = None
    with open(input_list) as infile:
        paths = infile.readlines()
    # load and resize images; unreadable images are reported and skipped
    for idx, path in enumerate(paths):
        path = path.strip()
        try:
            image = utils.image.load_image(path.strip())
            image = utils.image.resize_image(
                image, height, width,
                channels=channels,
                resize_mode=resize_mode)
            input_ids.append(idx)
            input_data.append(image)
            n_input_samples = n_input_samples + 1
        except utils.errors.LoadImageError as e:
            print e

    # perform inference
    if layers != 'none':
        raise InferenceError("Layer visualization is not supported for multiple inference")
    outputs = model.train_task().infer_many(
        input_data, snapshot_epoch=epoch, gpu=gpu, resize=True)
    return outputs["softmax"]


def nv_digits_infer_test_model(model, human_results_dir):
    """
    Run the named model over FILE_LIST, dump its softmax rows, compute each
    human expert's operating point from the CSV files ("filename,label" lines)
    in human_results_dir, then hand everything to the analysis/plot routine.
    Label convention: a LABEL_LIST line containing '0' means positive (True).
    """
    print ('####### ' + model + ' #########')
    labels = []
    with open(LABEL_LIST) as f:
        for line in f:
            if('0' in line):
                labels.append(True)
            elif('1' in line):
                labels.append(False)
            else:
                print "Unrecognized label!"
    DIGITS_JOB_ID = DICT[model]
    softmax = nv_digits_infer(FILE_LIST, "/home/zys/data/C3R/test/tmp/", JOBS_DIR, DIGITS_JOB_ID, -1, 1, 'none', 0)
    lines=[]
    probs = []
    for idx, p in enumerate(softmax):
        # p[0] is used as the positive-class (disease) probability
        probs.append(p[0])
        lines.append(str(p[0])+" "+str(p[1])+" "+str(p[2]))
    with open(model+'_softmax_probs.txt', 'w') as the_file:
        for line in lines:
            the_file.write(line+'\n')
    # map image basename -> ground-truth label, for matching expert CSV rows
    files=[]
    with open(FILE_LIST) as f:
        for line in f:
            line = line.strip()
            files.append(line)
    dic = {}
    i=0
    for f in files:
        dic[os.path.basename(f)] = labels[i]
        i+=1
    humanTitles = []
    humanTPR = []
    humanFPR = []
    humanPrecision=[]
    humanRecall = []
    # one result file per human expert: "filename,label" per line
    for fn in os.listdir(human_results_dir):
        ks=[]
        ls=[]
        print "\n\n******", fn, "******"
        humanTitles.append(fn)
        with open(os.path.join(human_results_dir, fn)) as f:
            for line in f:
                line = line.strip()
                k = line.split(',')[0]
                l = line.split(',')[1]
                # keep only the first grading of each image
                # NOTE(review): nesting reconstructed to match the dedup logic
                # in nv_digits_infer_test_model3 -- confirm against the original
                # file layout.
                if(k not in ks):
                    ks.append(k)
                    if(l=='1'):
                        ls.append(True)
                    else:
                        ls.append(False)
        rs=[]
        for k in ks:
            rs.append(dic[k])
        cm = ConfusionMatrix(rs, ls)
        # print cm
        cm.print_stats()
        s= cm.stats()
        humanTPR.append(s['TPR'])
        humanFPR.append(s['FPR']) # 1- TNR
        humanPrecision.append(s['PPV'])
        humanRecall.append(s['TPR'])
    nv_digits_infer_test_model_analyze_result(probs, labels, model, humanTPR, humanFPR, humanPrecision, humanRecall)


def nv_digits_infer_test_model3(fname1, fname2, model, human_results_dir):
    """
    Same comparison, but with DNN probabilities parsed from a DIGITS web-UI
    result dump (fname1; 8-field lines contain "Disease <pct>%") instead of
    live inference. Also writes a per-expert summary and the misclassified
    file list to <model>_humans.txt.
    """
    probs=[]
    with open(fname1) as f:
        for line in f:
            fields = line.split()
            if(len(fields) == 8):
                ind = fields.index("Disease")
                # percentage string like "97.3%" -> probability 0.973
                v = float(fields[ind+1].strip('%'))/100
                probs.append(v)
    labels = []
    with open(fname2) as f:
        for line in f:
            if('0' in line):
                labels.append(True)
            elif('1' in line):
                labels.append(False)
            else:
                print "Unrecognized label!"
    # map image basename -> ground-truth label
    files=[]
    with open(FILE_LIST) as f:
        for line in f:
            line = line.strip()
            files.append(line)
    dic = {}
    i=0
    for f in files:
        dic[os.path.basename(f)] = labels[i]
        i+=1
    # redirect stdout so cm.print_stats() lands in <model>_humans.txt
    orig_stdout = sys.stdout
    log = open(model + '_humans.txt', 'w')
    sys.stdout = log
    humanTitles = []
    humanTPR = []
    humanFPR = []
    humanPrecision=[]
    humanRecall = []
    humanTNR=[]
    humanF1=[]
    humanMCC=[]
    humanInformedness=[]
    humanPPV=[]
    for fn in os.listdir(human_results_dir):
        ks=[]
        ls=[]
        rs=[]
        log.write( "\n\n******" + fn + "******\n")
        log.write( "\n---- Misclassified Files (filename, truth, label)---\n")
        humanTitles.append(fn)
        with open(os.path.join(human_results_dir, fn)) as f:
            for line in f:
                line = line.strip()
                k = line.split(',')[0]
                l = line.split(',')[1]
                # keep only the first grading of each image; log disagreements
                if(k not in ks):
                    ks.append(k)
                    b=(l=='1')
                    ls.append(b)
                    rs.append(dic[k])
                    if(b!=dic[k]):
                        log.write(k + '\t' + str( dic[k]) + '\t' + str(b) + '\n')
        cm = ConfusionMatrix(rs, ls)
        # print cm
        cm.print_stats()
        s= cm.stats()
        humanTPR.append(s['TPR'])
        humanFPR.append(s['FPR']) # 1- TNR
        humanPrecision.append(s['PPV'])
        humanRecall.append(s['TPR'])
        humanTNR.append(s['TNR'])
        humanF1.append(s['F1_score'])
        humanMCC.append(s['MCC'])
        humanInformedness.append(s['informedness'])
        humanPPV.append(s['PPV'])
    log.write( '\n\n ******** Summary *******\n')
    # NOTE(review): the header string below contains "\t\FPR" and
    # "informednessn" -- apparent typos, left byte-identical since they are
    # runtime output.
    log.write( "\nExpert\tTPR\t\FPR\tPPV\tTNR\tF1_score\tMCC\tinformednessn\n")
    i=0
    for human in humanTitles:
        print human, '\t', humanTPR[i], '\t', humanFPR[i], '\t', humanPPV[i], '\t', humanTNR[i], '\t', humanF1[i], '\t', humanMCC[i], '\t',humanInformedness[i], '\n\n'
        i+=1
    sys.stdout = orig_stdout
    log.close()
    nv_digits_infer_test_model_analyze_result(probs, labels, model, humanTPR, humanFPR, humanPrecision, humanRecall)


def nv_digits_infer_test_model_analyze_result(probs, labels, model, humanTPR, humanFPR, humanPrecision, humanRecall):
    """
    Evaluate the DNN at several hard-coded cutoffs plus the ROC-optimal one
    (stats to <model>.txt), dump FP/FN images at cutoff 0.519, and plot the
    ROC and P-R curves with the human experts' points overlaid.
    """
    # fprs = []
    # tprs = []
    # #for threshold in range(1,1000):
    #     tpr, fpr = nv_digits_infer_one_round(threshold*0.001, softmax1, softmax2)
    #     fprs.append(fpr)
    #     tprs.append(tpr)
    y_actu = labels
    y_pred = probs
    #### CONFUSION MATRIX ####
    #cm = ConfusionMatrix(y_actu, y_pred)
    #print cm
    #cm.print_stats()
    #### ROC Curve #####
    fpr, tpr, thresholds = roc_curve(y_actu, y_pred, pos_label=True)
    # The Youden index J (Youden, 1950) is defined as
    #   J = max { sensitivity_c + specificity_c - 1 }
    # over all criterion values c: the maximum vertical distance between the
    # ROC curve and the diagonal. It is the optimal criterion only when
    # prevalence is 50%, sensitivity and specificity are weighted equally,
    # and decision costs are ignored.
    youden = tpr - fpr # = tpr + (1-fpr) -1 # Sensitivity + Specificity - 1
    best_ind = youden.argmax()
    best_threshold = thresholds[best_ind]
    # redirect stdout so cm.print_stats() lands in <model>.txt
    orig_stdout = sys.stdout
    f = open(model + '.txt', 'w')
    sys.stdout = f
    # --- evaluate a series of fixed cutoffs (values hard-coded by the authors) ---
    thresh = 0.164
    print '\nP-R curve cutoff = ', thresh
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    thresh = 0.174
    print '\nP-R curve cutoff = ', thresh
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    thresh = 0.379
    print '\nP-R curve cutoff = ', thresh
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    thresh = 0.519 # argmax(F1 score) point for VGG
    print '\nP-R curve cutoff = ', thresh
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    thresh = 0.535
    print '\nP-R curve cutoff = ', thresh
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    print '\nROC curve optimal cutoff = ', best_threshold
    # Find prediction to the dataframe applying threshold
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > best_threshold else False)
    cm = ConfusionMatrix(y_actu, y_pred_labels)
    # print cm
    cm.print_stats()
    # Get FP and FN samples at the fixed 0.519 cutoff: copy each misclassified
    # image (prefixed with its predicted probability) into <model>_FP / _FN.
    thresh = 0.519
    y_pred_labels = pandas.Series( y_pred).map(lambda x: True if x > thresh else False)
    cwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
    directoryFP = cwd + '/' + model+'_FP'
    if os.path.exists(directoryFP):
        shutil.rmtree(directoryFP)
    os.mkdir(directoryFP)
    directoryFN = cwd + '/' + model+'_FN'
    if os.path.exists(directoryFN):
        shutil.rmtree(directoryFN)
    os.mkdir(directoryFN)
    i=0
    with open(FILE_LIST) as f:
        for line in f:
            line = line.strip()
            pred_label = y_pred_labels[i]
            actu_label = y_actu[i]
            basename = os.path.basename(line)
            new_file_name="{0:.3f}".format(y_pred[i]) + "_" +basename
            if(actu_label == True and pred_label == False):
                shutil.copyfile(line, directoryFN+"/"+ new_file_name)
            if(actu_label == False and pred_label == True):
                shutil.copyfile(line, directoryFP+"/"+ new_file_name)
            i+=1
    roc_auc = auc(fpr, tpr)
    print 'ROC AUC = ', roc_auc
    # Use precision recall curve for skewed / unbalanced data
    precision, recall, pr_thresholds = precision_recall_curve(y_actu, y_pred, pos_label=True)
    f1 = 2.0*precision*recall/(precision+recall)
    pr_best_ind = f1.argmax()
    pr_best_threshold = pr_thresholds[pr_best_ind]
    print 'precision_recall_curve optimal cutoff ', model, ' = ', pr_best_threshold
    sys.stdout = orig_stdout
    f.close()
    # ROC curve plot, with human expert operating points overlaid
    plt.figure()
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.3f)' % (roc_auc))
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    # plt.plot(fpr[best_ind], tpr[best_ind], 'ro', label='optimal cutoff') # (probability = %0.3f)' % (best_threshold))
    plt.plot(humanFPR, humanTPR, 'bo', label='human experts')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve (' + model + ') ' )
    plt.legend(loc="lower right")
    plt.savefig(model + "_roc_curve.png")
    #plt.show()
    plt.close()
    # Precision-recall curve plot, with human expert points overlaid
    # pr_auc=auc(precision, recall)
    plt.figure()
    plt.plot(recall, precision, lw=1, label='P-R Curve')
    plt.plot(humanRecall, humanPrecision, 'bo', label='human experts')
    # plt.plot(recall[pr_best_ind], precision[pr_best_ind], 'ro') #, label='best threshold (%0.3f)' % (best_threshold))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.title('Precision-Recall Curve (' + model + ') ' )
    plt.legend(loc="lower right")
    plt.savefig(model + "_pr_curve.png")
    #plt.show()
    plt.close()
    # dump the raw P-R curve points to <model>_pr_curve.txt
    myfile = open(model + "_pr_curve.txt", "w")
    myfile.write('threshold\trecall\tprecision\n\n')
    i=0
    for th in pr_thresholds:
        # NOTE(review): a '\t' separator between threshold and recall appears
        # to be missing here, and myfile is never closed -- confirm against the
        # shipped DNN_pr_curve.txt.
        myfile.write(str(th) + str(recall[i]) + '\t'+ str(precision[i]) + '\n')
        i+=1


nv_digits_infer_test_model('DNN_vs_Human',"/home/zys/Desktop/scripts/human")

# Batch inference
# nv_digits_infer_test_model3("/home/zys/Desktop/scripts/GoogLeNet_ResultsFromDigits.txt", "/home/zys/Desktop/test2/data/LabelList.txt", "GoogLeNet vs Human", "/home/zys/Desktop/scripts/human")
# special treatment for GoogLeNet_transfer_learning. It requires nv-caffe-0.15. Use provided probs file