I am working on a class project where we have 10 gzip files of PubMed data, each of which has 1000 PMIDs which each have their own features like Title, Abstract, Authors, and assigned MeSH terms. I am a novice at Python and have written the below code to find every article PMID, the title and abstract words, the unigrams for both, the tfidf of both, and then use those methods to perform a linear SVC prediction on which MeSH terms should be assigned to an article.
import gzip
import math
import re
import itertools
from itertools import *
import sklearn
import numpy as np
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
import json
import lxml
from lxml import etree as Et
import re
import pandas as pd
import time
import util_5353
# Problem A [0 points]
def read_data(filenames):
data = None
# Begin CODE
data = {}
contents = []
for filename in filenames:
with gzip.open(filename,'rt') as f:
contents.append(f.read())
tween = []
pmid_list = []
for d in contents:
tween.extend(re.findall('^PMID- (.*?)SO - ', d, re.DOTALL|re.MULTILINE))
pmid_list.extend(re.findall('^PMID- (.*)', d, re.MULTILINE))
for i in range(len(tween)):
mh = re.findall('^MH - (.*)$', tween[i], re.MULTILINE)
content = tween[i].replace('\n ', ' ')
ti = re.findall('^TI - (.*)$', content, re.MULTILINE)
ab = re.findall('^AB - (.*)$', content, re.MULTILINE)
data.update({pmid_list[i]:{'Ti':ti, 'Ab':ab, 'Mh':mh}})
return data
# Problem B [0 points]
tokenizer = re.compile('\w+|[^\s\w]+')
def tokenize(text):
return tokenizer.findall(text.lower())
# Problem C [0 points]
def pmids(data):
pmids = []
# Begin CODE
for key in data:
pmids.append(key)
# End CODE
return pmids
# Problem 1 [10 points]
def unigrams(data, pmid):
unigrams = {}
# Begin CODE
article = data[pmid]
title = tokenize(article['Ti'][0])
abstract = (tokenize(article['Ab'][0]))
unique_words = (list(set(title + abstract)))
unigrams =dict(zip(unique_words,[1.0]*len(unique_words)))
# End CODE
return unigrams
# Problem 2 [10 points]
def tfidf(data, pmid):
tfidf = {}
# Begin CODE
article = data[pmid]
N = len(data)
title = tokenize(article['Ti'][0])
abstract = tokenize(article['Ab'][0])
pmid_words = title + abstract
pmid_counts = {}
for i in pmid_words:
pmid_counts[i] = pmid_counts.get(i, 0) + 1
doc_words = []
for key in data:
doc_words.extend(tokenize(data[key]['Ti'][0]))
doc_words.extend(tokenize(data[key]['Ab'][0]))
doc_counts = dict()
for i in doc_words:
doc_counts[i] = doc_counts.get(i, 0) + 1
for val in pmid_words:
tfidf.update({val:((pmid_counts[val])*math.log(N/doc_counts[val]))})
# End CODE
return tfidf
# Problem 3 [10 points]
def mesh(data, pmid):
mesh = []
# Begin CODE
work = []
article = data[pmid]
for term in article['Mh']:
work.extend(tokenize(term))
doc_words = []
i = 0
while i < len(article['Mh']):
if '/' in article['Mh'][i]:
x = article['Mh'][i]
x = x.split('/')
doc_words.append(x[0])
i+=1
else:
doc_words.append(article['Mh'][i])
i+=1
mesh = [s.replace('*', '') for s in doc_words]
# End CODE
return mesh
def outcomes(data, train):
bin_list = []
n=len(train)
for val in train:
bin_list.append(mesh(data, val))
i = 0
k = 0
outcomes = []
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
for val in mesh_list:
while i <len(bin_list):
if val in bin_list[i]:
outcomes.append('1')
i+=1
else:
outcomes.append('0')
i+=1
i = 0
outcomes = [outcomes[i:i+n] for i in range(0, len(outcomes), n)]
return outcomes
def linear_svm(data, train, test, mesh, func):
stuff = {}
pmids_list = pmids(data)
for val in pmids_list:
stuff.update({val:func(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict = []
predictions = {m:[] for m in mesh}
work = []
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work = []
i = 0
k = 0
m = len(test)
final = []
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m:[] for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
return predictions
# Problem 4 [10 points]
def svm_predict_unigram(data, train, test, mesh):
predictions = {m:[] for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, unigrams)
# End CODE
return predictions
# Problem 5 [10 points]
def svm_predict_tfidf(data, train, test, mesh):
predictions = {m:[] for m in mesh}
# Begin CODE
predictions = linear_svm(data, train, test, mesh, tfidf)
# End CODE
return predictions
# Problem 6 [10 points]
def kmeans(data, k):
clusters = {}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
X = pd.DataFrame.from_dict(stuff, orient = "index")
X = X.replace({np.nan:0})
km = KMeans(n_clusters=10, random_state=0, init = 'random').fit(X)
labels = km.labels_
clusters = {pmid_list[i]:int(labels[i]) for i in range(len(pmid_list))}
# End CODE
return clusters
# Problem 7 [10 points]
def svm_predict_cluster(data, train, test, mesh, k):
predictions = {m:[] for m in mesh}
# Begin CODE
stuff = {}
stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict = []
predictions = {m:[] for m in mesh}
work = []
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work = []
i = 0
k = 0
m = len(test)
final = []
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m:[] for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions
# Problem 8 [10 points]
def svm_predict_cluster_unigrams(data, train, test, mesh, k):
predictions = {m:[] for m in mesh}
# Begin CODE
stuff = {}
pmid_list = pmids(data)
tts = int(len(pmid_list) * 0.8)
train = pmid_list[:tts]
test = pmid_list[tts:]
stuff = {}
for val in pmid_list:
stuff.update({val:unigrams(data, val)})
k_stuff = (kmeans(data, k))
X = pd.DataFrame.from_dict(stuff, orient = "index")
X2 = pd.DataFrame.from_dict(k_stuff, orient = "index")
X = X.join(X2, how='outer')
X = X.replace({np.nan:0})
outcome_data = outcomes(data, train)
df1 = pd.DataFrame()
df2 = pd.DataFrame()
clf = LinearSVC()
predict = []
predictions = {m:[] for m in mesh}
work = []
for val in train:
df1 = df1.append(X.loc[val])
for val in test:
df2 = df2.append(X.loc[val])
for val in outcome_data:
clf.fit(df1, val)
predict.append(list(clf.predict(df2)))
work = []
i = 0
m = len(test)
final = []
for val in predict:
while i < len(val):
if val[i] == '1':
work.append(test[i])
i+=1
else:
work.append('0')
i+=1
i = 0
work = [work[i:i+m] for i in range(0, len(work), m)]
for val in work:
final.append(list(filter(lambda a: a != '0', val)))
predictions = {m:[] for m in mesh}
for i in range(0,10):
predictions.update({mesh[i]:final[i]})
# End CODE
return predictions
# Problem 9 [20 points]
def evaluate(data, test, mesh_predict):
evaluation = {}
# Begin CODE
outcome = outcomes(data, test)
final = []
i = 0
k = 0
while i < len(outcome):
while k < len(outcome[i]):
if outcome[i][k] == '1':
outcome[i][k] = test[k]
k+=1
else:
k+=1
k = 0
i+=1
for val in outcome:
final.append(list(filter(lambda a: a != '0', val)))
dic = {}
i = 0
for key in mesh_predict:
gold_vals = [pmid in final[i] for pmid in test]
predict_vals = [pmid in mesh_predict[key] for pmid in test]
recall = recall_score(gold_vals, predict_vals, average='macro')
accuracy = accuracy_score(gold_vals, predict_vals)
precision = precision_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
f1 = f1_score(gold_vals, predict_vals, average='macro',labels=np.unique(predict_vals))
dic.update({key:{'accuracy': float(accuracy), 'precision': float(precision),'recall':float(recall),'f1':float(f1)}})
evaluation.update(dic)
# End CODE
return evaluation
# Note: don't mess with this code block! Your code will be tested by an outside
# program that will not call this __main__ block. So if you mess with the
# following block of code you might crash the autograder. You're definitely
# encouraged to look at this code, however, especially if your code crashes.
if __name__ == '__main__':
# Comment out some file names to speed up the development process, but
# ultimately you want to uncomment the filenames so you ensure that your code
# works will all files. The assertions below assume that medline.0.txt.gz is
# in the list.
file_list = []
file_list.append('medline.0.txt.gz')
file_list.append('medline.1.txt.gz')
file_list.append('medline.2.txt.gz')
file_list.append('medline.3.txt.gz')
file_list.append('medline.4.txt.gz')
file_list.append('medline.5.txt.gz')
file_list.append('medline.6.txt.gz')
file_list.append('medline.7.txt.gz')
file_list.append('medline.8.txt.gz')
file_list.append('medline.9.txt.gz')
pmid_list = ['22999938', '23010078', '23018989']
print('::: Problem A :::')
data = read_data(file_list)
print('::: Problem C :::')
_pmids = pmids(data)
for pmid in pmid_list:
if pmid not in _pmids:
util_5353.die('C', 'Assertions assume PMID is present: %s', pmid)
tts = int(len(_pmids) * 0.8)
train = _pmids[:tts]
test = _pmids[tts:]
print('::: Problem 1 :::')
one_ret = unigrams(data, pmid_list[0])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(99, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['metastasis'], '1')
one_ret = unigrams(data, pmid_list[1])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(95, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['destruction'], '1')
one_ret = unigrams(data, pmid_list[2])
util_5353.assert_dict(one_ret, '1')
util_5353.assert_int_eq(133, len(one_ret), '1')
util_5353.assert_float_eq(1.0, one_ret['concurrent'], '1')
print('::: Problem 2 :::')
two_ret = tfidf(data, pmid_list[0])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(99, len(two_ret), '2')
util_5353.assert_float_range((1.5, 3.0), two_ret['metastasis'], '2')
two_ret = tfidf(data, pmid_list[1])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(95, len(two_ret), '2')
util_5353.assert_float_range((10.0, 20.0), two_ret['destruction'], '2')
two_ret = tfidf(data, pmid_list[2])
util_5353.assert_dict(two_ret, '2')
util_5353.assert_int_eq(133, len(two_ret), '2')
util_5353.assert_float_range((7.0, 10.0), two_ret['concurrent'], '2')
print('::: Problem 3 :::')
three_ret = mesh(data, pmid_list[0])
GOLD = ['Animals', 'Breast Neoplasms', 'DNA Methylation', 'DNA, Neoplasm', 'DNA-Binding Proteins', 'Dioxygenases', 'Down-Regulation', 'Female', 'Gene Expression Regulation, Neoplastic', 'Humans', 'Male', 'Mice', 'Mice, Inbred BALB C', 'Mice, Nude', 'Mixed Function Oxygenases', 'Neoplasm Invasiveness', 'Prostatic Neoplasms', 'Proto-Oncogene Proteins', 'Tissue Inhibitor of Metalloproteinase-2', 'Tissue Inhibitor of Metalloproteinase-3', 'Tumor Suppressor Proteins']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[1])
GOLD = ['Animals', 'Contrast Media', 'Gene Knockdown Techniques', 'Genetic Therapy', 'Mice', 'Mice, Inbred C3H', 'Microbubbles', 'Neoplasms, Squamous Cell', 'RNA, Small Interfering', 'Receptor, Epidermal Growth Factor', 'Sonication', 'Transfection', 'Ultrasonics', 'Ultrasonography']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
three_ret = mesh(data, pmid_list[2])
GOLD = ['Adult', 'Aged', 'Chemoradiotherapy', 'Diffusion Magnetic Resonance Imaging', 'Female', 'Humans', 'Medical Oncology', 'Middle Aged', 'Reproducibility of Results', 'Time Factors', 'Treatment Outcome', 'Tumor Burden', 'Uterine Cervical Neoplasms']
util_5353.assert_list(three_ret, len(GOLD), '3', valid_values=GOLD)
print('::: Problem 4 :::')
mesh_list = ['Humans', 'Female', 'Male', 'Animals', 'Treatment Outcome',
'Neoplasms', 'Prognosis', 'Risk Factors', 'Breast Neoplasms', 'Lung Neoplasms']
mesh_set = set()
for pmid in _pmids:
mesh_set.update(mesh(data, pmid))
for m in mesh_list:
if m not in mesh_set:
util_5353.die('4', 'Assertions assume MeSH term is present: %s', m)
four_ret = svm_predict_unigram(data, train, test, mesh_list)
util_5353.assert_dict(four_ret, '4')
for m in mesh_list:
util_5353.assert_dict_key(four_ret, m, '4')
util_5353.assert_list(four_ret[m], None, '4', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(four_ret[m]), '4')
util_5353.assert_int_range((len(test)/2, len(test)), len(four_ret['Humans']), '4')
print('::: Problem 5 :::')
five_ret = svm_predict_tfidf(data, train, test, mesh_list)
util_5353.assert_dict(five_ret, '5')
for m in mesh_list:
util_5353.assert_dict_key(five_ret, m, '5')
util_5353.assert_list(five_ret[m], None, '5', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(five_ret[m]), '5')
util_5353.assert_int_range((len(test)/2, len(test)), len(five_ret['Humans']), '5')
print('::: Problem 6 :::')
K = 10
six_ret = kmeans(data, K)
util_5353.assert_dict(six_ret, '6')
util_5353.assert_int_eq(len(_pmids), len(six_ret), '6')
for pmid in _pmids:
util_5353.assert_dict_key(six_ret, pmid, '6')
util_5353.assert_int_range((0, K-1), six_ret[pmid], '6')
print('::: Problem 7 :::')
seven_ret = svm_predict_cluster(data, train, test, mesh_list, K)
util_5353.assert_dict(seven_ret, '7')
for m in mesh_list:
util_5353.assert_dict_key(seven_ret, m, '7')
util_5353.assert_list(seven_ret[m], None, '7', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(seven_ret[m]), '7')
util_5353.assert_int_range((len(test)/2, len(test)), len(seven_ret['Humans']), '7')
print('::: Problem 8 :::')
eight_ret = svm_predict_cluster_unigrams(data, train, test, mesh_list, K)
util_5353.assert_dict(eight_ret, '8')
for m in mesh_list:
util_5353.assert_dict_key(eight_ret, m, '8')
util_5353.assert_list(eight_ret[m], None, '8', valid_values=_pmids)
util_5353.assert_int_range((0, len(test)), len(eight_ret[m]), '8')
util_5353.assert_int_range((len(test)/2, len(test)), len(eight_ret['Humans']), '8')
print(':: Problem 9 ::')
nine_ret4 = evaluate(data, test, four_ret)
nine_ret5 = evaluate(data, test, five_ret)
nine_ret7 = evaluate(data, test, seven_ret)
nine_ret8 = evaluate(data, test, eight_ret)
for nine_ret in [nine_ret4, nine_ret5, nine_ret7, nine_ret8]:
util_5353.assert_dict(nine_ret, '9')
for m in mesh_list:
util_5353.assert_dict_key(nine_ret, m, '9')
util_5353.assert_dict(nine_ret[m], '9')
for k in ['accuracy', 'precision', 'recall', 'f1']:
util_5353.assert_dict_key(nine_ret[m], k, '9')
util_5353.assert_float(nine_ret[m][k], '9')
util_5353.assert_float_range((0.0, 1.0), nine_ret[m][k], '9')
print('~~~ All Tests Pass ~~~')
When I run the program for all 10 documents - 10,000 PMIDs, it has taken over 7 hours to get part way through the method for problem 5, svm_predict_tfidf.
Is there a way to speed this up? My professor says it takes him 4.5 minutes to run the svm_predict_tfidf method with all 10,000 IDs.
Comments
Post a Comment