# -*- coding: utf-8 -*-
"""
Created on Sun Jan 16 08:44:06 2017
@author: John
"""
#OK as of 1/16 this covers the basics from last week, then shows how to:
#read files from the web or a desktop folder into a list or dictionary
#import, parse and pre-process those files for different objectives
#basic word frequencies
#save or pickle your results and retrieve them
#compute interrater reliability (the PAP labels example)
#construct a confusion matrix using the PAP labels example
#Many of these scripts came from searching Google for examples!

#------------------------------------------------------
#PART I
#Basics of lists, dictionaries, methods, loops, functions
#Python is case sensitive!!
#Indenting is critical for loops and functions!!

3/2      #integer division in Python 2: returns 1
3/2.0    #float division: returns 1.5

a='head'
print(a)
print(a+'light')
b = a[:-1]    #everything but the last character
print(b)
len(b)
b=a+'light'+'s are on'
print(b)

#Tuples: Note the parentheses.
tupleExample=(12, "hungry", "men")
len(tupleExample)
print(tupleExample[1])   #indexing uses hard brackets; tupleExample(1) is a
#function call and raises an error. Tuples are immutable, so reassigning an
#element (tupleExample[1]='full') won't work.

#-------------------------------------------------------
#Lists: Note the hard brackets, strings in quotes. A list is
#just what it sounds like - a vector of something that can be
#read, manipulated, whatever. They are mutable.
listExample=[12, 'hungry', 'men']
len(listExample)
print(listExample[1:])   #slicing works, and unlike tuples, lists are mutable (see below)

#deleting the first element in a list (to grab just one element, listExample.pop(0))
del listExample[0]
print(listExample)

#applying a method to a list (here insert 'yes' in location 0)
listExample.insert(0,'yes')
print(listExample)

#adding a list to a list
listExample2=[47, listExample]
print(listExample2)

#to inspect the 2nd element of the second list within a list
print(listExample2[1][1])

#use a loop to apply a method to a list 5 times (element is an
#arbitrary variable name, could be i, j, whatever)
myList=[]
for element in range(5):
    myList.append(element)
print(myList)

#-----------------------------------------------------------
#functions are commands that you create in advance and call
#using the name of the function for efficiency
def happyBirthdayEmily():   #defining a function runs nothing by itself
    print("Happy Birthday to you!")
    print("Happy Birthday to you!")
    print("Happy Birthday, dear Emily.")
    print("Happy Birthday to you!")

happyBirthdayEmily()   #running the function

#some other examples where the function asks for inputs (you
#could apply the function to a list of values for example)
def perfect( score ):
    print("I got a perfect " + score)

perfect(score='100')

def myfunction():
    for i in myList:
        if i > 3:
            print(i)

myfunction()

def pinfo( name, age ):
    print("Name: " + name)
    print("Age: " + str(age))

pinfo( age=50, name="miki" )

#-------------------------------------------------------
#Dictionaries: note the curly brackets! Each dictionary entry (case) has
#keys (variable name) and values. They are a way to organize data for easy
#extraction of selected information
dictExample={'name':'john', 'age': 29, 'sex': 'M'}

#dictionaries are unordered, so be sure to check that you are
#getting what you think you are getting
list(dictExample.keys())[0]   #the first key is not necessarily 'name'
len(dictExample)
type(dictExample)
print(dictExample['age'])

#Lots of dictionary methods to apply (some examples)
list(dictExample.items())
list(dictExample.keys())
list(dictExample.values())
listExample3=[dictExample, listExample]   #dictionaries can be added to a list
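#An added quick illustration (not in the original notes): you can loop over
#a dictionary's key/value pairs with .items(), and .get() looks up a key
#without raising an error when it is missing.
for key, value in dictExample.items():
    print(key + ': ' + str(value))
print(dictExample.get('height', 'no such key'))   #returns the default instead of an error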
#Creating a dictionary with keys and values for four different people.
namelist=('jane', 'jon', 'joe', 'george')
agelist=(12,12,5,4)
myDict3={'name':[], 'age':[]}
for i in namelist:   #i refers to an element of namelist
    myDict3['name'].append(i)
for j in agelist:    #j refers to an element of agelist
    myDict3['age'].append(j)
print(myDict3['name'][2])
print(myDict3['age'][2])
print(myDict3)

#OK! Can you create your own function that loops through some data applying a method?

#---------------------------------------------------------
#Regular Expressions are a sequence of characters that define a
#search pattern. For example, '\,' says look for a comma.
#split 'happy, go lucky' wherever there is a comma, or wherever
#there is a space
import re   #re is part of the standard library (no need to import it through bs4)
re.split('\,', 'happy, go lucky')
re.split('\s', 'happy, go lucky')   #split wherever there is a space \s
#play with regular expressions here http://www.regexr.com/

#-----------------------------------------------------------
#PART II
#Scraping data (using API, website or desktop)
#Parsing documents
#Tokenizing and cleaning a document

#Always start by checking your current working directory.
#Getting it right can be a challenge
import os
print(os.getcwd())
%cd C:\Users\John.DESKTOP-ES1A8AS\Desktop\Wcopy
print(os.getcwd())   #be sure that it worked

#----------------------------------------
#APIs for structured data
#WeatherUnderground API (https://www.wunderground.com/weather/api/d/docs)
#note that f below includes my registered key (5de2d6847556443d).
#WU limits free calls, so it might not work if too many use it in one day!
import sys
import os
import urllib2
import json
import pprint
f = urllib2.urlopen('http://api.wunderground.com/api/5de2d6847556443d/conditions/q/98101.json')   #Seattle zip code
json_string = f.read()
parsed_json = json.loads(json_string)
pprint.pprint(parsed_json)   #pretty print makes the json more readable -
#easier to figure out the hierarchy
type(parsed_json)   #it's a dictionary
parsed_json
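#Once the json is parsed into a dictionary you can drill down to specific
#values by chaining keys. A hedged sketch: the key names below follow the
#WU conditions documentation ('current_observation', 'temp_f', 'weather'),
#but check the pprint output above to confirm the hierarchy before relying
#on them.
location = parsed_json['current_observation']['display_location']['full']
temp_f = parsed_json['current_observation']['temp_f']
weather = parsed_json['current_observation']['weather']
print("Current conditions in %s: %s, %s F" % (location, weather, temp_f))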
#NYT API: Everything you need to scrape the NYT is here (also requires a key):
#http://dlab.berkeley.edu/blog/scraping-new-york-times-articles-python-tutorial
#Twitter API https://dev.twitter.com/overview/api
#There are also specific python modules for working with Twitter

#-----------------------------------------------------------------------------------------
#Scraping a single webpage
import sys
import os
import urllib
opener = urllib.FancyURLopener({})
tempfile = opener.open("https://www.polisci.washington.edu/people")
polisci=tempfile.read()
print(polisci)
type(polisci)   #it's a long string

#to keep a copy on disk, write the string to a named file
with open('polisci.html', 'w') as out:
    out.write(polisci)

plist=[polisci]   #convert it to a list
print(plist)      #take a look at it now
type(plist)

#create a new list of the words (split at any whitespace, including \n newlines)
wlist=[polisci.split()]
print(wlist)

#--------------------------------------------------------
#This time, strip the html language and make the text more readable
import urllib
from bs4 import BeautifulSoup
import re
import nltk
url = "https://www.polisci.washington.edu/people"
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html)    #process as html
politext = soup.get_text()    #extract the text
print(politext)
type(politext)

#Remove the many blank lines
lines = (line.strip() for line in politext.splitlines())
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
politext = '\n'.join(chunk for chunk in chunks if chunk)
print(politext)

#extract just the emails using a regular expression
poliemails=set()   #create a set object (sets store unique values)
emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", politext, re.I))
poliemails.update(emails)   #add them to the set
print(poliemails)
type(poliemails)
emaillist=list(poliemails)   #Now convert the set to a list of emails
emaillist[0]

#----------------------------------------------------------
#Download a web file and add it to a folder on the desktop
import os
import urllib
print(os.getcwd())
%cd C:\Users\John.DESKTOP-ES1A8AS\Desktop\Wcopy
print(os.getcwd())
filename = os.path.join('C:\Users\John.DESKTOP-ES1A8AS\Desktop\Wcopy', 'syllabus.pdf')   #where it is going
urllib.urlretrieve("http://faculty.washington.edu/jwilker/559/559Syllabus2017.pdf", filename)   #where it is from
#Confirm that it is in your desktop folder?

#-----------------------------------------------------------
#download three webfiles and import them into a list
#(it might be possible to use a regular expression for the doc number to
#get all of the minutes documents - see the sketch below)
fedurllist=['https://www.federalreserve.gov/fomc/minutes/20060131.htm',
            'https://www.federalreserve.gov/fomc/minutes/20060328.htm',
            'https://www.federalreserve.gov/fomc/minutes/20060629.htm']
feddocs=[]
for element in fedurllist:
    t=urllib.urlopen(element).read()
    feddocs.append(t)
print(feddocs[1][0:20])   #first 20 characters of the second transcript in the list
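#A hedged sketch of that regular-expression idea: scrape an index page for
#links whose filenames look like minutes documents (eight digits + .htm) and
#build the url list automatically. The index URL below is an assumption -
#substitute whatever page actually lists the minutes - and adjust the prefix
#if the hrefs are already absolute.
indexurl = 'https://www.federalreserve.gov/fomc/minutes/'   #assumed index page
indexsoup = BeautifulSoup(urllib.urlopen(indexurl).read())
fedurllist2 = []
for link in indexsoup.find_all('a', href=re.compile(r'\d{8}\.htm')):
    fedurllist2.append('https://www.federalreserve.gov' + link['href'])   #make relative links absolute
print(fedurllist2[0:5])   #check the first few before downloading them all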
#------------------------------------------------------------
#DO THIS FIRST: DOWNLOAD THE FOMC FOLDER AND FILES TO YOUR DESKTOP.
#TAKE A LOOK AT ONE OF THE FILES
#http://faculty.washington.edu/jwilker/559/FOMC/
import os
import nltk
import urllib
from bs4 import BeautifulSoup
import re

#change to that folder on your desktop
%cd C:\Users\John.DESKTOP-ES1A8AS\Desktop\FOMC
#you'll need to create your own path
print(os.getcwd())

#Create a list of the files in a folder
#below is a 'list comprehension' version of a loop that says add the name
#of any file that ends with .htm*
filelist = [filename for filename in os.listdir(os.getcwd()) if re.search(r'(.*\.htm*$)', filename) != None]
print(len(filelist))   #should equal the number of html files in FOMC
print(filelist[0])

#loop through the files, pre-process and add to 'fomclist'
fomclist = []
for element in filelist:
    input = open(element, 'r')
    y = BeautifulSoup(input.read())
    y = y.get_text()       #get just the text (not html encoding)
    fomclist.append(y)     #append the text to fomclist
len(fomclist)              #should equal len(filelist)
print(fomclist[0][0:100])  #first 100 characters of the first meeting in the list

#----------------------------------------------------------
#Tokenize and parse each of the files in fomclist by sentence
nltk.download('punkt')   #module that parses sentences
sent_tokenize = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
fomc_tok =[]
for element in fomclist:
    fomc_tok.append(sent_tokenize.tokenize(element))
print(fomc_tok[0])

#what else are we doing in this block of code?!
meeting_sents=[]
for meeting in fomc_tok:
    for sentence in meeting:
        cleanwords=str(sentence.encode('ascii',errors='ignore'))
        meeting_sents.append(cleanwords.lower())
print(meeting_sents[0])

#-------------------------------------------------------------
#Pop Quiz! What if you wanted to parse the meeting transcripts by speaker,
#so that each element in the list was a speaker's statement?
#Think about it and see the answer a little further below

#-------------------------------------------------------------
#remove stopwords and short words.
#this can take a while so we limited the number of meetings [0:10]
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#Right now, each element in the list is a single long string.
#Strings are immutable so we need to convert them to lists of words
#(and then back again later).
meeting_sents2 = [word_tokenize(i) for i in meeting_sents]
print(meeting_sents2[0])

#now that each word is separated we can manipulate them: keep only words
#that are not stopwords AND are longer than 3 characters
#(in Python 3 you would wrap filter(...) in list(...))
meeting_sents3=[]
for sentence in meeting_sents2[0:10]:
    keepwords = filter(lambda word: word not in stopwords.words('english') and len(word) > 3, sentence)
    meeting_sents3.append(keepwords)
print(meeting_sents2[0])
print(meeting_sents3[0])

#if you wanted to include additional stopwords
meeting_sents4=[]
mystop=['meeting','federal']
for sentence in meeting_sents3[0:10]:
    removewords = filter(lambda word: word not in mystop, sentence)
    meeting_sents4.append(removewords)
print(meeting_sents4[1])

#lambda above is an anonymous function. The line says filter (keep) only
#the words for which the lambda function returns True.
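#An added illustration: the same filtering can be written as a list
#comprehension, which many people find easier to read than filter+lambda.
#This is equivalent to the stopword loop above.
meeting_sents3b = [[word for word in sentence
                    if word not in stopwords.words('english') and len(word) > 3]
                   for sentence in meeting_sents2[0:10]]
print(meeting_sents3b[0])   #should match meeting_sents3[0]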
#-------------------------------------------------------------
#stem words
from nltk import PorterStemmer
ps = PorterStemmer()
print(meeting_sents4[0])   #take a look
meeting_sents5=[]
for sentence in meeting_sents4:
    newsent = []
    for word in sentence:
        mystem = ps.stem(word)
        newsent.append(mystem)
    meeting_sents5.append(newsent)
print(meeting_sents5[1])

#now rejoin the words in each sentence in the list
meeting_sents6=[]
for sentence in meeting_sents5:
    myjoin=" ".join(sentence)
    meeting_sents6.append(myjoin)
print(meeting_sents6[1])

#-----------------------------------------------------------
#What if we wanted to create one single list of words from all of the meetings?
allMeetings= ' '.join(fomclist)   #remember fomclist?
allWords = nltk.word_tokenize(allMeetings)

#pre-process them again (this will take a while)
allWords = [word.lower() for word in allWords]
allWords2 = filter(lambda word: word not in stopwords.words('english') and len(word) >3, allWords)
allWords3 = [ps.stem(word) for word in allWords2]
sorted(set(allWords3))   #a sorted vocabulary list

#Can use re's to remove additional junk in a file, such as \n
#(dirtyfile below stands in for whatever list of strings you are cleaning;
#'2014' is likely the remnant of \u2014, a long dash, after the \u is stripped)
cleanfile=[]
for word in dirtyfile:
    fixed=re.sub(r'\\n', '', str(word))
    fixed=re.sub(r'\\\\u', '', fixed)
    fixed=re.sub(r'\[', '', fixed)
    fixed=re.sub(r'-+', '', fixed)
    fixed=re.sub(r'2014', '*', fixed)
    cleanfile.append(fixed)

#-----------------------------------------------------
#if we decided that this was a list we wanted to keep, we would pickle it.
#Otherwise we'll have to run all of the code above again
import pickle
pickle.dump(allWords3, open("allWords3.p", "wb"))

#To load the pickled file at some future time....
pkl_file = open('allWords3.p', 'rb')
allWords4 = pickle.load(pkl_file)

#------------------------------------------------------
#PART III basic word frequencies
#These are just a few of the options

#word counts (Counter tallies each word in a slice; use .most_common for
#the top words overall)
from collections import Counter
Counter(allWords4[1:100])             #counts within the first 99 words
Counter(allWords4).most_common(100)   #the 100 most common words overall

#How many instances of a specific word?
allWords4.count("board")

#Frequency distribution of the first 1000 words
allWords5=allWords4[1:1000]
fd = nltk.FreqDist(allWords5)
type(fd)
fd.plot(50, cumulative=True)

#-------------------------------------------------------------
#OK back to the parse by speaker statement. You would need a way to identify
#many different speakers' names. Once you were able to do that, you could
#break or split the file at each speaker's name.
#The way to identify a speaker name is to write a regular expression. In the
#FOMC transcripts, the speakers' names are uniquely in all capital letters (yea!)
re.split(r'CHAIRMAN BERNANKE\.|MR\. [A-Z]+\.|MRS\. [A-Z]+\.|MS\. [A-Z]+\.', fomclist[0])
#This says split when it finds CHAIRMAN BERNANKE or MR. ALLCAPS or
#MRS. ALLCAPS etc. Regular Expressions are the bomb.
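#An added toy example of that split (the one-line 'snippet' string is made
#up for illustration). Wrapping the whole pattern in parentheses makes it a
#capture group, so re.split keeps the speaker names in the output instead
#of discarding them:
snippet = 'CHAIRMAN BERNANKE. Good morning. MR. KOHN. Thank you, Mr. Chairman.'
print(re.split(r'(CHAIRMAN BERNANKE\.|MR\. [A-Z]+\.|MRS\. [A-Z]+\.|MS\. [A-Z]+\.)', snippet))
#['', 'CHAIRMAN BERNANKE.', ' Good morning. ', 'MR. KOHN.', ' Thank you, Mr. Chairman.']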
#-------------------------------------------------------------
#PART IV Reliability
#Interrater agreement
import os
import nltk
import urllib
from bs4 import BeautifulSoup
import re
import pandas as pd
import io

#Start by downloading the POLS559 Team Folder to your desktop
#Take a look at how your file is now structured.
print(os.getcwd())
%cd C:\Users\John.DESKTOP-ES1A8AS\Dropbox\POLS559 Team Folder
print(os.getcwd())

#this is a list comprehension to create a list of the csv filenames in your directory
filelist = [filename for filename in os.listdir(os.getcwd()) if re.search(r'(.*\.csv*$)', filename) != None]
print(len(filelist))
print(filelist[0])

#read all of the csvs into a single list of dictionaries.
#I had to reformat the csv files so that they were identical
import csv
papcodes=[]
for element in filelist:
    input_file = csv.DictReader(open(element))
    for r in input_file:
        papcodes.append(r)
print(papcodes[0])   #you'll see the keys and values for the first case/person

#retrieve only the values for each element
papcodes2=[]
for i in papcodes:
    j=i.values()
    papcodes2.append(j)
len(papcodes2)
print(papcodes2[-1])   #the keys should be gone

#the AnnotationTask module we will be using requires that the
#order of each element in the list be coder, id, label so we
#have to reorder the list
papcodes3=[]
myorder=[1,3,0,2]   #this is the new order
for sublist in papcodes2:
    sublist_storer = []
    for element in myorder:
        sublist_storer.append(sublist[element])
    papcodes3.append(sublist_storer)
len(papcodes3)
print(papcodes3[5])

#now we need to remove the last 'minor' label
papcodes4=papcodes3
rmovIndxNo = 3
for i in papcodes4:
    del i[rmovIndxNo]
print(papcodes4[3])

#Should be ready to examine interrater reliability. Consult the
#AnnotationTask documentation for the different options
from nltk.metrics.agreement import AnnotationTask
task = AnnotationTask(data=papcodes4)
task.avg_Ao()   #average observed agreement
task.alpha()    #Krippendorff's alpha

#--------------------------------------------------------------
#Confusion matrix compares TRUE labels with PREDICTed labels.
#TRUE is the preexisting label of an expert annotator ("the gold standard")
#PREDICT includes just one response per bill
import os
import csv
import pandas as pd

#confusion.csv has two columns, a TRUE label and a PREDICT label for each bill
filename = 'http://faculty.washington.edu/jwilker/559/confusion.csv'
f = pd.read_csv(filename)
confusiontrue = f.TRUE
confusionpredict = f.PREDICT

#let's check
len(confusiontrue)
len(confusionpredict)
print(confusiontrue[0])
print(confusionpredict[0])

#OK, let's do it
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import itertools

#Need to run the def plot_confusion_matrix function (look for
#'Erratum' below) before you do anything else

#need to provide labels
varnames=(1,2,3,4,5,6,7,8,10,12,13,14,15,16,17,18,19,20,21)

#this creates the basic matrix
cnf_matrix = confusion_matrix(confusiontrue, confusionpredict)
np.set_printoptions(precision=2)

#this creates the pretty plot
rcParams['figure.figsize'] = 18,10   #make it bigger

#Version 1: number of cases
plot_confusion_matrix(cnf_matrix, classes=varnames, title='Confusion matrix, without normalization')

#Version 2: percent of cases
plot_confusion_matrix(cnf_matrix, classes=varnames, normalize=True, title='Normalized confusion matrix')
#couple of weird things (NaN and long decimals) but enough for now!

#Recall: % of the true cases that are correctly predicted
#Precision: % of predicted cases that are true cases.
#sklearn can calculate both for you - a short sketch follows
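#An added sketch (not in the original notes): classification_report prints
#precision, recall and F1 for every label from the same two columns used above.
from sklearn.metrics import classification_report
print(classification_report(confusiontrue, confusionpredict))
#precision_score and recall_score give the individual numbers
#(pass average=None to get one score per label)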
""" plt.imshow(cm, interpolation='nearest', cmap=cmap) plt.title(title) plt.colorbar() tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45) plt.yticks(tick_marks, classes) if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] print("Normalized confusion matrix") else: print('Confusion matrix, without normalization') print(cm) thresh = cm.max() / 2. for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label') plt.xlabel('Predicted label') -------------------------------------------------------------