import re

import nltk
from nltk.corpus import PlaintextCorpusReader

# Build a reader over the plain-text files in the 'JayZ' directory.
corpus_root = 'JayZ'
wordlist = PlaintextCorpusReader(corpus_root, '.*')

# This is how you access the Gutenberg corpus.
kjv = nltk.corpus.gutenberg.words('bible-kjv.txt')


def create_corpus(wordlist, some_corpus):
    """
    Takes a PlaintextCorpusReader and a list, reads every file in the
    reader, splits the raw text into words, and extends the list with them.
    Returns the extended list, which can be used as an NLTK-style corpus.
    Requires the regular expression package re.
    """
    # Process the files and print each fileid so I know what was read in.
    for fileid in wordlist.fileids():
        raw = wordlist.raw(fileid)
        raw = re.split(r'\W+', raw)  # split the raw text into appropriate words
        some_corpus.extend(raw)
        print(fileid)
    return some_corpus


def lexical_diversity(my_text_data):
    """The function for calculating lexical diversity."""
    word_count = len(my_text_data)
    vocab_size = len(set(my_text_data))
    diversity_score = word_count / vocab_size  # average number of uses per distinct word
    return diversity_score


def NumberOfUniqueWords(SomeCorpus):
    """Exercise: you may want to use the Python function set()."""
    return "%SomeNumber%"
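
# A minimal usage sketch of the functions above. This assumes the 'JayZ'
# directory exists next to this script and contains plain-text lyric files;
# the variable name jayz_words is introduced here for illustration only.
if __name__ == '__main__':
    jayz_words = create_corpus(wordlist, [])
    print(lexical_diversity(jayz_words))
    print(lexical_diversity(kjv))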