#!/usr/bin/env python
##############################################################################
#
# FILE: lit.py
# AUTHOR: Paul Gorman (paul@paulgorman.org)
# CREATED: 6 January 2010
#
# This Python script generates a report with statistics and visualizations
# about novels found on Project Gutenberg. The script acts like a filter; it
# reads the novel text file from STDIN, and sends an HTML formatted report to
# STDOUT. You could use it like this:
#
#   $ cat gutenberg_novel.txt | python lit.py > novel_report.html
#
# Find more info at http://paulgorman.org/software/litvis/
#
##############################################################################

import locale
import os
import re
import sys

# Adopt the user's locale so numbers in the report get thousands separators.
# Passing '' makes the C library read LC_ALL / LC_NUMERIC / LANG itself, so an
# unset LANG no longer raises KeyError.  If the requested locale is not
# installed, keep the default "C" locale rather than aborting the report.
try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error:
    sys.stderr.write(
        'warning: unsupported locale; numbers will not be grouped\n')

# Values filled in from the Project Gutenberg preamble.
title = ''
author = ''

# Less common word -> number of occurrences (common words are excluded).
distinct_words = {}
total_word_count = 0
common_word_count = 0  # Sum of all the's, be's, to's, etc.
less_common_word_count = 0  # Sum of all words excluding the's, be's, etc.
total_paragraph_count = 0
total_chapters = 0
avg_words_per_sentence = 0
avg_sentences_per_paragraph = 0
avg_paragraphs_per_chapter = 0

# Words too frequent to be interesting; matched against lower-cased words.
common_words = set([
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it',
    'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but',
    'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an',
    'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so',
    'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when',
    'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take',
    'was', 'is', 'had', 'were', 'some', 'are', 'upon', 'into', 'them',
    'been', 'these', 'having', 'said', 'its', 'then', 'such', 'than', 'did',
    'could', 'over', 'down', 'should', 'our', 'being', 'those', 'any', 'has',
    'very', 'your', 'am', 'now', 'more', 'must', 'only', 'never', 'mr',
    'mrs', 'miss', 'little', 'still', 'before', 'think', 'though', 'seemed',
    'see', 'man', 'us', 'looked', 'himself', 'herself', 'own', 'much',
    'went', 'thought', 'came', 'yet', 'come', 'other', 'saw', 'again',
    'made', 'way', 'after', 'myself', 'might', 'first', 'put', 'last',
    'away', 'many', 'two', 'too', 'took', 'back', 'well', 'ever', 'even',
    'here', 'may', 'every', 'day', 'got', 'quite', 'how', 'once', 'through',
    'seen', 'dont', 'found', 'told', 'off', 'tell', 'without', 'left',
    'among', 'both', 'asked', 'enough', 'knew', 'while', 'always', 'rather',
    'give', 'done', 'sir', 'soon', 'most', 'where', 'same', 'look', 'heard',
    'under', 'thing', 'things', 'something', 'nothing', 'everything',
])

# Marks end of Project Gutenberg preamble or start of postamble:
gutenberg_pattern = re.compile(
    r'^\*{3}\s*(?:START|END) OF TH(?:E|IS) PROJECT GUTENBERG .*$')
# True while reading Project Gutenberg pre- or post-amble text:
gutenberg_cruft = True
# Matches title in Gutenberg preamble section:
title_pattern = re.compile(r'^\s*Title:\s+(.*)$')
# Matches author in Gutenberg preamble section:
author_pattern = re.compile(r'^\s*Author:\s+(.*)$')
# Removes every character that is not an ASCII letter or a space, so
# punctuation vanishes before a line is split into words ("don't" -> "dont").
nonalpha_pattern = re.compile(r'[^a-z ]', re.IGNORECASE)
# A chapter heading: "CHAPTER" followed by Roman (i, v, x, l) or Arabic digits.
chapter_pattern = re.compile(r'^\s*CHAPTER\s+[ivxl1-9]+.*$', re.IGNORECASE)

# Read the book from STDIN one line at a time.  The first Gutenberg marker
# ends the preamble, the second one starts the postamble and stops the scan.
for raw_line in sys.stdin:
    line = raw_line.rstrip('\n')

    if gutenberg_pattern.search(line):
        if not gutenberg_cruft:
            # End of original book, start of Gutenberg postamble
            break
        # Start of original book, after Gutenberg preamble
        gutenberg_cruft = False
        continue

    if gutenberg_cruft:
        # Still inside the preamble: only the Title/Author lines matter.
        title_match = title_pattern.match(line)
        if title_match:
            title = title_match.group(1).rstrip()
            continue
        author_match = author_pattern.match(line)
        if author_match:
            author = author_match.group(1).rstrip()
        continue

    # Text from original book
    if chapter_pattern.search(line):
        # Chapter headings are echoed to STDERR, keeping STDOUT for the report.
        sys.stderr.write(line + "\n")

    # Lower-case once so that the common-word test and the distinct-word
    # tally agree: "House" and "house" are the same word.
    for word in nonalpha_pattern.sub('', line).lower().split():
        total_word_count += 1
        if word in common_words:
            common_word_count += 1
        else:
            less_common_word_count += 1
            distinct_words[word] = distinct_words.get(word, 0) + 1

# The line breaks inside this string are written as escapes; a raw newline is
# not allowed in a single-quoted literal.
# NOTE(review): the file header promises an HTML report, but no markup is
# visible in this string -- confirm whether tags were lost.
print('\nTitle: ' + title + '\n')
def _grouped(number):
    """Return *number* as an integer string with locale thousands grouping."""
    # format_string() exists in every Python from 2.5 on; locale.format()
    # was removed in Python 3.12.
    return locale.format_string('%d', number, grouping=True)


def _percent_of_total(count):
    """Return *count* as a whole-number percentage of total_word_count."""
    # total_word_count stays 0 when no book text was read (empty input or no
    # Gutenberg start marker); report 0% instead of dividing by zero.
    if not total_word_count:
        return 0
    return int(float(count) / float(total_word_count) * 100)


# The line breaks inside these strings are written as escapes; a raw newline
# is not allowed in a single-quoted literal.
print('Author: ' + author + '\n')
print('Total words: ' + _grouped(total_word_count) + '\n')

# The three remaining lines share one layout: label, grouped count, share.
for _label, _count in (
        ('Common words (the, be, to, etc): ', common_word_count),
        ('Less common words: ', less_common_word_count),
        ('Distinct less common words: ', len(distinct_words)),
):
    print(_label + _grouped(_count)
          + ' (' + str(_percent_of_total(_count)) + '% of total words)\n')
print('Words used only once:')
# Number of less common words whose tally in distinct_words is exactly one.
words_used_once_count = sum(
    1 for occurrences in distinct_words.values() if occurrences == 1
)
print locale.format( "%0i", words_used_once_count, True ) + ' (' \
+ str( int( float( words_used_once_count ) / \
float( total_word_count ) * 100 ) ) + '% of total words)
Most often used words: ' for i in range( 0, 50 ): print most_distinct_words[ i ][ 0 ] + ', ' print '
' print '