#!/usr/bin/env python

##############################################################################
#
#  FILE: lit.py
#  AUTHOR: Paul Gorman (paul@paulgorman.org)
#  CREATED: 6 January 2010
#
#  This Python script generates a report with statistics and visualizations 
#  about novels found on Project Gutenberg. The script acts like a filter; it 
#  reads the novel text file from STDIN, and sends an HTML formatted report to 
#  STDOUT. You could use it like this:
#
#      $ cat gutenberg_novel.txt | python lit.py > novel_report.html
# 
#  Find more info at http://paulgorman.org/software/litvis/
#
##############################################################################

import locale
import os
import re
import sys

# Adopt the user's default locale so that numbers in the report are printed
# with thousands separators. Passing '' lets the locale machinery consult the
# environment (LC_ALL, LC_NUMERIC, LANG, ...) itself, so the script no longer
# crashes with a KeyError when LANG happens to be unset.
try:
    locale.setlocale( locale.LC_ALL, '' )
except locale.Error:
    # The environment names a locale this system does not provide; keep the
    # default "C" locale (numbers are then printed without separators).
    pass

# Book metadata harvested from the Project Gutenberg preamble.
title = ''
author = ''

# Maps each less-common word to the number of times it occurs.
distinct_words = {} # Unique words excluding common the's, be's, to's, etc.
total_word_count = 0
common_word_count = 0 # Sum of all the's, be's, to's, etc.
less_common_word_count = 0 # Sum of all words excluding the's, be's, to's, etc.

# NOTE(review): the counters below are initialised here but are not updated
# anywhere in the visible part of the script -- presumably placeholders for
# statistics that are not implemented yet.
total_paragraph_count = 0
total_chapters = 0
avg_words_per_sentence = 0
avg_sentences_per_paragraph = 0
avg_paragraphs_per_chapter = 0

# Very frequent English words (all lower case, apostrophes removed) that are
# counted as "common" and kept out of the distinct-word statistics.
common_words = set( """
    the be to of and a in that have
    i it for not on with he as you do at
    this but his by from they we say her she
    or an will my one all would there their
    what so up out if about who get which go
    me when make can like time no just him know
    take was is had were some are upon into them
    been these having said its then such than did
    could over down should our being those any has
    very your am now more must me only never mr
    mrs miss little still before think though seemed
    see man us looked himself herself own much went
    thought came yet come other saw again made way
    after myself might first put last away many two
    too took back well ever even here may every
    day got quite how once through seen dont found
    told off tell told without left among both asked
    enough knew while always rather give done sir
    soon most where same look heard under thing
    things something nothing everything
    """.split() )

# State flag: True while the current line belongs to the Project Gutenberg
# preamble or postamble rather than to the book itself.
gutenberg_cruft = True

# "*** START OF ..." / "*** END OF ..." marker lines that separate the
# Project Gutenberg boilerplate from the text of the book.
gutenberg_pattern = re.compile(
    r'^\*{3}\s*(?:START|END) OF TH(?:E|IS) PROJECT GUTENBERG .*$' )

# "Title: ..." and "Author: ..." lines; group 1 captures the value.
title_pattern = re.compile( r'^\s*Title:\s+(.*)$' )
author_pattern = re.compile( r'^\s*Author:\s+(.*)$' )

# Any character that is neither an ASCII letter nor a space.
nonalpha_pattern = re.compile( r'[^a-z ]', flags=re.I )

# Chapter headings such as "CHAPTER IV" or "Chapter 12" (case-insensitive).
chapter_pattern = re.compile( r'^\s*CHAPTER\s+[ivxl1-9]+.*$', flags=re.I )

# Read the novel from STDIN one line at a time. The stream has three regions,
# tracked by gutenberg_cruft:
#   1. Project Gutenberg preamble  -> harvest the title and author
#   2. the book itself             -> tally words
#   3. Project Gutenberg postamble -> stop reading
for line in sys.stdin:
    line = line.rstrip( "\n" )

    if gutenberg_pattern.search( line ):
        if gutenberg_cruft:
            # First marker: end of the preamble, the book starts next.
            gutenberg_cruft = False
            continue
        # Second marker: end of the book, start of the postamble.
        break

    if gutenberg_cruft:
        # Still inside the preamble: only title/author lines matter.
        title_match = title_pattern.match( line )
        if title_match:
            title = title_match.group( 1 ).rstrip()
            continue
        author_match = author_pattern.match( line )
        if author_match:
            author = author_match.group( 1 ).rstrip()
        continue

    # Text from the original book.
    if chapter_pattern.search( line ):
        # Echo chapter headings to STDERR as a progress/diagnostic trace.
        sys.stderr.write( line + "\n" )

    # Punctuation and digits are deleted (not replaced by a space), so
    # "don't" becomes "dont", matching the spelling used in common_words.
    words = nonalpha_pattern.sub( '', line ).split()
    total_word_count += len( words )
    for word in words:
        # Fold case once so that "Whale" and "whale" are tallied as the same
        # distinct word, consistent with the case-insensitive common-word test.
        key = word.lower()
        if key in common_words:
            common_word_count += 1
        else:
            less_common_word_count += 1
            distinct_words[ key ] = distinct_words.get( key, 0 ) + 1

def _percent_of_total( count ):
    """Return count as a whole-number percentage of total_word_count.

    Returns 0 when no words were read at all (empty input, or no Project
    Gutenberg start marker found) instead of dividing by zero.
    """
    if not total_word_count:
        return 0
    return int( float( count ) / float( total_word_count ) * 100 )


def _grouped( number ):
    """Return number formatted with the locale's thousands separators."""
    return locale.format_string( "%d", number, grouping=True )


def _escape_html( text ):
    """Return text with &, < and > replaced by HTML entities."""
    # '&' must be replaced first so the entities added below stay intact.
    return text.replace( '&', '&amp;' ).replace( '<', '&lt;' ) \
        .replace( '>', '&gt;' )


# Emit the HTML report fragment on STDOUT.
print( '<div>' )
# Title and author come straight from the input file, so escape them.
print( '<p><b>Title:</b> ' + _escape_html( title ) + '<br />' )
print( '<b>Author:</b> ' + _escape_html( author ) + '<br />' )
print( '<b>Total words:</b> ' + _grouped( total_word_count ) + '<br />' )
print( '<b>Common words (the, be, to, etc):</b> '
    + _grouped( common_word_count )
    + ' (' + str( _percent_of_total( common_word_count ) )
    + '% of total words)<br />' )
print( '<b>Less common words:</b> '
    + _grouped( less_common_word_count )
    + ' (' + str( _percent_of_total( less_common_word_count ) )
    + '% of total words)<br />' )
print( '<b>Distinct less common words:</b> '
    + _grouped( len( distinct_words ) )
    + ' (' + str( _percent_of_total( len( distinct_words ) ) )
    + '% of total words)<br />' )

print( '<b>Words used only once:</b>' )
words_used_once_count = sum(
    1 for count in distinct_words.values() if count == 1 )
print( _grouped( words_used_once_count )
    + ' (' + str( _percent_of_total( words_used_once_count ) )
    + '% of total words)</p>' )

# Rank by frequency, highest first; ties are broken by reverse alphabetical
# order of the word itself.
most_distinct_words = sorted( distinct_words.items(),
    key=lambda item: ( item[ 1 ], item[ 0 ] ), reverse=True )
print( '<p><b>Most often used words:</b> ' )
# Slicing copes with books that contain fewer than 50 distinct words.
for word, _count in most_distinct_words[ :50 ]:
    print( word + ', ' )
print( '</p>' )

print( '</div>' )