#!/usr/bin/env python2.7
# Michael Shavlovsky, 2013
"""Count occurrences of words read from standard input.

The output is two tab-separated columns: each word and its count.

The order of the output is controlled by the arguments --ascend, --descend
and --alpha:

- With --alpha the output is ordered alphabetically by word (ascending,
  or descending when --descend is also given).
- Without --alpha, --ascend / --descend order the output by count; words
  with equal counts are listed alphabetically.
- With no ordering argument no sorting is applied.
"""

import sys
import argparse
import collections
import re

# A word is a maximal run of ASCII letters; every other character is a
# separator.  The explicit range keeps the match ASCII-only on Python 3 too.
_WORD_RE = re.compile(r'[A-Za-z]+')


def main(args):
    """Count the words on standard input and print them to standard output.

    Args:
        args: list of command line arguments (without the program name).
            Recognised flags are --ascend, --descend and --alpha.

    Returns:
        0 on success, 1 when both --ascend and --descend are given (an
        error message is written to standard error in that case).
    """
    msg = """Counts word frequences appeared in the standard input.
    The output is two columns of words and word-counts separated by tab.
    """
    parser = argparse.ArgumentParser(description=msg)
    parser.add_argument(
        '--ascend', dest='ascend', action='store_true',
        help='order the output columns in ascending order based on word-counts')
    parser.add_argument(
        '--descend', dest='descend', action='store_true',
        help='order the output columns in descending order based on word-counts')
    parser.add_argument(
        '--alpha', dest='alpha', action='store_true',
        help='order the output columns in alphabetic order')

    options = parser.parse_args(args)
    # --ascend and --descend contradict each other; refuse to guess.
    if options.ascend and options.descend:
        sys.stderr.write("Error, both --ascend and --descend options are specified\n")
        sys.stderr.write("Please pick one.\n")
        # Non-zero so that callers and shell pipelines can detect the failure.
        return 1

    counts = collections.Counter(read_word(sys.stdin))
    print_output(sys.stdout, counts, options)
    return 0


def read_word(infile_obj):
    """Yield the words found in infile_obj, one at a time.

    A word is a contiguous sequence of characters from the alphabet
    a-z and A-Z; all other characters are separators.  The input is
    assumed to consist of ASCII characters only.

    Args:
        infile_obj: an iterable of text lines, e.g. an open file object.

    Yields:
        Each word in order of appearance, including a word that ends a
        line which has no trailing separator (such as a final line
        without a newline).
    """
    # Iterating the object directly streams the input line by line instead
    # of loading all of it into memory first.
    for line in infile_obj:
        for match in _WORD_RE.finditer(line):
            yield match.group()


def print_output(outfile_obj, counts, options):
    """Write two tab-separated columns, word and count, to outfile_obj.

    Args:
        outfile_obj: object with a write() method that receives the output.
        counts: dictionary with words as keys and integer counts as values.
        options: object with the boolean formatting attributes
            - options.ascend: ascending order,
            - options.descend: descending order,
            - options.alpha: order alphabetically by word instead of by
              count.

    If both options.ascend and options.descend are set, ascending wins.
    If none of the three attributes is set the rows are written in the
    iteration order of counts.
    """
    counts_tuples = list(counts.items())

    if options.alpha:
        # Alphabetical order; ascending unless only --descend was requested.
        descending = bool(options.descend) and not options.ascend
        counts_tuples.sort(key=lambda item: item[0], reverse=descending)
    elif options.ascend:
        # By count, smallest first; equal counts are ordered alphabetically.
        counts_tuples.sort(key=lambda item: (item[1], item[0]))
    elif options.descend:
        # By count, largest first; equal counts still ascend alphabetically,
        # which is why the count is negated rather than using reverse=True.
        counts_tuples.sort(key=lambda item: (-item[1], item[0]))

    for word, count in counts_tuples:
        outfile_obj.write("%s\t%s\n" % (word, count))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))