Mercurial > hg > TextShaper
comparison textshaper/split.py @ 55:4e2190495d50
this basically works
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sun, 17 May 2015 17:14:47 -0700 |
| parents | 1d755747e67a |
| children | |
comparison
equal
deleted
inserted
replaced
| 54:1d755747e67a | 55:4e2190495d50 |
|---|---|
| 69 return paragraphs | 69 return paragraphs |
| 70 | 70 |
| 71 def words(text): | 71 def words(text): |
| 72 """return the alphanumeric words in a sentence""" | 72 """return the alphanumeric words in a sentence""" |
| 73 words = text.strip().split() | 73 words = text.strip().split() |
| 74 return [word for word in words] | 74 return [word for word in words |
| | 75 if set(word).intersection(string.letters)] |
| 75 | 76 |
| 76 def main(args=sys.argv[1:]): | 77 def main(args=sys.argv[1:]): |
| 77 """CLI""" | 78 """CLI""" |
| 78 | 79 |
| 79 # parse command line arguments | 80 # parse command line arguments |
| 80 parser = argparse.ArgumentParser(description=__doc__) | 81 parser = argparse.ArgumentParser(description=__doc__) |
| 81 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) | 82 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) |
| 82 parser.add_argument('-n', '--number', dest='number', | 83 parser.add_argument('-n', '--number', dest='number', |
| 83 action='store_true', default=False, | 84 action='store_true', default=False, |
| 84 help="number the sentences (CSV)") | 85 help="number the sentences (CSV)") |
| | 86 parser.add_argument('-c', '--count', dest='count', |
| | 87 action='store_true', default=False, |
| | 88 help="count the words in each sentence (CSV)") |
| 85 parser.add_argument('-o', '--output', dest='output', | 89 parser.add_argument('-o', '--output', dest='output', |
| 86 type=argparse.FileType('w'), default=sys.stdout, | 90 type=argparse.FileType('w'), default=sys.stdout, |
| 87 help="file to output to, or stdout by default") | 91 help="file to output to, or stdout by default") |
| 88 options = parser.parse_args(args) | 92 options = parser.parse_args(args) |
| 89 | 93 |
| 96 ends = '.?!' | 100 ends = '.?!' |
| 97 sentences = split_sentences(text, ends) | 101 sentences = split_sentences(text, ends) |
| 98 | 102 |
| 99 # display | 103 # display |
| 100 if options.number: | 104 if options.number: |
| | 105 if options.count: |
| | 106 raise NotImplementedError('TODO') # -> record TODO items |
| 101 writer = csv.writer(options.output) | 107 writer = csv.writer(options.output) |
| | 108 for index, sentence in enumerate(sentences, 1): |
| | 109 writer.writerow([index, sentence]) |
| | 110 elif options.count: |
| | 111 writer = csv.writer(options.output) |
| | 112 for sentence in sentences: |
| | 113 n_words = len(words(sentence)) |
| | 114 writer.writerow([n_words, sentence]) |
| 102 else: | 115 else: |
| 103 for sentence in sentences: | 116 for sentence in sentences: |
| 104 options.output.write(sentence + '\n') | 117 options.output.write(sentence + '\n') |
| 105 | 118 |
| 106 if __name__ == '__main__': | 119 if __name__ == '__main__': |
