Mercurial > hg > TextShaper
comparison textshaper/split.py @ 46:7e63ca061b6c
start findall function
| author | Jeff Hammel <k0scist@gmail.com> |
|---|---|
| date | Sat, 16 May 2015 18:53:53 -0700 |
| parents | ccbdc00d4f0a |
| children | 03ce88daa98d |
comparison
equal
deleted
inserted
replaced
| 45:ccbdc00d4f0a | 46:7e63ca061b6c |
|---|---|
| 4 split paragraphs, sentences, etc | 4 split paragraphs, sentences, etc |
| 5 """ | 5 """ |
| 6 | 6 |
| 7 # imports | 7 # imports |
| 8 import argparse | 8 import argparse |
| 9 import re | |
| 10 import string | |
| 9 import sys | 11 import sys |
| 12 | |
| 13 | |
| 14 def findall(sub, _string): | |
| 15 """find all occurrences of `sub` in _string""" | |
| 16 | |
| 17 retval = [] | |
| 18 index = 0 | |
| 19 while True: | |
| 20 try: | |
| 21 index = _string.index(sub, index) | |
| 22 retval.append(index) | |
| 23 index += 1 | |
| 24 except ValueError: | |
| 25 return retval | |
| 26 | |
| 10 | 27 |
| 11 def split_paragraphs(text): | 28 def split_paragraphs(text): |
| 12 | 29 |
| 13 lines = [line.strip() for line in text.strip().splitlines()] | 30 lines = [line.strip() for line in text.strip().splitlines()] |
| 14 lines = [line if line else '\n' | 31 lines = [line if line else '\n' |
| 23 # parse command line arguments | 40 # parse command line arguments |
| 24 parser = argparse.ArgumentParser(description=__doc__) | 41 parser = argparse.ArgumentParser(description=__doc__) |
| 25 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) | 42 parser.add_argument('file', nargs='?', type=argparse.FileType('r'), default=sys.stdin) |
| 26 options = parser.parse_args(args) | 43 options = parser.parse_args(args) |
| 27 | 44 |
| 45 # preprocess text | |
| 28 text = options.file.read().strip() | 46 text = options.file.read().strip() |
| 29 text = ' '.join(text.split()) | 47 text = ' '.join(text.split()) |
| 30 # paragraphs = split_paragraphs(text) | 48 # paragraphs = split_paragraphs(text) |
| 31 | 49 |
| 32 punctuation = ('.',) | 50 ends = '.?!' |
| 33 | 51 |
| 52 for end in ends: | |
| 34 # for paragraph in paragraphs: | 53 # for paragraph in paragraphs: |
| 35 # print (paragraph) | 54 # print (paragraph) |
| 36 | 55 |
| 37 if __name__ == '__main__': | 56 if __name__ == '__main__': |
| 38 main() | 57 main() |
