forked from MrBunsy/random_eprints
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandom_text.py
More file actions
93 lines (74 loc) · 3.21 KB
/
random_text.py
File metadata and controls
93 lines (74 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import collections, random, sys, textwrap
import re
import argparse
#source: https://benhoyt.com/writings/markov-chain/
#licence "I hereby give you permission to do whatever you want with it."
class RandomText:
    """Markov-chain random text generator trained on a plain-text file.

    The model is a table mapping each pair of consecutive words
    ``(w1, w2)`` to the list of words seen directly after that pair in
    the source text.  Text is generated by walking this table at random.
    """

    # Every character other than ASCII letters, digits, space and the
    # punctuation - , . ' " is stripped from a word before it is stored.
    _DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9 \-\,\.\'"]')

    def __init__(self, source_file, max_words=200):
        """Build the word table from *source_file*.

        Args:
            source_file: Path to a UTF-8 text file used as training data.
            max_words: Default upper bound on the number of words that
                ``get_words`` appends after the two starting words.
        """
        self.possibles = self.get_possibles(source_file)
        self.max_words = max_words

    def fix_line(self, line):
        """Return *line* prepared for joining with its neighbours.

        A trailing ``"- \\n"`` (hyphen, space, newline) is removed so the
        broken word is glued to the start of the next line.  Lines that
        are shorter than two characters (after that removal) become ``""``.
        """
        if line.endswith("- \n"):
            line = line[:-3]
        if len(line) < 2:
            return ""
        return line

    def get_possibles(self, text_file_path):
        """Read *text_file_path* and return the prefix-pair table.

        Returns:
            A ``collections.defaultdict(list)`` keyed by ``(w1, w2)``
            tuples whose values are the words that followed that pair.
            Purely numeric words and words that are empty after
            character filtering are skipped.
        """
        w1 = w2 = ''
        possibles = collections.defaultdict(list)
        with open(text_file_path, encoding='utf-8') as textfile:
            all_text = "".join(self.fix_line(line) for line in textfile)

        # str.split() with no argument splits on any whitespace (newlines
        # included) and never yields empty or padded tokens.
        for raw_word in all_text.split():
            word = self._DISALLOWED_CHARS.sub('', raw_word)
            if not word or word.isnumeric():
                continue
            possibles[w1, w2].append(word)
            w1, w2 = w2, word

        # Avoid empty possibles lists at end of input: these two entries
        # lead the walk back to the ('', '') key that starts the text.
        possibles[w1, w2].append('')
        possibles[w2, ''].append('')
        return possibles

    def get_words(self, min_words=100, max_words=-1):
        """Generate a random list of words from the table.

        The walk starts at a random prefix pair whose first word begins
        with an upper-case letter; if the table has no such pair, any
        pair is used.  After more than *min_words* words have been
        appended, generation stops at the first word ending in ``"."``.

        Args:
            min_words: Number of appended words after which a word ending
                in a full stop terminates the output.
            max_words: Hard cap on appended words; a negative value means
                "use ``self.max_words``".

        Returns:
            A list of words: the two starting words followed by at most
            *max_words* generated words.  Entries may be ``''`` where the
            walk passes over the end of the source text.
        """
        if max_words < 0:
            max_words = self.max_words

        starts = [key for key in self.possibles if key[0][:1].isupper()]
        if not starts:
            # No capitalised word in the source: start anywhere rather
            # than failing inside random.choice on an empty list.
            starts = list(self.possibles)
        w1, w2 = random.choice(starts)

        output = [w1, w2]
        for i in range(max_words):
            word = random.choice(self.possibles[w1, w2])
            output.append(word)
            w1, w2 = w2, word
            # Try to end at a full stop once enough words were produced.
            if i > min_words and word.endswith("."):
                break
        return output
def parse_args_random_text():
    """Parse the command-line options of the random text generator.

    Returns:
        An ``argparse.Namespace`` with ``textfile`` (str, default
        ``'book.txt'``), ``wordcount`` (int, default 50) and ``wordwrap``
        (bool, default False).
    """
    parser = argparse.ArgumentParser(description="Generate random text with markov chains")
    parser.add_argument('-t', '--textfile', type=str, help="path to text file for data", default='book.txt')
    # type=int makes the value an int whether it comes from the command
    # line or the default, and rejects non-numeric input with a usage error.
    parser.add_argument('-c', '--wordcount', type=int, help="Minimum words to generate", default=50)
    parser.add_argument('-w', '--wordwrap', help="Wordwrap output", action='store_true')
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: train on the requested text file, generate
    # between wordcount and twice wordcount words, then print them as a
    # single space-joined string (optionally wrapped by textwrap.fill).
    cli_args = parse_args_random_text()
    generator = RandomText(cli_args.textfile)
    minimum = int(cli_args.wordcount)
    words = generator.get_words(minimum, 2 * minimum)
    text = ' '.join(words)
    print(textwrap.fill(text) if cli_args.wordwrap else text)