neuralnet.py - Opengist

Revision 04d9aebbb64c8d911ed27768d1ec43367358f5d7

neuralnet.py · 2.2 KiB · Python Raw

from __future__ import print_function

# Markov-chain text babbler.
#
# Usage: neuralnet.py CORPUS_FILE [WORD_COUNT]
#
# Reads CORPUS_FILE, splits it into sentences, records which words follow
# which inside each sentence, and prints WORD_COUNT (default 10000) words
# produced by a random walk over that table.

import random
import re
import sys

# http://stackoverflow.com/a/31505798
# Building-block patterns for the sentence splitter.  They are raw strings so
# that regex escapes such as \s reach the ``re`` module untouched.
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = (
    r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s"
    r"|However\s|That\s|This\s|Wherever)"
)
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split *text* into a list of sentences.

    Periods that do not end a sentence (titles, single-letter initials,
    acronyms, company suffixes, web domains and "Ph.D.") are first masked
    with a ``<prd>`` placeholder.  Every remaining ``.``, ``?`` and ``!`` is
    then treated as a sentence boundary.

    Args:
        text: The string to split.  Newlines are treated as spaces.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        Trailing text that lacks terminal punctuation is returned as a final
        sentence rather than being discarded.
    """
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Mask periods that belong to abbreviations rather than sentence ends.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence opener does end a
    # sentence, so mark the boundary before the acronym's dots are masked.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        caps + "[.]" + caps + "[.]" + caps + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Everything still unmasked is a real boundary.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Only drop the final piece when it is empty; otherwise it is real text
    # that simply had no closing punctuation.
    if not sentences[-1]:
        sentences.pop()
    return sentences
# end stackoverflow


def build_chain(sentences):
    """Map every word to the list of words seen directly after it.

    Args:
        sentences: Iterable of sentence strings; each is split on whitespace.

    Returns:
        A dict from word to a list of follower words.  Followers are recorded
        once per occurrence (duplicates are kept), and the last word of a
        sentence gets an entry even when nothing follows it.
    """
    chain = {}
    for sentence in sentences:
        words = sentence.split()
        for i, word in enumerate(words):
            followers = chain.setdefault(word, [])
            if i + 1 < len(words):
                followers.append(words[i + 1])
    return chain


def generate_words(chain, count, rng=random):
    """Yield *count* words from a random walk over *chain*.

    The walk starts at a random word.  After each word the next one is picked
    at random from its followers; a word with no followers restarts the walk
    at a random word.  Nothing is yielded when *chain* is empty.

    Args:
        chain: Dict as produced by ``build_chain``.
        count: Number of words to yield.
        rng: Object providing ``choice(seq)``; defaults to the ``random``
            module.
    """
    if not chain:
        return
    # random.choice needs an indexable sequence, not a dict view.
    vocabulary = list(chain)
    word = rng.choice(vocabulary)
    for _ in range(count):
        yield word
        followers = chain[word]
        word = rng.choice(followers if followers else vocabulary)


def main(argv):
    """Run the generator: ``argv[1]`` is the corpus file, ``argv[2]`` the optional word count."""
    if len(argv) < 2:
        sys.exit("usage: {} CORPUS_FILE [WORD_COUNT]".format(argv[0]))
    count = int(argv[2]) if len(argv) > 2 else 10000
    with open(argv[1]) as f:
        chain = build_chain(split_into_sentences(f.read()))
    for word in generate_words(chain, count):
        print(word, end=" ")
    print("")


if __name__ == "__main__":
    main(sys.argv)

1	from __future__ import print_function
2	import sys,random
3
4	# http://stackoverflow.com/a/31505798
5	import re
# Building-block patterns for the sentence splitter below.  They are raw
# strings so that regex escapes such as \s reach the ``re`` module untouched,
# and alternatives are separated by a bare ``|`` (a backslash-escaped pipe
# would match a literal "|" character instead of acting as alternation).
caps = r"([A-Z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = (
    r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s"
    r"|However\s|That\s|This\s|Wherever)"
)
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split *text* into a list of sentences.

    Periods that do not end a sentence (titles, single-letter initials,
    acronyms, company suffixes, web domains and "Ph.D.") are first masked
    with a ``<prd>`` placeholder.  Every remaining ``.``, ``?`` and ``!`` is
    then treated as a sentence boundary.

    Args:
        text: The string to split.  Newlines are treated as spaces.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        Trailing text that lacks terminal punctuation is returned as a final
        sentence rather than being discarded.
    """
    text = " " + text + " "
    text = text.replace("\n", " ")

    # Mask periods that belong to abbreviations rather than sentence ends.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + caps + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence opener does end a
    # sentence, so mark the boundary before the acronym's dots are masked.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(
        caps + "[.]" + caps + "[.]" + caps + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(caps + "[.]" + caps + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + caps + "[.]", r" \1<prd>", text)

    # Move terminal punctuation outside a closing quote so the quote stays
    # attached to the sentence it closes.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Everything still unmasked is a real boundary.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # Only drop the final piece when it is empty; otherwise it is real text
    # that simply had no closing punctuation.
    if not sentences[-1]:
        sentences.pop()
    return sentences
38	# end stackoverflow
39
def build_chain(sentences):
    """Map every word to the list of words seen directly after it.

    Args:
        sentences: Iterable of sentence strings; each is split on whitespace.

    Returns:
        A dict from word to a list of follower words.  Followers are recorded
        once per occurrence (duplicates are kept), and the last word of a
        sentence gets an entry even when nothing follows it.
    """
    chain = {}
    for sentence in sentences:
        words = sentence.split()
        for i, word in enumerate(words):
            followers = chain.setdefault(word, [])
            if i + 1 < len(words):
                followers.append(words[i + 1])
    return chain


def generate_words(chain, count, rng=random):
    """Yield *count* words from a random walk over *chain*.

    The walk starts at a random word.  After each word the next one is picked
    at random from its followers; a word with no followers restarts the walk
    at a random word.  Nothing is yielded when *chain* is empty.

    Args:
        chain: Dict as produced by ``build_chain``.
        count: Number of words to yield.
        rng: Object providing ``choice(seq)``; defaults to the ``random``
            module.
    """
    if not chain:
        return
    # random.choice needs an indexable sequence, not a dict view.
    vocabulary = list(chain)
    word = rng.choice(vocabulary)
    # range(count) yields exactly `count` words; starting the range at 1
    # would produce one word fewer than requested.
    for _ in range(count):
        yield word
        followers = chain[word]
        word = rng.choice(followers if followers else vocabulary)


def main(argv):
    """Print generated text for the corpus named on the command line.

    Args:
        argv: Argument vector; ``argv[1]`` is the corpus file path and the
            optional ``argv[2]`` is the number of words to print (default
            10000).
    """
    if len(argv) < 2:
        sys.exit("usage: {} CORPUS_FILE [WORD_COUNT]".format(argv[0]))
    count = int(argv[2]) if len(argv) > 2 else 10000
    with open(argv[1]) as f:
        chain = build_chain(split_into_sentences(f.read()))
    for word in generate_words(chain, count):
        print(word, end=" ")
    print("")


if __name__ == "__main__":
    main(sys.argv)