forked from sd17fall/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_mining.py
More file actions
77 lines (73 loc) · 2.72 KB
/
text_mining.py
File metadata and controls
77 lines (73 loc) · 2.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
###text_mining.py
import requests
# href prefixes of non-article internal pages (help/file/meta pages); find_link skips these.
bad_links = ["/wiki/Help","/wiki/File","/wiki/Wiki"]
# Pages already visited by crawl(), used as a set (the value is always 1).
# Module-level, so crawl history is shared across calls.
links = {}
def find_start(text, start):
    '''
    Finds the index of the start of the article body in the HTML 'text',
    searching from index 'start': the first "<p>" tag that is not
    immediately followed by an "<a" or "<s" tag (those lead-in paragraphs
    are skipped by recursing past them).

    Raises ValueError when no suitable "<p>" exists at or after 'start'.

    >>> find_start('<p>more</a></p><p>sucess</p>',10)
    15
    >>> find_start('cat<p><a>more</a></p><p>sucess</p>',0)
    21
    '''
    check = text.find("<p>", start)
    # Bug fix: str.find returns -1 when the tag is absent; previously that
    # fell through to the slicing logic and could silently return -1.
    if check == -1 or check + 5 >= len(text):
        raise ValueError("No appropriate start in string.")
    elif text[check+3:check+5] == "<a" or text[check+3:check+5] == "<s":
        # Paragraph opens with a link or styling tag -- skip past it.
        return find_start(text, check + 3)
    else:
        return check
def analyze_page(url):
    """Fetch the page at *url* over HTTP and return its body as text."""
    response = requests.get(url)
    return response.text
def find_link(text, start, bad_links=("/wiki/Help", "/wiki/File", "/wiki/Wiki")):
    '''
    Finds the first internal article link in 'text' after index 'start' and
    returns a tuple (link, index just past the link's "/wiki/" prefix).
    This method is very specific to Wikipedia in its parsing: it assumes
    links look like href="/wiki/...". Links whose first 10 characters match
    an entry of 'bad_links' (help/file/meta pages) are skipped, as are
    external links.

    'bad_links' defaults to the standard Wikipedia non-article prefixes and
    may be overridden per call.

    Raises ValueError when no href= attribute remains after 'start'.

    >>> find_link('href="/wiki/exploding_kittens"',0)
    ('/wiki/exploding_kittens', 16)
    >>> find_link('href="/wiki/nope" href="/wiki/exploding_kittens"',6)
    ('/wiki/exploding_kittens', 34)
    >>> find_link('href="googlenope" href="/wiki/exploding_kittens"',6)
    ('/wiki/exploding_kittens', 34)
    '''
    href_at = text.find('href=', start)
    # Bug fix: without this check a missing href= made link_start = 5
    # (find's -1 plus 6) and the function recursed on garbage.
    if href_at == -1:
        raise ValueError("No href= attribute after index %d." % start)
    link_start = href_at + 6  # skip past 'href="'
    link_end = text.find('"', link_start)
    first_link = text[link_start:link_end]
    a = first_link[:5]
    b = first_link[:10]
    if a != "/wiki" or b in bad_links:  # insures it is a non-file, non-help internal link
        return find_link(text, link_end, bad_links)
    else:
        return first_link, link_start + 10
def crawl(page, depth, width):
    '''
    Accepts a starting 'page' and creates a tree of depth 'depth' following
    the first 'width' Wikipedia article links on each page.
    Returns 'page' itself when the page was already visited (cycle guard,
    via the module-level 'links' dict) or when 'depth' is exhausted;
    otherwise returns [page, [branch, ...]], recursively.

    NOTE: fetches live pages from en.wikipedia.org.

    >>> print(crawl('/wiki/Turkish_language', 1, 1))
    /wiki/Turkish_language
    '''
    if page in links:
        # Already visited during this (or an earlier) crawl -- stop here.
        return page
    links[page] = 1
    if depth <= 1:
        # Maximum depth reached; leaf node.
        return page
    text = analyze_page('https://en.wikipedia.org' + page)
    next_start = 0
    branches = []
    next_link, next_start = find_link(text, find_start(text, next_start))
    # Bug fix: the loop previously ran a hard-coded range(0, 2), ignoring
    # the 'width' parameter the docstring promises to honor.
    for _ in range(width):
        branches.append(crawl(next_link, depth - 1, width))
        next_link, next_start = find_link(text, find_start(text, next_start))
    return [page, branches]
if __name__ == "__main__":
    # Run the demo crawl and doctests only when executed as a script;
    # previously the crawl and print were module-level side effects that
    # hit the network on every import.
    out = crawl('/wiki/Turkish_language', 3, 1)
    print(out)
    import doctest
    doctest.testmod(verbose=True)