-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkeywordExtraction3.py
More file actions
90 lines (71 loc) · 4.2 KB
/
keywordExtraction3.py
File metadata and controls
90 lines (71 loc) · 4.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# pip install git+https://github.com/LIAAD/yake
import yake
from DatasetHandler import DataHandler
# Build a single YAKE extractor with the library's default parameters and
# reuse it for every abstract processed below.
kw_extractor = yake.KeywordExtractor()

# NOTE: the script previously called kw_extractor.extract_keywords(text) at
# this point, but `text` is only defined inside the commented-out example
# further down, so execution stopped with a NameError before any data was
# read. That stray call has been removed.

# Load the spreadsheet through the project's DataHandler wrapper.
# NOTE(review): presumably get_dataframe() returns a pandas DataFrame that
# has an 'Abstract' column of strings — confirm against DatasetHandler.
dh = DataHandler("Data.xlsx")
df = dh.get_dataframe()

# extracted_keywords[i] is the extract_keywords() result for the i-th entry
# of df['Abstract'] (per the sample output in the commented example, a list
# of (keyword, score) tuples).
extracted_keywords = []
# The loop variable is `abstract` rather than `abs` so the builtin abs() is
# not shadowed. An explicit loop is kept so `keywords` stays bound at module
# level to the result for the last abstract, as in the original script.
for abstract in df['Abstract']:
    keywords = kw_extractor.extract_keywords(abstract)
    extracted_keywords.append(keywords)
# text = "Sources tell us that Google is acquiring Kaggle, a platform that hosts data science and machine learning "\
# "competitions. Details about the transaction remain somewhat vague, but given that Google is hosting its Cloud "\
# "Next conference in San Francisco this week, the official announcement could come as early as tomorrow. "\
# "Reached by phone, Kaggle co-founder CEO Anthony Goldbloom declined to deny that the acquisition is happening. "\
# "Google itself declined 'to comment on rumors'. Kaggle, which has about half a million data scientists on its platform, "\
# "was founded by Goldbloom and Ben Hamner in 2010. "\
# "The service got an early start and even though it has a few competitors like DrivenData, TopCoder and HackerRank, "\
# "it has managed to stay well ahead of them by focusing on its specific niche. "\
# "The service is basically the de facto home for running data science and machine learning competitions. "\
# "With Kaggle, Google is buying one of the largest and most active communities for data scientists - and with that, "\
# "it will get increased mindshare in this community, too (though it already has plenty of that thanks to Tensorflow "\
# "and other projects). Kaggle has a bit of a history with Google, too, but that's pretty recent. Earlier this month, "\
# "Google and Kaggle teamed up to host a $100,000 machine learning competition around classifying YouTube videos. "\
# "That competition had some deep integrations with the Google Cloud Platform, too. Our understanding is that Google "\
# "will keep the service running - likely under its current name. While the acquisition is probably more about "\
# "Kaggle's community than technology, Kaggle did build some interesting tools for hosting its competition "\
# "and 'kernels', too. On Kaggle, kernels are basically the source code for analyzing data sets and developers can "\
# "share this code on the platform (the company previously called them 'scripts'). "\
# "Like similar competition-centric sites, Kaggle also runs a job board, too. It's unclear what Google will do with "\
# "that part of the service. According to Crunchbase, Kaggle raised $12.5 million (though PitchBook says it's $12.75) "\
# "since its launch in 2010. Investors in Kaggle include Index Ventures, SV Angel, Max Levchin, Naval Ravikant, "\
# "Google chief economist Hal Varian, Khosla Ventures and Yuri Milner "
# # With default parameters
# kw_extractor = yake.KeywordExtractor()
# keywords = kw_extractor.extract_keywords(text)
# for kw in keywords:
# print(kw)
# # specifying parameters
# language = "en"
# max_ngram_size = 3
# deduplication_threshold = 0.9
# deduplication_algo = 'seqm'
# windowSize = 1
# numOfKeywords = 20
# custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, dedupFunc=deduplication_algo, windowsSize=windowSize, top=numOfKeywords, features=None)
# keywords = custom_kw_extractor.extract_keywords(text)
# for kw in keywords:
# print(kw)
# """ Output
# ('google', 0.026580863364597897)
# ('kaggle', 0.0289005976239829)
# ('ceo anthony goldbloom', 0.029946071606210194)
# ('san francisco', 0.048810837074825336)
# ('anthony goldbloom declined', 0.06176910090701819)
# ('google cloud platform', 0.06261974476422487)
# ('co-founder ceo anthony', 0.07357749587020043)
# ('acquiring kaggle', 0.08723571551039863)
# ('ceo anthony', 0.08915156857226395)
# ('anthony goldbloom', 0.09123482372372106)
# ('machine learning', 0.09147989238151344)
# ('kaggle co-founder ceo', 0.093805063905847)
# ('data', 0.097574333771058)
# ('google cloud', 0.10260128641464673)
# ('machine learning competitions', 0.10773000650607861)
# ('francisco this week', 0.11519915079240485)
# ('platform', 0.1183512305596321)
# ('conference in san', 0.12392066376108138)
# ('service', 0.12546743261462942)
# ('goldbloom', 0.14611408778815776)
# """