-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathscrape.py
More file actions
70 lines (55 loc) · 2 KB
/
scrape.py
File metadata and controls
70 lines (55 loc) · 2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup
import asyncio
import json
import aiohttp
from understat import Understat
# url = 'https://understat.com/league/EPL'
# #Create a handle, page, to handle the contents of the website
# page = requests.get(url)
# #Store the contents of the website under doc
# soup = BeautifulSoup(page.text, 'html.parser')
# print(soup.findAll("div", {'id': 'league-chemp'}))
# table_div = soup.find('div' , {'id': 'league-chemp'})
# table = table_div.find('table')
# content = str(table)
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import re
# Download the Understat league page, pull the JSON blob embedded in one of
# its <script> tags, sum the numeric columns per team, and write the columns
# used downstream to football.csv.

# Entering the league's link
link = "https://understat.com/league/EPL"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the league data.
res = requests.get(link, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
scripts = soup.find_all('script')

# Get the table
# NOTE(review): the data is taken from the third <script> tag by position;
# confirm this still matches the page layout if parsing starts failing.
if len(scripts) < 3 or scripts[2].string is None:
    raise RuntimeError(
        "Expected an inline <script> with the league data at position 2 of "
        + link
    )
strings = scripts[2].string

# Getting rid of unnecessary characters from json data: keep only the text
# between the first "('" and the "')" that follows it.
ind_start = strings.index("('") + 2
ind_end = strings.index("')", ind_start)
json_data = strings[ind_start:ind_end]
# Resolve the backslash escapes in the payload. Encoding with latin-1 +
# backslashreplace (instead of utf8) keeps any non-ASCII character intact:
# unicode_escape decodes raw bytes as Latin-1, so UTF-8 multi-byte sequences
# would otherwise come out garbled.
json_data = json_data.encode('latin-1', 'backslashreplace').decode('unicode_escape')
data = json.loads(json_data)

# One row per team first, then one row per entry of each team's `history`
# list, with the keys of every history entry spread into their own columns.
df = pd.DataFrame(list(data.values()))
df = df.explode("history")
h = df.pop("history")
df = pd.concat([df.reset_index(drop=True), pd.DataFrame(h.tolist())], axis=1)
df = df.infer_objects()

# Columns that are summed per team title.
summed_columns = [
    'wins', 'draws', 'loses', 'scored', 'missed', 'pts',
    'xG', 'xGA', 'xpts', 'npxG', 'npxGA', 'deep', 'deep_allowed',
]
table = df.groupby(['title']).agg(dict.fromkeys(summed_columns, 'sum')).reset_index()
table = table.sort_values(by=['pts'], ascending=False)

# You could uncomment next lines to add a ranking format
# Position = [i for i in range(1,21)]
# table['Pos'] = Position
# table.set_index('Pos', inplace=True)

# Only these four columns are exported; the row index is not written.
csv_table = table[['xG', 'xGA', 'pts', 'xpts']]
csv_table.to_csv('football.csv', index=False)
# x_train = table[['xG', 'xGA', 'pts']].values
# y_train = table['xpts'].values