-
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_pipeline.py
More file actions
61 lines (48 loc) · 1.72 KB
/
data_pipeline.py
File metadata and controls
61 lines (48 loc) · 1.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from bs4 import BeautifulSoup
import requests
import urllib3
import pandas as pd
# The admissions site serves an insecure/incomplete SSL certificate chain, so
# the request below is made with certificate verification turned off; this
# silences the warning urllib3 would otherwise emit for that request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Change the URL each year in accordance with the new cutoff data.
URL = "https://admissions.bits-pilani.ac.in/FD/BITSAT_cutOffs.html?FQwp43qOeKhayi8LEQVUtJn3QNZ0TciWLP4NKxNMfcgzQdzcqZCCLqDBZRDnjcsHWFGgSC&yr=2025-2026&eKhayi8LEQwp4NKxN+CfCh+3qOVUtJn3QNZ0TciWLP4"

# Static value for the year chosen.
year = 2025

# Seconds to wait for the server before giving up. Without an explicit
# timeout, requests would wait indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

response = requests.get(URL, verify=False, timeout=REQUEST_TIMEOUT)
# Fail loudly on an HTTP error status instead of parsing an error page as if
# it were the cutoff page.
response.raise_for_status()
html_text = response.text
soup = BeautifulSoup(html_text, "lxml")

# Container <div> holding the cutoff tables; modify the id accordingly when
# the URL changes. `soup.find` returns None when no such element exists.
div = soup.find("div", id="2025-2026")
# Campus names accepted verbatim. Any campus text containing "Goa" is
# collapsed to "Goa" before this check is applied.
KNOWN_CAMPUSES = ("Pilani", "Goa", "Hyderabad")

# One dict per scraped cutoff row: campus, branch, marks, year.
data = []

# `div` is None when the expected <div> was not found on the page; treat that
# as "no tables" so `data` simply stays empty instead of the script dying
# with an AttributeError.
tables = div.find_all("table") if div is not None else []

for table in tables:
    rows = table.find_all("tr")
    # Skip any tiny tables in the way (fewer than three rows).
    if len(rows) < 3:
        continue

    # The first row of each table is skipped (presumably the header row).
    for row in rows[1:]:
        cols = row.find_all("td")
        # Only rows with at least four cells are considered; the first three
        # cells are read as campus, program and cutoff.
        if len(cols) < 4:
            continue

        campus = cols[0].get_text(strip=True)
        program = cols[1].get_text(strip=True)
        cutoff = cols[2].get_text(strip=True)

        if "Goa" in campus:
            campus = "Goa"
        elif campus not in KNOWN_CAMPUSES:
            continue

        # Validate that this is a data row: the cutoff must be a plain
        # number and the program cell must not be a repeated column title.
        # `isdecimal` is used rather than `isdigit` because `isdigit` also
        # accepts characters such as superscript digits, which `int()`
        # rejects with a ValueError.
        if not cutoff.isdecimal() or program.lower() == "program":
            continue

        data.append(
            {
                "campus": campus,
                "branch": program,
                "marks": int(cutoff),
                "year": year,
            }
        )
# Report failure when nothing was scraped; otherwise de-duplicate the rows,
# write them to CSV and show a short preview.
if not data:
    print("\ntask failed")
else:
    df = pd.DataFrame(data).drop_duplicates()
    df.to_csv("bitsat_cutoffs.csv", index=False)
    print(df.head(10))