-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGSE_scrape.py
More file actions
114 lines (88 loc) · 3.94 KB
/
GSE_scrape.py
File metadata and controls
114 lines (88 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
"""GSE Accession Analysis for human: Wordcloud for GSM Characteristics
by Xinran Bi
Modified by Halie Rando as follows:
- Converted from GSE_scrape_human.ipynb on Jan 14 at 12:05pm using `jupyter nbconvert --to script`
- Debugged return statement being outside of an if loop
- Refactored to use the same code for both mouse and human
"""
import re
import json
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
def import_GSE(species):
    """Return the GEO series accessions recorded in GREIN for a species.

    Accepts: species name ("mouse" or "human")
    Returns: list of GEO accession strings (e.g. "GSE12345")
    Exits with status 1 (after printing an error) on an unknown species.

    Note: the original docstring said this returns a pandas dataframe; it
    actually returns a plain list via ``.tolist()``.
    """
    # Map the CLI species name to the Latin name used in the GREIN CSV,
    # so both species share one filtering path.
    latin_names = {"mouse": "Mus musculus", "human": "Homo sapiens"}
    if species not in latin_names:
        print("Error: Unknown species")
        sys.exit(1)
    GREIN_data = pd.read_csv("data/GREIN_data.csv")
    return GREIN_data[GREIN_data.Species == latin_names[species]]['GEO accession'].tolist()
def scrape_geo_data(geo_id):
    """Scrape the GSM sample accessions listed on a GEO accession page.

    Accepts: a GEO accession string (e.g. "GSE12345")
    Returns: a flat list of GSM accession strings on success, or an
             error-message *string* on a non-200 response or request
             failure (callers must check the type before iterating).
    """
    url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={0}".format(geo_id)
    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the whole scrape loop indefinitely.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Sample links all point at the accession viewer CGI with a
            # GSM accession in the query string.
            gsm_links = soup.find_all(
                'a',
                href=lambda href: href and href.startswith('/geo/query/acc.cgi?acc=GSM'))
            # Keep only the visible link text (the GSM accession itself)
            return [link.text for link in gsm_links]
        return "Failed to retrieve the page. Status code: {0}".format(response.status_code)
    except requests.exceptions.RequestException as e:
        return "Error: {0}".format(e)
def scrape_characteristics(geo_id):
    """Scrape the raw HTML of the "Characteristics" cell for a GEO sample.

    Accepts: a GEO accession string (typically a GSM sample accession)
    Returns: the HTML string of the <td> following the "Characteristics"
             label on success, or an error-message string on a non-200
             response, request failure, or when the label is not found
             (in that last case the message misleadingly reports the
             200 status code — kept for backward compatibility).
    """
    url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={0}".format(geo_id)
    try:
        # timeout prevents an unresponsive server from hanging the caller
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the "Characteristics" label cell.  ``string=`` replaces
            # the deprecated ``text=`` keyword (same matching behavior).
            characteristics_label = soup.find('td', string='Characteristics')
            if characteristics_label:
                # The next sibling <td> holds the characteristics content
                characteristics_content = characteristics_label.find_next_sibling('td')
                return str(characteristics_content)
        return "Failed to retrieve the page. Status code: {0}".format(response.status_code)
    except requests.exceptions.RequestException as e:
        return "Error: {0}".format(e)
def extract_characteristics(input_str):
    """Parse "name: value" pairs out of a characteristics table cell.

    Accepts: the HTML string of a GEO characteristics <td> cell
    Returns: dict mapping each characteristic name to its value
    """
    # Strip any opening <td ...> tag so its attributes cannot be mistaken
    # for characteristic pairs.
    cleaned = re.sub(r'<td[^>]*>', '', input_str)
    # Each characteristic reads "name: value"; the value runs up to the
    # next HTML tag.  Later duplicates of a name overwrite earlier ones.
    return {name: value for name, value in re.findall(r'(\w+): ([^<]+)', cleaned)}
def save_results_to_file(results, filename):
    """Write each result to *filename*, one per line, UTF-8 encoded.

    Accepts: an iterable of strings and an output file path
    Returns: None (overwrites any existing file at the path)
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines('{0}\n'.format(entry) for entry in results)
# Define species from the command line; fail fast with usage if missing.
if len(sys.argv) < 2:
    print("Usage: GSE_scrape.py <species>  (species is 'mouse' or 'human')")
    sys.exit(1)
species = sys.argv[1]
# Identify all relevant GSEs for this species
GSEs = import_GSE(species)
full_results = dict()
num_records = dict()
# NOTE(review): only the first 100 series are processed — presumably a
# test-run limit; confirm before a production run.
for GSE in GSEs[:100]:
    # BUG FIX: scrape_geo_data already returns a flat list of GSM
    # accessions; the old nested "flatten" comprehension split every
    # accession string into single characters, so num_records counted
    # characters rather than samples.
    GSM_list = scrape_geo_data(GSE)
    num_records[GSE] = len(GSM_list)
    results = {}
    # for GSM in GSM_list:
    #     characteristics_string = scrape_characteristics(GSM)
    #     characteristics_dictionary = extract_characteristics(characteristics_string)
    #     results[GSM] = characteristics_dictionary
    #     #print(f"Characteristics for {GSM}: {characteristics_dictionary}")
    #full_results[GSE] = results
print(num_records)
json_file = "data/test_characteristics_{0}.json".format(species)
with open(json_file, "w") as file:
    json.dump(full_results, file)
print("Characteristics saved to {0}".format(json_file))