-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGSE_scrape.py
More file actions
114 lines (88 loc) · 3.94 KB
/
GSE_scrape.py
File metadata and controls
114 lines (88 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/env python
"""GSE Accession Analysis for human: Wordcloud for GSM Characteristics
by Xinran Bi
Modified by Halie Rando as follows:
- Converted from GSE_scrape_human.ipynb on Jan 14 at 12:05pm using `jupyter nbconvert --to script`
- Debugged return statement being outside of an if loop
- Refactored to use the same code for both mouse and human
"""
import re
import json
import sys
import requests
from bs4 import BeautifulSoup
import pandas as pd
def import_GSE(species):
    """Return the GEO series accessions recorded in GREIN for a species.

    Accepts: species name ("mouse" or "human")
    Returns: list of GEO accession strings (e.g. "GSE12345")
    Exits with status 1 (after printing an error) on an unknown species.

    Note: the original docstring said this returns a pandas dataframe; it
    actually returns a plain list via ``.tolist()``.
    """
    # Map the CLI species name to the Latin name used in the GREIN CSV,
    # so both species share one filtering path.
    latin_names = {"mouse": "Mus musculus", "human": "Homo sapiens"}
    if species not in latin_names:
        print("Error: Unknown species")
        sys.exit(1)
    GREIN_data = pd.read_csv("data/GREIN_data.csv")
    return GREIN_data[GREIN_data.Species == latin_names[species]]['GEO accession'].tolist()
def scrape_geo_data(geo_id):
    """Scrape the GSM sample accessions listed on a GEO accession page.

    Accepts: a GEO accession string (e.g. "GSE12345")
    Returns: a flat list of GSM accession strings on success, or an
             error-message *string* on a non-200 response or request
             failure (callers must check the type before iterating).
    """
    url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={0}".format(geo_id)
    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the whole scrape loop indefinitely.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Sample links all point at the accession viewer CGI with a
            # GSM accession in the query string.
            gsm_links = soup.find_all(
                'a',
                href=lambda href: href and href.startswith('/geo/query/acc.cgi?acc=GSM'))
            # Keep only the visible link text (the GSM accession itself)
            return [link.text for link in gsm_links]
        return "Failed to retrieve the page. Status code: {0}".format(response.status_code)
    except requests.exceptions.RequestException as e:
        return "Error: {0}".format(e)
def scrape_characteristics(geo_id):
    """Scrape the raw HTML of the "Characteristics" cell for a GEO sample.

    Accepts: a GEO accession string (typically a GSM sample accession)
    Returns: the HTML string of the <td> following the "Characteristics"
             label on success, or an error-message string on a non-200
             response, request failure, or when the label is not found
             (in that last case the message misleadingly reports the
             200 status code — kept for backward compatibility).
    """
    url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={0}".format(geo_id)
    try:
        # timeout prevents an unresponsive server from hanging the caller
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find the "Characteristics" label cell.  ``string=`` replaces
            # the deprecated ``text=`` keyword (same matching behavior).
            characteristics_label = soup.find('td', string='Characteristics')
            if characteristics_label:
                # The next sibling <td> holds the characteristics content
                characteristics_content = characteristics_label.find_next_sibling('td')
                return str(characteristics_content)
        return "Failed to retrieve the page. Status code: {0}".format(response.status_code)
    except requests.exceptions.RequestException as e:
        return "Error: {0}".format(e)
def extract_characteristics(input_str):
    """Parse "name: value" pairs out of a characteristics table cell.

    Accepts: the HTML string of a GEO characteristics <td> cell
    Returns: dict mapping each characteristic name to its value
    """
    # Strip any opening <td ...> tag so its attributes cannot be mistaken
    # for characteristic pairs.
    cleaned = re.sub(r'<td[^>]*>', '', input_str)
    # Each characteristic reads "name: value"; the value runs up to the
    # next HTML tag.  Later duplicates of a name overwrite earlier ones.
    return {name: value for name, value in re.findall(r'(\w+): ([^<]+)', cleaned)}
def save_results_to_file(results, filename):
    """Write each result to *filename*, one per line, UTF-8 encoded.

    Accepts: an iterable of strings and an output file path
    Returns: None (overwrites any existing file at the path)
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines('{0}\n'.format(entry) for entry in results)
# Define species from the command line; fail fast with usage if missing.
if len(sys.argv) < 2:
    print("Usage: GSE_scrape.py <species>  (species is 'mouse' or 'human')")
    sys.exit(1)
species = sys.argv[1]
# Identify all relevant GSEs for this species
GSEs = import_GSE(species)
full_results = dict()
num_records = dict()
# NOTE(review): only the first 100 series are processed — presumably a
# test-run limit; confirm before a production run.
for GSE in GSEs[:100]:
    # BUG FIX: scrape_geo_data already returns a flat list of GSM
    # accessions; the old nested "flatten" comprehension split every
    # accession string into single characters, so num_records counted
    # characters rather than samples.
    GSM_list = scrape_geo_data(GSE)
    num_records[GSE] = len(GSM_list)
    results = {}
    # for GSM in GSM_list:
    #     characteristics_string = scrape_characteristics(GSM)
    #     characteristics_dictionary = extract_characteristics(characteristics_string)
    #     results[GSM] = characteristics_dictionary
    #     #print(f"Characteristics for {GSM}: {characteristics_dictionary}")
    #full_results[GSE] = results
print(num_records)
json_file = "data/test_characteristics_{0}.json".format(species)
with open(json_file, "w") as file:
    json.dump(full_results, file)
print("Characteristics saved to {0}".format(json_file))