From 85ecc01959347a69b0cf49fccdb9eae52082bf33 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 01:38:04 -0600 Subject: [PATCH 01/14] basic html scraping --- scrapers/degrees.go | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 scrapers/degrees.go diff --git a/scrapers/degrees.go b/scrapers/degrees.go new file mode 100644 index 0000000..e69de29 From d1cde2a5a4e3c8f0d772182d3ad25f8392eafdfe Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 01:38:50 -0600 Subject: [PATCH 02/14] basic html scraping --- scrapers/degrees.go | 56 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/scrapers/degrees.go b/scrapers/degrees.go index e69de29..e66624b 100644 --- a/scrapers/degrees.go +++ b/scrapers/degrees.go @@ -0,0 +1,56 @@ +package scrapers + +import ( + "bufio" + "log" + "os" + "path/filepath" + + "github.com/UTDNebula/api-tools/utils" + "github.com/chromedp/chromedp" +) + +func ScrapeDegrees(outDir string) { + // Define the URL (replace with actual URL) + const URL = "https://academics.utdallas.edu/degrees/#filter=.alldegrees.bass" + + ctx, cancel := utils.InitChromeDp() + defer cancel() + + var html string + log.Println("Scraping Degrees!") + err := chromedp.Run(ctx, + chromedp.Navigate(URL), + chromedp.WaitVisible("body", chromedp.ByQuery), + chromedp.OuterHTML("html", &html, chromedp.ByQuery), + ) + if err != nil { + log.Panicf("failed to scrape: %v", err) + } + + // Ensure the output directory exists + outputPath := filepath.Join(outDir, "degrees") + err = os.MkdirAll(outputPath, os.ModePerm) + if err != nil { + log.Panicf("failed to create directory: %v", err) + } + + // Write HTML to file + filename := filepath.Join(outputPath, "degrees.html") + file, err := os.Create(filename) + if err != nil { + log.Panicf("failed to create file: %v", err) + } + defer file.Close() + + writer := bufio.NewWriter(file) + defer writer.Flush() + + // Write HTML content + _, err = writer.WriteString(html) + if err != nil { + log.Panicf("failed to write HTML: %v", err) + } + + log.Println("Successfully scraped and saved degrees data.") +} From 60f9687342027968ca27010194e0521256bc3d0b Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 23:38:23 -0600 Subject: [PATCH 03/14] degree scraper is finished --- scrapers/degrees.go | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/scrapers/degrees.go b/scrapers/degrees.go index e66624b..a8db099 100644 --- a/scrapers/degrees.go +++ b/scrapers/degrees.go @@ -1,7 +1,7 @@ package scrapers import ( - "bufio" + "fmt" "log" "os" "path/filepath" @@ -13,6 +13,7 @@ import ( func ScrapeDegrees(outDir string) { // Define the URL (replace with actual URL) const URL = "https://academics.utdallas.edu/degrees/#filter=.alldegrees.bass" + const scrollHeight = 5 ctx, cancel := utils.InitChromeDp() defer cancel() @@ -35,22 +36,12 @@ func ScrapeDegrees(outDir string) { log.Panicf("failed to create directory: %v", err) } - // Write HTML to file - filename := filepath.Join(outputPath, "degrees.html") - file, err := os.Create(filename) + // Write raw HTML to file + outPath := fmt.Sprintf("%s/degreesScraped.html", outDir) + err = os.WriteFile(outPath, []byte(html), 0644) if err != nil { - log.Panicf("failed to create file: %v", err) + panic(err) } - defer file.Close() - writer := bufio.NewWriter(file) - defer writer.Flush() - - // Write HTML content - _, err = writer.WriteString(html) - if err != nil { - log.Panicf("failed to write HTML: %v", err) - } - - log.Println("Successfully scraped and saved degrees data.") + log.Printf("Finished scraping discount page successfully!\n\n") } From 4b00eb689d2d1d8f9cbf690ba686084e1fcf3302 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 23:39:09 -0600 Subject: [PATCH 04/14] starting on parser --- parser/degreeParser.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 parser/degreeParser.go diff --git a/parser/degreeParser.go b/parser/degreeParser.go new file mode 100644 index 0000000..5d061d7 --- /dev/null +++ b/parser/degreeParser.go @@ -0,0 +1,31 @@ +package parser + +import ( + "fmt" + "log" + "os" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +func ParseDegrees(inDir string) { + // Read the scraped HTML file + htmlPath := fmt.Sprintf("%s/discountsScraped.html", inDir) + htmlBytes, err := os.ReadFile(htmlPath) + if err != nil { + panic(err) + } + + log.Println("Parsing Degrees...") + + page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes))) + + // Find main content + content := page.Find("col-sm-12").First() + if content.Length() == 0 { + panic("failed to find content area") + } + + fmt.Print(content.Text()) +} From cab12129a920b581b44b1a69638c325343adf84f Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 23:45:06 -0600 Subject: [PATCH 05/14] added flags to main.go --- main.go | 5 +++++ parser/degreeParser.go | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/main.go b/main.go index 3f405d9..e4b80f4 100644 --- a/main.go +++ b/main.go @@ -48,6 +48,7 @@ func main() { mapFlag := flag.Bool("map", false, "Alongside -scrape, -parse, or -upload, signifies that the UTD map should be scraped/parsed/uploaded.") // Flag for academic calendar scraping academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.") + degrees := flag.Bool("degrees", false, "Alongside -scrape or -parse, signifies that the degrees should be scraped/parsed.") // Flags for parsing parse := flag.Bool("parse", false, "Puts the tool into parsing mode.") @@ -118,6 +119,8 @@ func main() { scrapers.ScrapeMapLocations(*outDir) case *academicCalendars: scrapers.ScrapeAcademicCalendars(*outDir) + case *degrees: + scrapers.ScrapeDegrees(*outDir) default: log.Panic("You must specify which type of scraping you would like to perform with one of the scraping flags!") } @@ -135,6 +138,8 @@ func main() { parser.ParseAcademicCalendars(*inDir, *outDir) case *scrapeDiscounts: parser.ParseDiscounts(*inDir, *outDir) + case *degrees: + parser.ParseDegrees(*inDir, *outDir) default: parser.Parse(*inDir, *outDir, *csvDir, *skipValidation) } diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 5d061d7..779fc3c 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -9,7 +9,7 @@ import ( "github.com/PuerkitoBio/goquery" ) -func ParseDegrees(inDir string) { +func ParseDegrees(inDir string, outDir string) { // Read the scraped HTML file htmlPath := fmt.Sprintf("%s/discountsScraped.html", inDir) htmlBytes, err := os.ReadFile(htmlPath) From fcc54cf845a02551034384614beb3a4bf065e0ab Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Tue, 10 Feb 2026 23:54:23 -0600 Subject: [PATCH 06/14] minor comment --- main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/main.go b/main.go index e4b80f4..c3aae01 100644 --- a/main.go +++ b/main.go @@ -48,6 +48,7 @@ func main() { mapFlag := flag.Bool("map", false, "Alongside -scrape, -parse, or -upload, signifies that the UTD map should be scraped/parsed/uploaded.") // Flag for academic calendar scraping academicCalendars := flag.Bool("academicCalendars", false, "Alongside -scrape, -parse, or -upload, signifies that the academic calendars should be scraped/parsed/uploaded.") + // Flag for degree scraping and parsing degrees := flag.Bool("degrees", false, "Alongside -scrape or -parse, signifies that the degrees should be scraped/parsed.") // Flags for parsing From d8cce803632ff24fea862d4446feb189c1b8f8a5 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Thu, 12 Feb 2026 00:02:55 -0600 Subject: [PATCH 07/14] parsing degree name and school successfuly --- parser/degreeParser.go | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 779fc3c..7ba1406 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -1,6 +1,7 @@ package parser import ( + "encoding/json" "fmt" "log" "os" @@ -9,9 +10,24 @@ import ( "github.com/PuerkitoBio/goquery" ) +type Degree struct { + Id string `bson:"id" json:"id"` + Title string `bson:"name" json:"name"` + School string `bson:"school" json:"school"` + Department string `bson:"department" json:"department"` + StemDesigned bool `bson:"stem_designated" json:"stem_designated"` + DegreeLevels []DegreeLevel `bson:"degreeLevels" json:"degreeLevels"` + PublicUrl string `bson:"public_url" json:"public_url"` +} + +type DegreeLevel struct { + Level string `bson:"level" json:"level"` + Abbreviation string `bson:"abbreviation" json:"abbreviation"` +} + func ParseDegrees(inDir string, outDir string) { // Read the scraped HTML file - htmlPath := fmt.Sprintf("%s/discountsScraped.html", inDir) + htmlPath := fmt.Sprintf("%s/degreesScraped.html", inDir) htmlBytes, err := os.ReadFile(htmlPath) if err != nil { panic(err) @@ -22,10 +38,30 @@ func ParseDegrees(inDir string, outDir string) { page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes))) // Find main content - content := page.Find("col-sm-12").First() + content := page.Find("article .col-sm-12").First() if content.Length() == 0 { panic("failed to find content area") } - fmt.Print(content.Text()) + //var degrees []Degrees + + content.Find("div .element-item.all.alldegrees.allschools.academic.bass.masters"). + Each(func(i int, s *goquery.Selection) { + degree := Degree{} + + header := s.Find("div.degreeTitle, div") + title := header.Find("h3") + school := header.Find("div.school") + //schoolLink := header.Find("div.school, a") + + degree.Title = strings.TrimSpace(title.Text()) + degree.School = strings.TrimSpace(school.Text()) + + marshalled, err := json.MarshalIndent(degree, "", "\t") + if err != nil { + panic("could not convert degree to JSON format") + } + + log.Print(string(marshalled)) + }) } From 8ff5a5fb55d5352b52fb1707a5415f8137c6058b Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Fri, 13 Feb 2026 01:12:02 -0600 Subject: [PATCH 08/14] scraping works for all degrees related to one school --- parser/degreeParser.go | 103 +++++++++++++++++++++++++++++++---------- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 7ba1406..6906dc6 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -11,18 +11,17 @@ import ( ) type Degree struct { - Id string `bson:"id" json:"id"` - Title string `bson:"name" json:"name"` - School string `bson:"school" json:"school"` - Department string `bson:"department" json:"department"` - StemDesigned bool `bson:"stem_designated" json:"stem_designated"` - DegreeLevels []DegreeLevel `bson:"degreeLevels" json:"degreeLevels"` - PublicUrl string `bson:"public_url" json:"public_url"` + Title string `bson:"name" json:"name"` + School string `bson:"school" json:"school"` + DegreeLevels []DegreeLevel `bson:"degree_levels" json:"degree_levels"` + AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"` } type DegreeLevel struct { - Level string `bson:"level" json:"level"` - Abbreviation string `bson:"abbreviation" json:"abbreviation"` + Level string `bson:"level" json:"level"` + PublicUrl string `bson:"public_url" json:"public_url"` + CipCode string `bson:"cip_code" json:"cip_code"` + StemDesignated bool `bson:"stem_designated" json:"stem_designated"` } func ParseDegrees(inDir string, outDir string) { @@ -43,25 +42,79 @@ func ParseDegrees(inDir string, outDir string) { panic("failed to find content area") } - //var degrees []Degrees + var degreeLevels []DegreeLevel + content.Find("div .element-item.all.alldegrees.allschools.academic.bass.masters").Each(func(i int, s *goquery.Selection) { + header := s.Find("div > h3").Parent() + title := header.Find("h3") + school := header.Find("div.school") - content.Find("div .element-item.all.alldegrees.allschools.academic.bass.masters"). - Each(func(i int, s *goquery.Selection) { - degree := Degree{} - - header := s.Find("div.degreeTitle, div") - title := header.Find("h3") - school := header.Find("div.school") - //schoolLink := header.Find("div.school, a") - - degree.Title = strings.TrimSpace(title.Text()) - degree.School = strings.TrimSpace(school.Text()) + s.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { + level, exists := degreeLink.Attr("alt") + if !exists { + log.Println("error parsing alt value:") + } - marshalled, err := json.MarshalIndent(degree, "", "\t") - if err != nil { - panic("could not convert degree to JSON format") + urlForDegree, exists := degreeLink.Attr("href") + if !exists { + log.Println("error parsing href value:") } - log.Print(string(marshalled)) + cipCode := degreeLink.Find("div.cip_code") + stemDesignated := degreeLink.Find("div.footnote").Last() // There is only 1 element named STEM-Designated + + degreeLevels = append(degreeLevels, DegreeLevel{ + Level: level, + PublicUrl: strings.TrimSpace(urlForDegree), + CipCode: strings.TrimSpace(cipCode.Text()), + StemDesignated: isNotBlank(strings.TrimSpace(stemDesignated.Text())), + }) }) + + areasOfInterest := s.Find("div.areas_of_interest.d-none").First() + + d := Degree{ + Title: strings.TrimSpace(title.Text()), + School: strings.TrimSpace(school.Text()), + DegreeLevels: degreeLevels, + AreasOfInterest: parseAreasOfInterest(areasOfInterest.Text()), + } + + marshalled, err := json.MarshalIndent(d, "", "\t") + if err != nil { + panic("could not convert degree to JSON format") + } + + /* Debug */ + log.Print(string(marshalled)) + + /* Write to output File */ + outFile, err := os.Create(fmt.Sprintf("%s/degrees.json", outDir)) + if err != nil { + log.Fatalf("could not create output file: %s", err) + } + + _, err = outFile.Write(marshalled) + if err != nil { + log.Fatalf("could not write to output file: %s", err) + } + }) +} + +func isNotBlank(s string) bool { + return s != "" && len(strings.TrimSpace(s)) > 0 +} + +func parseAreasOfInterest(tags string) []string { + return strings.Split(strings.TrimSpace(tags), ",") +} + +// Generate all possible combinations of filters +/* +func GenerateAllCombinations() []map[string]string { + schools := []string{"bass", "jindal", ""} + levels := []string{"bachelors", "masters", ""} + depts := []string{"academic", ""} + + var combinations []map[string]string } +*/ From eeddba26fefd3ea29352c62c20b3b02949437e69 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Fri, 13 Feb 2026 01:22:43 -0600 Subject: [PATCH 09/14] fix build not working for some reason --- parser/degreeParser.go | 5 ++++- scrapers/degrees.go | 3 +-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 6906dc6..b18e8e2 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -35,6 +35,9 @@ func ParseDegrees(inDir string, outDir string) { log.Println("Parsing Degrees...") page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes))) + if err != nil { + panic(err) + } // Find main content content := page.Find("article .col-sm-12").First() @@ -60,7 +63,7 @@ func ParseDegrees(inDir string, outDir string) { } cipCode := degreeLink.Find("div.cip_code") - stemDesignated := degreeLink.Find("div.footnote").Last() // There is only 1 element named STEM-Designated + stemDesignated := degreeLink.Find("div.footnote").Last() // There is either 1 element named STEM-Designated or no elements at all degreeLevels = append(degreeLevels, DegreeLevel{ Level: level, diff --git a/scrapers/degrees.go b/scrapers/degrees.go index a8db099..5a9ac48 100644 --- a/scrapers/degrees.go +++ b/scrapers/degrees.go @@ -11,9 +11,8 @@ import ( ) func ScrapeDegrees(outDir string) { - // Define the URL (replace with actual URL) + // Define the URL const URL = "https://academics.utdallas.edu/degrees/#filter=.alldegrees.bass" - const scrollHeight = 5 ctx, cancel := utils.InitChromeDp() defer cancel() From 1168b18f200e956e3dbe0613a711cfa889a77c8a Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Sat, 14 Feb 2026 23:54:35 -0600 Subject: [PATCH 10/14] I'm able to parse all the degrees however I made an error with the stemDesignatedField in schema. There are more possible values --- parser/degreeParser.go | 138 ++++++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 64 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index b18e8e2..8bd3cec 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -10,14 +10,14 @@ import ( "github.com/PuerkitoBio/goquery" ) -type Degree struct { - Title string `bson:"name" json:"name"` - School string `bson:"school" json:"school"` - DegreeLevels []DegreeLevel `bson:"degree_levels" json:"degree_levels"` - AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"` +type Program struct { + Title string `bson:"name" json:"name"` + School string `bson:"school" json:"school"` + DegreeOptions []Degree `bson:"degree_levels" json:"degree_levels"` + AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"` } -type DegreeLevel struct { +type Degree struct { Level string `bson:"level" json:"level"` PublicUrl string `bson:"public_url" json:"public_url"` CipCode string `bson:"cip_code" json:"cip_code"` @@ -45,62 +45,70 @@ func ParseDegrees(inDir string, outDir string) { panic("failed to find content area") } - var degreeLevels []DegreeLevel - content.Find("div .element-item.all.alldegrees.allschools.academic.bass.masters").Each(func(i int, s *goquery.Selection) { - header := s.Find("div > h3").Parent() - title := header.Find("h3") - school := header.Find("div.school") + programHTML := GenerateAllCombinations() + + var allPrograms []Program + for _, program := range programHTML { + content.Find(program).Each(func(i int, s *goquery.Selection) { + header := s.Find("div > h3").Parent() + title := header.Find("h3") + school := header.Find("div.school") + var degrees []Degree + s.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { + + level, exists := degreeLink.Attr("alt") + if !exists { + log.Println("error parsing alt value:") + } + + urlForDegree, exists := degreeLink.Attr("href") + if !exists { + log.Println("error parsing href value:") + } + + cipCode := degreeLink.Find("div.cip_code") + stemDesignated := degreeLink.Find("div.footnote").Last() // There is either 1 element named STEM-Designated or no elements at all + + degrees = append(degrees, Degree{ + Level: level, + PublicUrl: strings.TrimSpace(urlForDegree), + CipCode: strings.TrimSpace(cipCode.Text()), + StemDesignated: isNotBlank(strings.TrimSpace(stemDesignated.Text())), + }) + }) - s.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { - level, exists := degreeLink.Attr("alt") - if !exists { - log.Println("error parsing alt value:") - } + areasOfInterest := s.Find("div.areas_of_interest.d-none").First() - urlForDegree, exists := degreeLink.Attr("href") - if !exists { - log.Println("error parsing href value:") + newProgram := Program{ + Title: strings.TrimSpace(title.Text()), + School: strings.TrimSpace(school.Text()), + DegreeOptions: degrees, + AreasOfInterest: parseAreasOfInterest(areasOfInterest.Text()), } - cipCode := degreeLink.Find("div.cip_code") - stemDesignated := degreeLink.Find("div.footnote").Last() // There is either 1 element named STEM-Designated or no elements at all - - degreeLevels = append(degreeLevels, DegreeLevel{ - Level: level, - PublicUrl: strings.TrimSpace(urlForDegree), - CipCode: strings.TrimSpace(cipCode.Text()), - StemDesignated: isNotBlank(strings.TrimSpace(stemDesignated.Text())), - }) + allPrograms = append(allPrograms, newProgram) }) + } + + marshalled, err := json.MarshalIndent(allPrograms, "", "\t") + if err != nil { + panic("could not convert degree to JSON format") + } - areasOfInterest := s.Find("div.areas_of_interest.d-none").First() - - d := Degree{ - Title: strings.TrimSpace(title.Text()), - School: strings.TrimSpace(school.Text()), - DegreeLevels: degreeLevels, - AreasOfInterest: parseAreasOfInterest(areasOfInterest.Text()), - } - - marshalled, err := json.MarshalIndent(d, "", "\t") - if err != nil { - panic("could not convert degree to JSON format") - } - - /* Debug */ - log.Print(string(marshalled)) - - /* Write to output File */ - outFile, err := os.Create(fmt.Sprintf("%s/degrees.json", outDir)) - if err != nil { - log.Fatalf("could not create output file: %s", err) - } - - _, err = outFile.Write(marshalled) - if err != nil { - log.Fatalf("could not write to output file: %s", err) - } - }) + /* Debug */ + log.Print(string(marshalled)) + + /* Write to output File */ + outFile, err := os.Create(fmt.Sprintf("%s/degrees.json", outDir)) + if err != nil { + log.Fatalf("could not create output file: %s", err) + } + defer outFile.Close() + + _, err = outFile.Write(marshalled) + if err != nil { + log.Fatalf("could not write to output file: %s", err) + } } func isNotBlank(s string) bool { @@ -108,16 +116,18 @@ func isNotBlank(s string) bool { } func parseAreasOfInterest(tags string) []string { - return strings.Split(strings.TrimSpace(tags), ",") + return strings.Split(strings.TrimSpace(tags), ", ") } // Generate all possible combinations of filters -/* -func GenerateAllCombinations() []map[string]string { - schools := []string{"bass", "jindal", ""} - levels := []string{"bachelors", "masters", ""} - depts := []string{"academic", ""} +func GenerateAllCombinations() []string { + schools := []string{"bass", "jindal", "nsm", "ecs", "bbs", "epps"} + + var combinations []string + + for _, s := range schools { + combinations = append(combinations, fmt.Sprintf("div .element-item.all.alldegrees.allschools.academic.%s", s)) + } - var combinations []map[string]string + return combinations } -*/ From 9b960a01a010f7c0aabf9066cb8ab2641417bd98 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Sun, 15 Feb 2026 00:13:21 -0600 Subject: [PATCH 11/14] Finished, now I need to polish stuff --- parser/degreeParser.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 8bd3cec..d382f52 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -22,6 +22,7 @@ type Degree struct { PublicUrl string `bson:"public_url" json:"public_url"` CipCode string `bson:"cip_code" json:"cip_code"` StemDesignated bool `bson:"stem_designated" json:"stem_designated"` + JointProgram bool `bson:"joint_program" json:"joint_program"` } func ParseDegrees(inDir string, outDir string) { @@ -45,11 +46,11 @@ func ParseDegrees(inDir string, outDir string) { panic("failed to find content area") } - programHTML := GenerateAllCombinations() + programsHTML := GenerateAllCombinations() var allPrograms []Program - for _, program := range programHTML { - content.Find(program).Each(func(i int, s *goquery.Selection) { + for _, programHTML := range programsHTML { + content.Find(programHTML).Each(func(i int, s *goquery.Selection) { header := s.Find("div > h3").Parent() title := header.Find("h3") school := header.Find("div.school") @@ -67,13 +68,14 @@ func ParseDegrees(inDir string, outDir string) { } cipCode := degreeLink.Find("div.cip_code") - stemDesignated := degreeLink.Find("div.footnote").Last() // There is either 1 element named STEM-Designated or no elements at all + footnote := degreeLink.Find("div.footnote") // There is either 1 element named STEM-Designated or no elements at all degrees = append(degrees, Degree{ Level: level, PublicUrl: strings.TrimSpace(urlForDegree), CipCode: strings.TrimSpace(cipCode.Text()), - StemDesignated: isNotBlank(strings.TrimSpace(stemDesignated.Text())), + StemDesignated: strings.Contains(strings.TrimSpace(footnote.Text()), "STEM-Designated"), + JointProgram: strings.Contains(strings.TrimSpace(footnote.Text()), "Joint Program"), }) }) @@ -111,10 +113,6 @@ func ParseDegrees(inDir string, outDir string) { } } -func isNotBlank(s string) bool { - return s != "" && len(strings.TrimSpace(s)) > 0 -} - func parseAreasOfInterest(tags string) []string { return strings.Split(strings.TrimSpace(tags), ", ") } From 305be352a18b24154a5ae7e29d384a5b69220dac Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Sun, 15 Feb 2026 00:31:48 -0600 Subject: [PATCH 12/14] polish, just going to await for new Schema --- parser/degreeParser.go | 127 +++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 56 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index d382f52..763ac80 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -25,104 +25,119 @@ type Degree struct { JointProgram bool `bson:"joint_program" json:"joint_program"` } +// Parses scarped degree HTML and outputs the data in JSON func ParseDegrees(inDir string, outDir string) { // Read the scraped HTML file htmlPath := fmt.Sprintf("%s/degreesScraped.html", inDir) htmlBytes, err := os.ReadFile(htmlPath) if err != nil { - panic(err) + log.Fatalf("could not read HTML file: %v", err) } - log.Println("Parsing Degrees...") - + // Parse the document page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes))) if err != nil { - panic(err) + log.Fatalf("failed to parse HTML: %v", err) } // Find main content content := page.Find("article .col-sm-12").First() if content.Length() == 0 { - panic("failed to find content area") + log.Fatalf("failed to find content area") } - programsHTML := GenerateAllCombinations() + // Generate all possible combinations of degree filters + // This is done to cover all degrees from different schools e.g. ECS, NSM, etc + allProgramHTMLs := generateAllCombinations() var allPrograms []Program - for _, programHTML := range programsHTML { + for _, programHTML := range allProgramHTMLs { content.Find(programHTML).Each(func(i int, s *goquery.Selection) { - header := s.Find("div > h3").Parent() - title := header.Find("h3") - school := header.Find("div.school") - var degrees []Degree - s.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { - - level, exists := degreeLink.Attr("alt") - if !exists { - log.Println("error parsing alt value:") - } - - urlForDegree, exists := degreeLink.Attr("href") - if !exists { - log.Println("error parsing href value:") - } - - cipCode := degreeLink.Find("div.cip_code") - footnote := degreeLink.Find("div.footnote") // There is either 1 element named STEM-Designated or no elements at all - - degrees = append(degrees, Degree{ - Level: level, - PublicUrl: strings.TrimSpace(urlForDegree), - CipCode: strings.TrimSpace(cipCode.Text()), - StemDesignated: strings.Contains(strings.TrimSpace(footnote.Text()), "STEM-Designated"), - JointProgram: strings.Contains(strings.TrimSpace(footnote.Text()), "Joint Program"), - }) - }) - - areasOfInterest := s.Find("div.areas_of_interest.d-none").First() - - newProgram := Program{ - Title: strings.TrimSpace(title.Text()), - School: strings.TrimSpace(school.Text()), - DegreeOptions: degrees, - AreasOfInterest: parseAreasOfInterest(areasOfInterest.Text()), - } - - allPrograms = append(allPrograms, newProgram) + extractProgram(s, &allPrograms) }) } + // Convert to JSON marshalled, err := json.MarshalIndent(allPrograms, "", "\t") if err != nil { - panic("could not convert degree to JSON format") + log.Fatalf("could not convert programs to JSON format: %v", err) } - /* Debug */ - log.Print(string(marshalled)) - - /* Write to output File */ + // Write to output file outFile, err := os.Create(fmt.Sprintf("%s/degrees.json", outDir)) if err != nil { - log.Fatalf("could not create output file: %s", err) + log.Fatalf("could not create output file: %v", err) } defer outFile.Close() _, err = outFile.Write(marshalled) if err != nil { - log.Fatalf("could not write to output file: %s", err) + log.Fatalf("could not write to output file: %v", err) } } -func parseAreasOfInterest(tags string) []string { - return strings.Split(strings.TrimSpace(tags), ", ") +func extractProgram(selection *goquery.Selection, programs *[]Program) { + header := selection.Find("div > h3").Parent() + title := header.Find("h3") + school := header.Find("div.school") + + var degrees []Degree + selection.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { + // The alt attribute represents the Degree Level + // Examples: BS, MS, PHD + level, exists := degreeLink.Attr("alt") + if !exists { + log.Println("error parsing alt value:") + return + } + + // Extracts the URL to the degree's page. + urlForDegree, exists := degreeLink.Attr("href") + if !exists { + log.Println("Error parsing href value:") + return + } + + // Extracts Classification of Instructional Programs Codes + // These codes provide a standardized system for reporting data about academic programs across different colleges and universities + cipCode := degreeLink.Find("div.cip_code") + + // Extracts the footnote from the degree HTML + // Relevant footnotes are STEM-Designated and Joint Program + footnote := degreeLink.Find("div.footnote") + + degrees = append(degrees, Degree{ + Level: level, + PublicUrl: strings.TrimSpace(urlForDegree), + CipCode: strings.TrimSpace(cipCode.Text()), + StemDesignated: strings.Contains(strings.TrimSpace(footnote.Text()), "STEM-Designated"), + JointProgram: strings.Contains(strings.TrimSpace(footnote.Text()), "Joint Program"), + }) + }) + + // Extracts a list of tags that correlate to what might interest a student + // Example for Computer Science: Artificial intelligence, AI, computer science, software, robotics, computer vision, digital forensics + areasOfInterest := selection.Find("div.areas_of_interest.d-none").First() + + newProgram := Program{ + Title: strings.TrimSpace(title.Text()), + School: strings.TrimSpace(school.Text()), + DegreeOptions: degrees, + AreasOfInterest: strings.Split(strings.TrimSpace(areasOfInterest.Text()), ", "), + } + + *programs = append(*programs, newProgram) } -// Generate all possible combinations of filters -func GenerateAllCombinations() []string { +// Generates a list of all possible HTML endpoints for a degree from the HTML Page. +// Each endpoint corresponds to a specific school, combining it with common CSS selectors used in the document structure. +func generateAllCombinations() []string { + // List of schools for which we need to generate combination selectors. schools := []string{"bass", "jindal", "nsm", "ecs", "bbs", "epps"} var combinations []string + // Loop through each school and generate the corresponding HTML selector. for _, s := range schools { combinations = append(combinations, fmt.Sprintf("div .element-item.all.alldegrees.allschools.academic.%s", s)) } From 58e9661356b5b4128fa75058680cb23a60c0046c Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Sun, 15 Feb 2026 00:49:26 -0600 Subject: [PATCH 13/14] added verbose printing --- parser/degreeParser.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index 763ac80..e3109c4 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -8,12 +8,13 @@ import ( "strings" "github.com/PuerkitoBio/goquery" + "github.com/UTDNebula/api-tools/utils" ) -type Program struct { +type AcademicProgram struct { Title string `bson:"name" json:"name"` School string `bson:"school" json:"school"` - DegreeOptions []Degree `bson:"degree_levels" json:"degree_levels"` + DegreeOptions []Degree `bson:"degree_options" json:"degree_options"` AreasOfInterest []string `bson:"areas_of_interest" json:"areas_of_interest"` } @@ -33,6 +34,7 @@ func ParseDegrees(inDir string, outDir string) { if err != nil { log.Fatalf("could not read HTML file: %v", err) } + utils.VPrintf("Read %d bytes from %s", len(htmlBytes), htmlPath) // Parse the document page, err := goquery.NewDocumentFromReader(strings.NewReader(string(htmlBytes))) @@ -45,17 +47,20 @@ func ParseDegrees(inDir string, outDir string) { if content.Length() == 0 { log.Fatalf("failed to find content area") } + utils.VPrintf("Found main content area") // Generate all possible combinations of degree filters // This is done to cover all degrees from different schools e.g. ECS, NSM, etc allProgramHTMLs := generateAllCombinations() + utils.VPrintf("Generated %d program combinations to search", len(allProgramHTMLs)) - var allPrograms []Program + var allPrograms []AcademicProgram for _, programHTML := range allProgramHTMLs { content.Find(programHTML).Each(func(i int, s *goquery.Selection) { extractProgram(s, &allPrograms) }) } + utils.VPrintf("Extracted %d programs", len(allPrograms)) // Convert to JSON marshalled, err := json.MarshalIndent(allPrograms, "", "\t") @@ -74,12 +79,14 @@ func ParseDegrees(inDir string, outDir string) { if err != nil { log.Fatalf("could not write to output file: %v", err) } + utils.VPrintf("Successfully wrote degrees to %s/degrees.json", outDir) } -func extractProgram(selection *goquery.Selection, programs *[]Program) { +func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) { header := selection.Find("div > h3").Parent() title := header.Find("h3") school := header.Find("div.school") + utils.VPrintf("Extracting program: %s (%s)", strings.TrimSpace(title.Text()), strings.TrimSpace(school.Text())) var degrees []Degree selection.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { @@ -114,17 +121,19 @@ func extractProgram(selection *goquery.Selection, programs *[]Program) { JointProgram: strings.Contains(strings.TrimSpace(footnote.Text()), "Joint Program"), }) }) + utils.VPrintf(" Found %d degrees", len(degrees)) // Extracts a list of tags that correlate to what might interest a student // Example for Computer Science: Artificial intelligence, AI, computer science, software, robotics, computer vision, digital forensics areasOfInterest := selection.Find("div.areas_of_interest.d-none").First() - newProgram := Program{ + newProgram := AcademicProgram{ Title: strings.TrimSpace(title.Text()), School: strings.TrimSpace(school.Text()), DegreeOptions: degrees, AreasOfInterest: strings.Split(strings.TrimSpace(areasOfInterest.Text()), ", "), } + utils.VPrintf(" Areas of interest: %d topics", len(newProgram.AreasOfInterest)) *programs = append(*programs, newProgram) } From 0e6df2cf00621af35522dcd2149be78ed26e0521 Mon Sep 17 00:00:00 2001 From: Flavore669 Date: Sun, 15 Feb 2026 01:00:58 -0600 Subject: [PATCH 14/14] minor comment edits --- parser/degreeParser.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/parser/degreeParser.go b/parser/degreeParser.go index e3109c4..967b2fa 100644 --- a/parser/degreeParser.go +++ b/parser/degreeParser.go @@ -90,9 +90,9 @@ func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) { var degrees []Degree selection.Find("div.degrees > a.footnote").Each(func(j int, degreeLink *goquery.Selection) { - // The alt attribute represents the Degree Level + // The alt attribute represents the Degree Option // Examples: BS, MS, PHD - level, exists := degreeLink.Attr("alt") + degreeOption, exists := degreeLink.Attr("alt") if !exists { log.Println("error parsing alt value:") return @@ -110,11 +110,11 @@ func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) { cipCode := degreeLink.Find("div.cip_code") // Extracts the footnote from the degree HTML - // Relevant footnotes are STEM-Designated and Joint Program + // Relevant footnotes are 'STEM-Designated' and 'Joint Program' footnote := degreeLink.Find("div.footnote") degrees = append(degrees, Degree{ - Level: level, + Level: degreeOption, PublicUrl: strings.TrimSpace(urlForDegree), CipCode: strings.TrimSpace(cipCode.Text()), StemDesignated: strings.Contains(strings.TrimSpace(footnote.Text()), "STEM-Designated"), @@ -138,15 +138,15 @@ func extractProgram(selection *goquery.Selection, programs *[]AcademicProgram) { *programs = append(*programs, newProgram) } -// Generates a list of all possible HTML endpoints for a degree from the HTML Page. -// Each endpoint corresponds to a specific school, combining it with common CSS selectors used in the document structure. +// Generates a list of all possible HTML endpoints for a degree from the HTML Page +// Each endpoint corresponds to a specific school, combining it with common CSS selectors used in the document structure func generateAllCombinations() []string { - // List of schools for which we need to generate combination selectors. + // List of schools for which we need to generate combination selectors schools := []string{"bass", "jindal", "nsm", "ecs", "bbs", "epps"} var combinations []string - // Loop through each school and generate the corresponding HTML selector. + // Loop through each school and generate the corresponding HTML selector for _, s := range schools { combinations = append(combinations, fmt.Sprintf("div .element-item.all.alldegrees.allschools.academic.%s", s)) }