Skip to content

Commit deee9f3

Browse files
fix: provide options for different website
1 parent 71dc27a commit deee9f3

1 file changed

Lines changed: 106 additions & 35 deletions

File tree

apps/spider/crawlers/orc.py

Lines changed: 106 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -82,20 +82,53 @@ def get_all_courses(self):
8282
Returns:
8383
list: Course data with prerequisites, descriptions, and instructors
8484
"""
85-
self._ensure_initialized() # Make sure crawler is initialized
86-
87-
# Get data from course selection APIs
88-
courses_data = self._get_lesson_tasks()
89-
course_details = self._get_course_catalog()
90-
prerequisites = self._get_prerequisites()
85+
# Ask user which data sources to use
86+
use_coursesel = self._ask_user_choice(
87+
"Crawl course selection system data? (y/n): ", default="y"
88+
)
89+
use_official = self._ask_user_choice(
90+
"Crawl official website data? (y/n): ", default="y"
91+
)
9192

92-
# Get official website data for enhanced descriptions
93-
official_data = self._get_official_website_data()
93+
courses_data = []
94+
course_details = {}
95+
prerequisites = {}
96+
official_data = {}
97+
98+
if use_coursesel:
99+
self._ensure_initialized() # Make sure crawler is initialized
100+
print("Crawling course selection system data...")
101+
# Get data from course selection APIs
102+
courses_data = self._get_lesson_tasks()
103+
course_details = self._get_course_catalog()
104+
prerequisites = self._get_prerequisites()
105+
else:
106+
print("Skipping course selection system data")
107+
108+
if use_official:
109+
print("Crawling official website data...")
110+
# Get official website data for enhanced descriptions
111+
official_data = self._get_official_website_data()
112+
else:
113+
print("Skipping official website data")
94114

95115
return self._integrate_course_data(
96116
courses_data, course_details, prerequisites, official_data
97117
)
98118

119+
def _ask_user_choice(self, prompt, default="y"):
    """Ask the user a yes/no question on stdin and return the answer.

    The prompt is repeated until a recognised answer is given. An empty
    answer (just pressing Enter) selects ``default``. Matching is
    case-insensitive and ignores surrounding whitespace.

    Args:
        prompt: Text shown to the user by ``input()``.
        default: Answer used when the user enters nothing, or when stdin
            is closed/exhausted. Expected to be one of the accepted
            yes/no spellings (e.g. ``"y"`` or ``"n"``).

    Returns:
        bool: True for "y"/"yes"/"true", False for "n"/"no"/"false".

    Raises:
        EOFError: If stdin hits end-of-file and ``default`` is not a
            recognised yes/no value, so no answer can be derived.
    """
    yes_answers = {"y", "yes", "true"}
    no_answers = {"n", "no", "false"}
    # Normalise once so the default is matched the same way as typed input.
    fallback = str(default).strip().lower()

    while True:
        try:
            response = input(prompt).strip().lower()
        except EOFError:
            # stdin is closed or exhausted (cron job, CI, piped input).
            # Re-prompting can never succeed, so use the default instead of
            # crashing the crawl; only give up if the default is unusable.
            if fallback in yes_answers:
                return True
            if fallback in no_answers:
                return False
            raise

        if not response:
            response = fallback

        if response in yes_answers:
            return True
        if response in no_answers:
            return False
        print("Please enter y/yes or n/no")
131+
99132
def _get_current_elect_turn_id(self):
100133
"""Get current election turn ID dynamically"""
101134
url = f"{BASE_URL}/tpm/findStudentElectTurns_ElectTurn.action"
@@ -330,40 +363,62 @@ def _integrate_course_data(
330363
f"Starting integration with {len(courses_data)} courses, {len(prerequisites)} prereq groups, {len(official_data)} official records"
331364
)
332365

333-
courses_by_code = defaultdict(list)
334-
for course in courses_data:
335-
course_code = course.get("courseCode")
336-
if course_code:
337-
courses_by_code[course_code].append(course)
338-
339366
integrated_courses = []
340367
courses_with_prereqs = 0
341368

342-
for course_code, course_list in courses_by_code.items():
343-
merged = self._merge_course_sections(course_list)
344-
if not merged:
345-
continue
346-
347-
course_id = merged.get("courseId")
348-
catalog_info = course_details.get(course_id, {})
349-
prereq_info = prerequisites.get(course_id, [])
350-
official_info = official_data.get(course_code, {})
369+
# If we have course selection data, process it
370+
if courses_data:
371+
courses_by_code = defaultdict(list)
372+
for course in courses_data:
373+
course_code = course.get("courseCode")
374+
if course_code:
375+
courses_by_code[course_code].append(course)
376+
377+
for course_code, course_list in courses_by_code.items():
378+
merged = self._merge_course_sections(course_list)
379+
if not merged:
380+
continue
381+
382+
course_id = merged.get("courseId")
383+
catalog_info = course_details.get(course_id, {})
384+
prereq_info = prerequisites.get(course_id, [])
385+
official_info = official_data.get(course_code, {})
386+
387+
if prereq_info:
388+
courses_with_prereqs += 1
389+
logger.debug(
390+
f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs"
391+
)
351392

352-
if prereq_info:
353-
courses_with_prereqs += 1
354-
logger.debug(
355-
f"Course {course_code} (ID: {course_id}) has {len(prereq_info)} prereqs"
393+
course_data = self._build_course_record(
394+
course_code, merged, catalog_info, prereq_info, official_info
356395
)
357396

358-
course_data = self._build_course_record(
359-
course_code, merged, catalog_info, prereq_info, official_info
360-
)
397+
if course_data:
398+
integrated_courses.append(course_data)
399+
400+
# If we only have official data (no course selection data), create courses from official data
401+
elif official_data:
402+
logger.info("Creating courses from official website data only")
403+
for course_code, official_info in official_data.items():
404+
# Create empty main_data for courses that only exist in official website
405+
empty_main_data = {}
406+
empty_catalog_data = {}
407+
empty_prereq_data = []
408+
409+
course_data = self._build_course_record(
410+
course_code,
411+
empty_main_data,
412+
empty_catalog_data,
413+
empty_prereq_data,
414+
official_info,
415+
)
361416

362-
if course_data:
363-
integrated_courses.append(course_data)
417+
if course_data:
418+
integrated_courses.append(course_data)
364419

365420
logger.info(
366-
f"Integration complete: {courses_with_prereqs} courses have prerequisites"
421+
f"Integration complete: {courses_with_prereqs} courses have prerequisites, {len(integrated_courses)} total courses"
367422
)
368423
return integrated_courses
369424

@@ -428,6 +483,10 @@ def _extract_course_title(self, main_data, catalog_data, official_data=None):
428483
"""Extract course title (prefer English name)"""
429484
if official_data is None:
430485
official_data = {}
486+
if main_data is None:
487+
main_data = {}
488+
if catalog_data is None:
489+
catalog_data = {}
431490

432491
return (
433492
official_data.get("course_title", "")
@@ -442,8 +501,8 @@ def _parse_course_code(self, course_code):
442501
number = 0
443502

444503
if course_code:
445-
# Match DEPT####J? (J is optional)
446-
match = re.match(r"^([A-Z]{2,4})(\d{4})J?$", course_code)
504+
# Match DEPT###(#)?J? (3 or 4 digits, J is optional)
505+
match = re.match(r"^([A-Z]{2,4})(\d{3,4})J?$", course_code)
447506
if match:
448507
department = match.group(1)
449508
number = int(match.group(2))
@@ -452,6 +511,11 @@ def _parse_course_code(self, course_code):
452511

453512
def _extract_course_credits(self, main_data, catalog_data):
454513
"""Extract course credits"""
514+
if main_data is None:
515+
main_data = {}
516+
if catalog_data is None:
517+
catalog_data = {}
518+
455519
course_credits = main_data.get("totalCredit", 0) or catalog_data.get(
456520
"credit", 0
457521
)
@@ -520,6 +584,11 @@ def _extract_description(self, official_data=None):
520584

521585
def _extract_instructors(self, main_data, catalog_data):
522586
"""Extract and merge instructor information"""
587+
if main_data is None:
588+
main_data = {}
589+
if catalog_data is None:
590+
catalog_data = {}
591+
523592
instructors = main_data.get("all_instructors", [])
524593
teacher_name = catalog_data.get("teacherName", "")
525594

@@ -532,6 +601,8 @@ def _extract_instructors(self, main_data, catalog_data):
532601

533602
def _build_course_url(self, main_data):
534603
"""Build course detail page URL"""
604+
if main_data is None:
605+
main_data = {}
535606
course_id = main_data.get("courseId")
536607
return f"{COURSE_DETAIL_URL_PREFIX}{course_id}" if course_id else ""
537608

0 commit comments

Comments
 (0)