Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions bgp/modules/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,30 @@ def __init__(self):
super().__init__(['copyright', '©'], extractor=self.extractor, match_limit=1)


class TocPageDetectorModule(KeywordPageDetectorModule):
    """Detect the page that carries a "table of contents" heading.

    A page matches when one of its first ``HEADING_LINE_LIMIT`` lines,
    lower-cased and whitespace-normalised, is exactly equal to one of the
    configured keywords. The module is constructed with ``match_limit=1``.
    """

    # How many leading LINE elements of a page are inspected for the heading.
    HEADING_LINE_LIMIT = 5

    def __init__(self):
        super().__init__(["table of contents"], match_limit=1)

    def detectTocHeading(self, page):
        """Return True if one of the leading lines of *page* is a keyword.

        NOTE(review): assumes *page* is an ElementTree-style element whose
        LINE/WORD descendants expose their text via ``.text`` -- confirm
        against the caller.
        """
        for index, line in enumerate(page.iter('LINE')):
            if index >= self.HEADING_LINE_LIMIT:
                # Only the top of the page is relevant; stop scanning early
                # instead of walking every remaining line.
                break
            # Join the *text* of each WORD element, not the elements
            # themselves (str.join raises TypeError on non-string items).
            # Words without text are skipped.
            tokens = ((word.text or "").strip() for word in line.iter('WORD'))
            heading = " ".join(token for token in tokens if token).lower()
            if any(keyword == heading for keyword in self.keywords):
                return True
        return False

    def run(self, page, node):
        """Record *page* in ``self.matched_pages`` if it has a TOC heading.

        Nothing is recorded once ``match_limit`` matches have been collected.
        *node* is accepted for interface compatibility and is not used here.
        """
        if self.match_limit and len(self.matched_pages) >= self.match_limit:
            return
        if not self.detectTocHeading(page):
            return
        # presumably the first child's 'value' attribute holds the page's
        # ".djvu" file name; the four characters before the extension are
        # stored as the page identifier -- verify against the input format.
        param = page[0].attrib['value'].split('.djvu')[0]
        self.matched_pages.append({'page': param[-4:]})

class BackpageIsbnExtractorModule():

def __init__(self):
Expand Down
17 changes: 17 additions & 0 deletions mysequencer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from bgp import ia
from bgp import Sequencer
from bgp.modules.terms import TocPageDetectorModule, PageTypeProcessor, CopyrightPageDetectorModule


# Detector modules handed to the page-type processor, keyed by name.
_page_type_modules = {"toc_page": TocPageDetectorModule()}

# Sequencer with a single "pagetypes" processor built from the modules above.
PageTypeDetectionSequencer = Sequencer(
    {"pagetypes": PageTypeProcessor(modules=_page_type_modules)}
)

# Fetch the item, run the sequencer over it and print the results.
book = ia.get_item("9780262517638OpenAccess")
sequenced = PageTypeDetectionSequencer.sequence(book)
results = sequenced.results
print(results)
Comment on lines +1 to +17
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is okay for testing, but we'll want to delete this file before we move out of draft.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah absolutely. I made this file for my convenience during development. When we decide to merge, I will make a commit to delete this file.