-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtranscript-parser.js
More file actions
138 lines (119 loc) · 4.46 KB
/
transcript-parser.js
File metadata and controls
138 lines (119 loc) · 4.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
/**
 * PDF/TXT Parser Module
 * Extracts text from PDF or TXT transcript files and splits it into
 * speaker segments.
 *
 * PDF parsing relies on a global `pdfjsLib` object (pdf.js) being loaded
 * before `extractText` is called on a non-TXT file.
 */
class PDFParser {
  /**
   * Extract text from a PDF or TXT file.
   *
   * Files whose name ends in ".txt" (case-insensitive) are read as plain
   * text and treated as a single page; every other file is handed to pdf.js.
   * This method never rejects: failures are reported in the result object.
   *
   * @param {File} file - The PDF or TXT file
   * @returns {Promise<Object>} - On success
   *   `{ fullText, pages, segments, success: true }`; on failure
   *   `{ success: false, error }` where `error` is a message string
   */
  async extractText(file) {
    try {
      const isTxt = file.name.toLowerCase().endsWith('.txt');
      const pages = [];
      let fullText = '';

      if (isTxt) {
        // Plain text: the entire file is one "page".
        fullText = await file.text();
        pages.push(fullText);
        console.log('Extracted text from TXT file');
      } else {
        const arrayBuffer = await file.arrayBuffer();
        const loadingTask = pdfjsLib.getDocument({ data: arrayBuffer });
        try {
          const pdf = await loadingTask.promise;

          // pdf.js pages are 1-based.
          for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            const page = await pdf.getPage(pageNumber);
            const textContent = await page.getTextContent();

            // Join the text items with spaces, then collapse whitespace runs.
            const pageText = textContent.items
              .map((item) => item.str)
              .join(' ')
              .replace(/\s+/g, ' ');

            pages.push(pageText);
            fullText += pageText + '\n';
          }
        } finally {
          // The document is not returned to the caller, so release it (and
          // its worker) once the text has been read. A cleanup failure must
          // not turn a successful extraction into an error.
          if (typeof loadingTask.destroy === 'function') {
            try {
              await loadingTask.destroy();
            } catch (cleanupError) {
              console.warn('Failed to release PDF resources:', cleanupError);
            }
          }
        }
        console.log('Extracted text from PDF file');
      }

      // Speaker parsing is identical for both file types.
      const segments = this.parseSpeakerSegments(fullText);

      return {
        fullText: fullText.trim(),
        pages,
        segments,
        success: true
      };
    } catch (error) {
      console.error('Error extracting text:', error);
      return {
        success: false,
        // Non-Error throwables have no `.message`; fall back to their string form.
        error: error && error.message ? error.message : String(error)
      };
    }
  }

  /**
   * Parse text into speaker segments.
   *
   * A speaker label is "Q" followed by digits (e.g. "Q1:") or a word of two
   * or more capital letters (e.g. "FUCHS:"), followed by a colon. Mixed-case
   * labels such as "Name:" are NOT recognised. Each segment runs from the end
   * of one label to the start of the next; text before the first label is
   * dropped, as are turns that are empty after trimming.
   *
   * @param {string} text - The full transcript text
   * @returns {Array} - Array of `{ speaker, text }` objects in document order
   */
  parseSpeakerSegments(text) {
    const speakerRegex = /\b(Q\d+|FUCHS|[A-Z][A-Z]+)\s*:\s*/g;
    const segments = [];
    let currentSpeaker = null;
    let turnStart = 0;

    // Close the turn in progress at `turnEnd` (end of text when omitted).
    const pushTurn = (turnEnd) => {
      if (currentSpeaker === null) {
        return;
      }
      const turnText = text.substring(turnStart, turnEnd).trim();
      if (turnText) {
        segments.push({
          speaker: currentSpeaker,
          text: turnText
        });
      }
    };

    let match;
    while ((match = speakerRegex.exec(text)) !== null) {
      pushTurn(match.index);
      currentSpeaker = match[1];
      // With the /g flag, lastIndex is the position just past the label.
      turnStart = speakerRegex.lastIndex;
    }

    // The last speaker's turn extends to the end of the text.
    pushTurn();

    // If no speaker segments found, return empty (front matter only)
    if (segments.length === 0) {
      console.warn('No speaker segments found in PDF');
    }

    return segments;
  }

  /**
   * Clean and normalize text for better matching.
   *
   * Collapses whitespace runs to a single space, converts typographic
   * double quotes (U+201C, U+201D) to `"` and typographic single quotes
   * (U+2018, U+2019) to `'`, then trims both ends.
   *
   * @param {string} text - Text to clean
   * @returns {string} - Cleaned text
   */
  cleanText(text) {
    return text
      .replace(/\s+/g, ' ') // Normalize whitespace
      .replace(/[\u201C\u201D]/g, '"') // Normalize curly double quotes
      .replace(/[\u2018\u2019]/g, "'") // Normalize curly apostrophes/single quotes
      .trim();
  }
}
// Export for use in other modules.
// A top-level `class` declaration does not become a property of `window`,
// so it is attached explicitly. The guard keeps the script loadable where
// no `window` exists (e.g. Node-based tests) instead of throwing a
// ReferenceError.
if (typeof window !== 'undefined') {
  window.PDFParser = PDFParser;
}