|
| 1 | +package com.darkrockstudios.texteditor.state |
| 2 | + |
| 3 | +import com.darkrockstudios.texteditor.CharLineOffset |
| 4 | +import com.darkrockstudios.texteditor.TextEditorRange |
| 5 | + |
/**
 * Lazily segments the whole document into sentences.
 *
 * The lines of the document are snapshotted once (`textLines.toList()`) and scanned
 * character by character. Every `.`, `?`, `!` or `…` is a candidate boundary; whether
 * it really ends a sentence is decided by [isTrueSentenceEnd] (abbreviations,
 * ellipses, and what follows the punctuation).
 *
 * For every yielded [SentenceSegment]:
 * - `text` is the sentence without surrounding whitespace; a sentence that spans
 *   several lines has its lines joined with `'\n'`.
 * - `range.start` is the position of the first non-whitespace character of the sentence.
 * - `range.end` is the position just past the last non-whitespace character
 *   (for a terminated sentence, just past the terminating punctuation).
 *
 * Text after the last detected boundary is yielded as a final, unterminated sentence.
 * A document without any non-whitespace text yields nothing.
 */
fun TextEditorState.sentenceSegments(): Sequence<SentenceSegment> = sequence {
    val linesSnapshot = textLines.toList()
    if (linesSnapshot.isEmpty()) return@sequence

    val sentenceBuilder = StringBuilder()

    // Position of the first non-whitespace character of the sentence being built.
    var sentenceStartLine = 0
    var sentenceStartChar = 0

    // Exclusive end position of the last non-whitespace character appended so far.
    var contentEndLine = 0
    var contentEndChar = 0

    for ((lineIndex, line) in linesSnapshot.withIndex()) {
        val text = line.text

        for (charIndex in text.indices) {
            val char = text[charIndex]

            if (char.isWhitespace()) {
                // Whitespace before the first character of a sentence is dropped so that
                // the reported range matches the (trimmed) sentence text; whitespace
                // inside a sentence is preserved.
                if (sentenceBuilder.isNotEmpty()) sentenceBuilder.append(char)
                continue
            }

            if (sentenceBuilder.isEmpty()) {
                sentenceStartLine = lineIndex
                sentenceStartChar = charIndex
            }
            sentenceBuilder.append(char)
            contentEndLine = lineIndex
            contentEndChar = charIndex + 1

            if (!isSentenceEndingPunctuation(char)) continue

            val accumulated = sentenceBuilder.toString()
            if (isTrueSentenceEnd(text, charIndex, accumulated, linesSnapshot, lineIndex)) {
                // `accumulated` starts with a non-whitespace character and ends with the
                // punctuation mark, so it needs no trimming.
                yield(
                    SentenceSegment(
                        text = accumulated,
                        range = TextEditorRange(
                            start = CharLineOffset(sentenceStartLine, sentenceStartChar),
                            end = CharLineOffset(lineIndex, charIndex + 1)
                        )
                    )
                )
                sentenceBuilder.clear()
            }
        }

        // A sentence that continues on the next line keeps the line break in its text.
        if (lineIndex < linesSnapshot.lastIndex && sentenceBuilder.isNotEmpty()) {
            sentenceBuilder.append('\n')
        }
    }

    // Whatever is left has no terminating punctuation; report it as the last sentence.
    val remainingText = sentenceBuilder.toString().trim()
    if (remainingText.isNotEmpty()) {
        yield(
            SentenceSegment(
                text = remainingText,
                range = TextEditorRange(
                    start = CharLineOffset(sentenceStartLine, sentenceStartChar),
                    end = CharLineOffset(contentEndLine, contentEndChar)
                )
            )
        )
    }
}
| 103 | + |
/**
 * Collects every sentence of the document whose range intersects [range],
 * as decided by `TextEditorRange.intersects`, in document order.
 *
 * The whole document is segmented on each call.
 */
fun TextEditorState.sentenceSegmentsInRange(range: TextEditorRange): List<SentenceSegment> =
    sentenceSegments().filterTo(mutableListOf()) { segment -> segment.range.intersects(range) }
| 112 | + |
/**
 * Returns the first sentence whose range contains [position], or `null` when no
 * sentence does. Both ends of the range are treated as inclusive here.
 */
fun TextEditorState.findSentenceSegmentAt(position: CharLineOffset): SentenceSegment? =
    sentenceSegments().firstOrNull { candidate ->
        val bounds = candidate.range
        position >= bounds.start && position <= bounds.end
    }
| 121 | + |
/**
 * True for the characters that may terminate a sentence: `.`, `?`, `!` and the
 * single-character ellipsis `…`.
 */
private fun isSentenceEndingPunctuation(char: Char): Boolean = when (char) {
    '.', '?', '!', '…' -> true
    else -> false
}
| 125 | + |
/**
 * Decides whether the punctuation mark at [position] in [currentLineText] really
 * terminates a sentence.
 *
 * Rules, in order:
 * - `?` / `!` end a sentence, except when directly followed by another `?` / `!`
 *   (so a run such as "?!" is reported once, at its last character).
 * - `…` and the last dot of `...` end a sentence when the next non-whitespace
 *   character (possibly on a later line) is uppercase, or when there is none.
 * - A period between two digits ("3.14") is a decimal point, not a sentence end.
 * - A period belonging to a single-letter abbreviation ("U.S.A."), to a dotted
 *   abbreviation ("z.B.", "e.g.") or following a known abbreviation ("Dr.") is not
 *   a sentence end.
 * - Any other period ends a sentence when nothing follows it, or when the next
 *   non-whitespace character is uppercase, a digit, or one of `" ' ) ] ¿ ¡`.
 *
 * @param currentLineText text of the line that contains the punctuation mark
 * @param position index of the punctuation mark within [currentLineText]
 * @param accumulatedSentence sentence text collected so far, ending with the mark
 * @param allLines all lines of the document, used to look ahead past the current line
 * @param currentLineIndex index of the current line within [allLines]
 * @return `true` when the mark ends a sentence; `false` for any character other
 *   than `.`, `?`, `!`, `…`
 */
private fun isTrueSentenceEnd(
    currentLineText: String,
    position: Int,
    accumulatedSentence: String,
    allLines: List<androidx.compose.ui.text.AnnotatedString>,
    currentLineIndex: Int
): Boolean {
    val char = currentLineText[position]
    val following = currentLineText.getOrNull(position + 1)

    if (char == '?' || char == '!') {
        // Inside a run like "?!" or "!!!" only the last mark closes the sentence;
        // otherwise every extra mark would become a one-character sentence.
        return following != '?' && following != '!'
    }

    if (char == '…') {
        val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
        return nextChar == null || nextChar.isUpperCase()
    }

    if (char != '.') return false

    if (isEllipsis(currentLineText, position)) {
        // `position` is already the LAST dot of "...", and the lookup starts at
        // position + 1, so the search must begin from `position` itself.
        val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
        return nextChar == null || nextChar.isUpperCase()
    }

    // Decimal point / version number: digit '.' digit with no whitespace in between.
    if (position > 0 && currentLineText[position - 1].isDigit() && following?.isDigit() == true) {
        return false
    }

    // Single-letter abbreviations (U.S.A.)
    if (isSingleLetterAbbreviation(currentLineText, position)) return false

    // Any period of a known dotted abbreviation (the inner period of "z.B." too).
    if (isPeriodInsideDottedAbbreviation(currentLineText, position)) return false

    // Word in front of the period is a known abbreviation (Dr., Prof., etc.)
    if (isCommonAbbreviation(extractWordBeforePeriod(accumulatedSentence))) return false

    // Otherwise decide by what follows; a lowercase letter means the sentence goes on.
    val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
    return nextChar == null || nextChar.isUpperCase() || nextChar.isDigit() ||
        nextChar == '"' || nextChar == '\'' || nextChar == ')' || nextChar == ']' ||
        nextChar == '¿' || nextChar == '¡'
}

/**
 * True when the whitespace-delimited token around [position] in [lineText], stripped
 * of surrounding punctuation, contains a period and is a known abbreviation.
 */
private fun isPeriodInsideDottedAbbreviation(lineText: String, position: Int): Boolean {
    var tokenStart = position
    while (tokenStart > 0 && !lineText[tokenStart - 1].isWhitespace()) tokenStart--

    var tokenEnd = position + 1
    while (tokenEnd < lineText.length && !lineText[tokenEnd].isWhitespace()) tokenEnd++

    val token = lineText.substring(tokenStart, tokenEnd).trim { !it.isLetterOrDigit() }
    return token.contains('.') && isCommonAbbreviation(token)
}
| 190 | + |
/**
 * Finds the first non-whitespace character located strictly after [position].
 *
 * The remainder of [currentLineText] is searched first, then every line of
 * [allLines] after [currentLineIndex], in order.
 *
 * @return the character found, or `null` when only whitespace (or nothing) follows
 */
private fun getNextNonWhitespaceChar(
    currentLineText: String,
    position: Int,
    allLines: List<androidx.compose.ui.text.AnnotatedString>,
    currentLineIndex: Int
): Char? {
    // Remainder of the current line.
    var cursor = position + 1
    while (cursor < currentLineText.length) {
        val candidate = currentLineText[cursor]
        if (!candidate.isWhitespace()) return candidate
        cursor++
    }

    // Lines below the current one.
    for (lineIdx in currentLineIndex + 1..allLines.lastIndex) {
        val found = allLines[lineIdx].text.firstOrNull { !it.isWhitespace() }
        if (found != null) return found
    }

    return null
}
| 217 | + |
/**
 * True when the character at [position] is preceded by two periods, i.e. when
 * [position] is the third (or a later) dot of a "..." run.
 */
private fun isEllipsis(text: String, position: Int): Boolean =
    position >= 2 &&
        text.getOrNull(position - 1) == '.' &&
        text.getOrNull(position - 2) == '.'
| 225 | + |
/**
 * Detects a period that belongs to a single-letter abbreviation such as "U.S.A.".
 *
 * The period at [position] qualifies when the character before it is an uppercase
 * letter that itself starts a token (preceded by nothing, whitespace, `.` or `(`), and
 * either another uppercase letter follows the period, or the letter is the tail of
 * a longer dotted run (uppercase, period, uppercase, period).
 */
private fun isSingleLetterAbbreviation(text: String, position: Int): Boolean {
    if (position < 1) return false
    if (!text[position - 1].isUpperCase()) return false

    val beforeLetter = text.getOrNull(position - 2)
    val letterStandsAlone = beforeLetter == null ||
        beforeLetter.isWhitespace() ||
        beforeLetter == '.' ||
        beforeLetter == '('
    if (!letterStandsAlone) return false

    // Another capital right after the period: the abbreviation continues ("U.S").
    if (text.getOrNull(position + 1)?.isUpperCase() == true) return true

    // Last period of a multi-part abbreviation ("U.S.A." at the end).
    return beforeLetter == '.' &&
        position >= 3 &&
        text.getOrNull(position - 3)?.isUpperCase() == true
}
| 254 | + |
/**
 * Returns the last word of [text], ignoring trailing periods and whitespace.
 *
 * Any whitespace character (including line breaks and non-breaking spaces) separates
 * words, and leading characters that are neither letters nor digits — an opening
 * bracket or quote, for example — are stripped, so that "(Dr." yields "Dr".
 * Periods inside the word are kept ("e.g." yields "e.g").
 *
 * @return the word, or an empty string when [text] contains no word
 */
private fun extractWordBeforePeriod(text: String): String {
    val trimmed = text.trimEnd { it == '.' || it.isWhitespace() }
    // indexOfLast returns -1 when there is no whitespace, which makes the word start at 0.
    val wordStart = trimmed.indexOfLast { it.isWhitespace() } + 1
    return trimmed.substring(wordStart).trimStart { !it.isLetterOrDigit() }
}
| 264 | + |
// Abbreviations (written without their trailing period) after which a period does
// not end a sentence. Lookups try the word as written and its lowercase form.
private val COMMON_ABBREVIATIONS = setOf(
    // English
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
    "vs", "etc", "al", "approx", "dept", "est", "govt", "misc",
    // English with periods embedded
    "e.g", "i.e", "cf", "viz",
    // Months
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
    // Days
    "Mon", "Tue", "Tues", "Wed", "Thu", "Thur", "Thurs", "Fri", "Sat", "Sun",
    // Business
    "Inc", "Ltd", "Corp", "Co", "LLC", "Ave", "Blvd", "St", "Rd",
    // French
    "M", "Mme", "Mlle", "Cie",
    // German
    "Nr", "Str",
    // Spanish ("Sr" and "Dr" are already listed under English; "Sra" is listed in
    // the file-level documentation as a supported abbreviation)
    "Sra", "Ud", "Uds", "Srta",
    // Academic/Professional
    "Ph", "vol", "no", "pp", "ed", "eds", "rev", "trans",
    // Military/Government
    "Gen", "Col", "Maj", "Capt", "Lt", "Sgt", "Gov", "Sen", "Rep",
    // Other common
    "tel", "fax", "ext", "ref", "max", "min", "avg"
)
| 291 | + |
// Second lookup table consulted next to the common abbreviations: mostly
// abbreviations with an inner period, stored without their trailing period.
private val DOTTED_ABBREVIATIONS: Set<String> =
    // Latin / English
    setOf("e.g", "i.e") +
        // German
        setOf("z.B", "usw", "bzw", "u.a", "d.h", "v.a")
| 296 | + |
/**
 * True when [word], with any trailing periods removed, is listed in
 * COMMON_ABBREVIATIONS or DOTTED_ABBREVIATIONS — either exactly as written or in
 * its lowercase form.
 */
private fun isCommonAbbreviation(word: String): Boolean {
    val bare = word.trimEnd('.')
    val candidates = listOf(bare, bare.lowercase())
    return candidates.any { it in COMMON_ABBREVIATIONS || it in DOTTED_ABBREVIATIONS }
}
0 commit comments