Skip to content

Commit a6ccd34

Browse files
committed
Implement sentence level spell checking
1 parent 43cbb61 commit a6ccd34

15 files changed

Lines changed: 892 additions & 37 deletions

File tree

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
package com.darkrockstudios.texteditor.state
2+
3+
import com.darkrockstudios.texteditor.TextEditorRange
4+
5+
/**
 * A single sentence located inside the editor text.
 *
 * @property text The textual content of the sentence.
 * @property range Where the sentence sits in the document.
 */
data class SentenceSegment(
    val text: String,
    val range: TextEditorRange,
) {
    /** Renders the sentence followed by its start and end positions, e.g. `text (start-end)`. */
    override fun toString(): String = buildString {
        append(text)
        append(" (")
        append(range.start)
        append('-')
        append(range.end)
        append(')')
    }
}
Lines changed: 303 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,303 @@
1+
package com.darkrockstudios.texteditor.state
2+
3+
import com.darkrockstudios.texteditor.CharLineOffset
4+
import com.darkrockstudios.texteditor.TextEditorRange
5+
6+
/**
 * Lazily segments the entire document into sentences.
 *
 * Candidate boundaries are the characters accepted by `isSentenceEndingPunctuation`;
 * each candidate is confirmed by `isTrueSentenceEnd`, which filters out abbreviations
 * (for example "Mr.", "Dr.", "Prof.", "Inc.", "M.", "Mme.", "usw.", "bzw.") and
 * ellipses that do not start a new sentence.
 *
 * For every yielded [SentenceSegment]:
 * - `text` is the trimmed sentence; lines of a multi-line sentence are joined with `'\n'`.
 * - `range.start` is the position of the first non-whitespace character of the sentence.
 * - `range.end` is the position just after the terminating punctuation, or the end of the
 *   last line for trailing text that has no terminating punctuation.
 *
 * The lines are copied once up front, so the sequence iterates over a snapshot.
 *
 * @return a lazy sequence of sentences in document order; empty when there are no lines.
 */
fun TextEditorState.sentenceSegments(): Sequence<SentenceSegment> = sequence {
    val linesSnapshot = textLines.toList()
    if (linesSnapshot.isEmpty()) return@sequence

    var sentenceStartLine = 0
    var sentenceStartChar = 0
    val sentenceBuilder = StringBuilder()

    for ((lineIndex, line) in linesSnapshot.withIndex()) {
        val text = line.text
        var charIndex = 0

        // No sentence in progress: skip leading whitespace so that range.start points at the
        // first character of the (trimmed) sentence text, exactly as the same-line case below does.
        if (sentenceBuilder.isEmpty()) {
            while (charIndex < text.length && text[charIndex].isWhitespace()) {
                charIndex++
            }
            if (charIndex < text.length) {
                sentenceStartLine = lineIndex
                sentenceStartChar = charIndex
            }
        }

        while (charIndex < text.length) {
            val char = text[charIndex]
            sentenceBuilder.append(char)

            if (isSentenceEndingPunctuation(char)) {
                val accumulated = sentenceBuilder.toString()
                if (isTrueSentenceEnd(text, charIndex, accumulated, linesSnapshot, lineIndex)) {
                    // Found a sentence end
                    val sentenceText = accumulated.trim()
                    if (sentenceText.isNotEmpty()) {
                        yield(
                            SentenceSegment(
                                text = sentenceText,
                                range = TextEditorRange(
                                    start = CharLineOffset(sentenceStartLine, sentenceStartChar),
                                    end = CharLineOffset(lineIndex, charIndex + 1)
                                )
                            )
                        )
                    }

                    sentenceBuilder.clear()

                    // Skip trailing whitespace to find next sentence start
                    charIndex++
                    while (charIndex < text.length && text[charIndex].isWhitespace()) {
                        charIndex++
                    }

                    if (charIndex < text.length) {
                        sentenceStartLine = lineIndex
                        sentenceStartChar = charIndex
                    } else {
                        // Provisional start on the next line; refined at the top of the
                        // line loop once the first non-whitespace character is found.
                        sentenceStartLine = lineIndex + 1
                        sentenceStartChar = 0
                    }
                    continue
                }
            }
            charIndex++
        }

        // Keep the line break inside multi-line sentences (preserves spacing)
        if (lineIndex < linesSnapshot.lastIndex && sentenceBuilder.isNotEmpty()) {
            sentenceBuilder.append('\n')
        }
    }

    // Yield any remaining text as a final sentence
    val remainingText = sentenceBuilder.toString().trim()
    if (remainingText.isNotEmpty()) {
        val lastLine = linesSnapshot.lastIndex
        val lastLineLength = linesSnapshot[lastLine].text.length
        yield(
            SentenceSegment(
                text = remainingText,
                range = TextEditorRange(
                    start = CharLineOffset(sentenceStartLine, sentenceStartChar),
                    end = CharLineOffset(lastLine, lastLineLength)
                )
            )
        )
    }
}
103+
104+
/**
105+
* Find all sentences that intersect with the given range.
106+
*/
107+
fun TextEditorState.sentenceSegmentsInRange(range: TextEditorRange): List<SentenceSegment> {
108+
return sentenceSegments()
109+
.filter { it.range.intersects(range) }
110+
.toList()
111+
}
112+
113+
/**
114+
* Find the sentence containing the given position.
115+
*/
116+
fun TextEditorState.findSentenceSegmentAt(position: CharLineOffset): SentenceSegment? {
117+
return sentenceSegments().find { segment ->
118+
position >= segment.range.start && position <= segment.range.end
119+
}
120+
}
121+
122+
/**
 * Returns `true` when [char] is a character that may terminate a sentence:
 * a period, a question mark, an exclamation mark, or the single-character
 * ellipsis (U+2026).
 */
private fun isSentenceEndingPunctuation(char: Char): Boolean = when (char) {
    // The ellipsis is written as an escape so it survives encoding round-trips.
    '.', '?', '!', '\u2026' -> true
    else -> false
}
125+
126+
/**
127+
* Determines if a punctuation mark is a true sentence end,
128+
* handling abbreviations like "U.S.A.", "Mr.", "Dr.", etc.
129+
*/
130+
private fun isTrueSentenceEnd(
131+
currentLineText: String,
132+
position: Int,
133+
accumulatedSentence: String,
134+
allLines: List<androidx.compose.ui.text.AnnotatedString>,
135+
currentLineIndex: Int
136+
): Boolean {
137+
val char = currentLineText[position]
138+
139+
// Question marks and exclamation marks are always sentence ends
140+
if (char == '?' || char == '!') {
141+
return true
142+
}
143+
144+
// Ellipsis character is a sentence end if followed by whitespace + capital
145+
if (char == '') {
146+
val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
147+
return nextChar == null || nextChar.isUpperCase()
148+
}
149+
150+
// For periods, check for abbreviations
151+
if (char == '.') {
152+
// Check for ellipsis pattern (...)
153+
if (isEllipsis(currentLineText, position)) {
154+
val nextChar = getNextNonWhitespaceChar(currentLineText, position + 2, allLines, currentLineIndex)
155+
return nextChar == null || nextChar.isUpperCase()
156+
}
157+
158+
// Check for single-letter abbreviations (U.S.A.)
159+
if (isSingleLetterAbbreviation(currentLineText, position)) {
160+
return false
161+
}
162+
163+
// Check common abbreviations
164+
val wordBeforePeriod = extractWordBeforePeriod(accumulatedSentence)
165+
if (isCommonAbbreviation(wordBeforePeriod)) {
166+
return false
167+
}
168+
169+
// Check for number followed by period (ordinals in some languages)
170+
if (position > 0 && currentLineText[position - 1].isDigit()) {
171+
val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
172+
// If followed by lowercase, probably not sentence end
173+
if (nextChar?.isLowerCase() == true) {
174+
return false
175+
}
176+
}
177+
178+
// Check what follows the period
179+
val nextChar = getNextNonWhitespaceChar(currentLineText, position, allLines, currentLineIndex)
180+
181+
// If followed by nothing or uppercase letter, it's a sentence end
182+
// If followed by lowercase letter, likely an abbreviation
183+
return nextChar == null || nextChar.isUpperCase() || nextChar.isDigit() ||
184+
nextChar == '"' || nextChar == '\'' || nextChar == ')' || nextChar == ']' ||
185+
nextChar == '¿' || nextChar == '¡'
186+
}
187+
188+
return false
189+
}
190+
191+
/**
192+
* Gets the next non-whitespace character after the given position,
193+
* potentially looking into subsequent lines.
194+
*/
195+
private fun getNextNonWhitespaceChar(
196+
currentLineText: String,
197+
position: Int,
198+
allLines: List<androidx.compose.ui.text.AnnotatedString>,
199+
currentLineIndex: Int
200+
): Char? {
201+
// Check rest of current line
202+
for (i in (position + 1) until currentLineText.length) {
203+
val c = currentLineText[i]
204+
if (!c.isWhitespace()) return c
205+
}
206+
207+
// Check subsequent lines
208+
for (lineIdx in (currentLineIndex + 1) until allLines.size) {
209+
val lineText = allLines[lineIdx].text
210+
for (c in lineText) {
211+
if (!c.isWhitespace()) return c
212+
}
213+
}
214+
215+
return null
216+
}
217+
218+
/**
219+
* Checks if the period at the given position is part of an ellipsis (...)
220+
*/
221+
private fun isEllipsis(text: String, position: Int): Boolean {
222+
if (position < 2) return false
223+
return text.getOrNull(position - 1) == '.' && text.getOrNull(position - 2) == '.'
224+
}
225+
226+
/**
227+
* Checks if this is a single-letter abbreviation pattern like "U.S.A."
228+
*/
229+
private fun isSingleLetterAbbreviation(text: String, position: Int): Boolean {
230+
// Pattern: single letter before period
231+
if (position >= 1) {
232+
val prev = text[position - 1]
233+
// Check if it's a single uppercase letter preceded by start, whitespace, or another period
234+
if (prev.isUpperCase()) {
235+
val prevPrev = text.getOrNull(position - 2)
236+
if (prevPrev == null || prevPrev.isWhitespace() || prevPrev == '.' || prevPrev == '(') {
237+
// Check if followed by another letter (continuation of abbreviation)
238+
val next = text.getOrNull(position + 1)
239+
if (next?.isUpperCase() == true) {
240+
return true
241+
}
242+
// Check if this is the end of a multi-part abbreviation (e.g., "U.S.A." at end)
243+
if (prevPrev == '.' && position >= 3) {
244+
val thirdBack = text.getOrNull(position - 3)
245+
if (thirdBack?.isUpperCase() == true) {
246+
return true // Part of abbreviation like "U.S.A."
247+
}
248+
}
249+
}
250+
}
251+
}
252+
return false
253+
}
254+
255+
/**
 * Returns the last word of [text] after trailing periods, spaces, newlines and tabs
 * have been stripped. A word is delimited by a space, newline or tab; when no such
 * delimiter exists, the whole stripped text is returned.
 */
private fun extractWordBeforePeriod(text: String): String =
    text
        .trimEnd('.', ' ', '\n', '\t')
        .takeLastWhile { c -> c != ' ' && c != '\n' && c != '\t' }
264+
265+
// Common abbreviations for Latin scripts, stored without their trailing period.
// Lookups are done on the exact word and on its lowercase form.
private val COMMON_ABBREVIATIONS = setOf(
    // English
    "Mr", "Mrs", "Ms", "Dr", "Prof", "Sr", "Jr",
    "vs", "etc", "al", "approx", "dept", "est", "govt", "misc",
    // English, Latin-derived (some with periods embedded)
    "e.g", "i.e", "cf", "viz",
    // Months
    "Jan", "Feb", "Mar", "Apr", "Jun", "Jul", "Aug", "Sep", "Sept", "Oct", "Nov", "Dec",
    // Days
    "Mon", "Tue", "Tues", "Wed", "Thu", "Thur", "Thurs", "Fri", "Sat", "Sun",
    // Business
    "Inc", "Ltd", "Corp", "Co", "LLC", "Ave", "Blvd", "St", "Rd",
    // French
    "M", "Mme", "Mlle", "Cie",
    // German
    "Nr", "Str",
    // Spanish ("Sra" added: it is listed as supported in the segmenter's documentation)
    "Ud", "Uds", "Sra", "Srta",
    // Academic/Professional
    "Ph", "vol", "no", "pp", "ed", "eds", "rev", "trans",
    // Military/Government
    "Gen", "Col", "Maj", "Capt", "Lt", "Sgt", "Gov", "Sen", "Rep",
    // Other common
    "tel", "fax", "ext", "ref", "max", "min", "avg",
)
291+
292+
// Abbreviations that include periods (need special handling), stored without the
// trailing period. A few undotted German entries ("usw", "bzw") also live here.
private val DOTTED_ABBREVIATIONS = setOf(
    // Latin-derived
    "e.g",
    "i.e",
    // German
    "z.B",
    "usw",
    "bzw",
    "u.a",
    "d.h",
    "v.a",
)
296+
297+
/**
 * Checks whether [word] is a known abbreviation.
 *
 * Trailing periods are removed first; the result and its lowercase form are then
 * looked up in both abbreviation sets.
 */
private fun isCommonAbbreviation(word: String): Boolean {
    val bare = word.trimEnd('.')
    return sequenceOf(bare, bare.lowercase()).any { candidate ->
        candidate in COMMON_ABBREVIATIONS || candidate in DOTTED_ABBREVIATIONS
    }
}

ComposeTextEditorSpellCheck/src/androidMain/kotlin/com/darkrockstudios/texteditor/spellcheck/SpellCheckTextContextMenuProvider.android.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ actual fun SpellCheckTextContextMenuProvider(
1616

1717
spellCheckMenuState.missSpelling.value?.apply {
1818
SpellCheckDropdown(
19-
wordSegment,
19+
item,
2020
menuPosition,
2121
spellCheckMenuState.spellCheckState,
2222
dismiss = spellCheckMenuState::clearSpellCheck,

0 commit comments

Comments
 (0)