Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 38 additions & 118 deletions stationapi/src/domain/ipa.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,67 +344,36 @@ fn apply_phonological_rules(phonemes: &[Phoneme]) -> String {
}
}

// Apply long vowel contractions: オウ → oː pattern
apply_vowel_length(&output)
insert_syllable_breaks(&output)
}

/// Return the IPA text of the first `Phoneme::Regular` entry in `phonemes`.
///
/// Entries of any other variant are skipped. Yields `None` when the slice
/// contains no `Regular` phoneme (including when it is empty).
fn find_next_regular(phonemes: &[Phoneme]) -> Option<&'static str> {
    for phoneme in phonemes {
        // Only `Regular` phonemes carry an IPA string; keep scanning past the rest.
        if let Phoneme::Regular(ipa) = phoneme {
            return Some(*ipa);
        }
    }
    None
}

/// Apply vowel length rules for common Japanese patterns.
/// オウ → oː (after consonant+o), ョウ/ョオ patterns are handled by digraph + this.
fn apply_vowel_length(input: &str) -> String {
/// Insert IPA syllable boundary markers (`.`) between consecutive vowels.
/// This prevents Google TTS from interpreting cross-mora vowel sequences
/// (e.g. `ei` in セイ) as English diphthongs (e.g. /eɪ/ → "ai").
fn insert_syllable_breaks(input: &str) -> String {
let mut result = String::with_capacity(input.len());
let chars: Vec<char> = input.chars().collect();
let len = chars.len();
let mut i = 0;
let mut prev_is_vowel = false;

while i < len {
if i + 1 < len && chars[i] == 'o' && chars[i + 1] == 'ɯ' {
// oɯ → oː (おう/こう pattern)
result.push('o');
result.push('ː');
i += 2;
// Skip a following long-vowel mark to avoid duplicate 'ː'
if i < len && (chars[i] == 'ː' || chars[i] == 'ー') {
i += 1;
}
continue;
}
if i + 1 < len && chars[i] == 'o' && chars[i + 1] == 'o' {
// oo → oː (おお pattern)
result.push('o');
result.push('ː');
i += 2;
// Skip a following long-vowel mark to avoid duplicate 'ː'
if i < len && (chars[i] == 'ː' || chars[i] == 'ー') {
i += 1;
}
continue;
}
if i + 1 < len && chars[i] == 'e' && chars[i + 1] == 'i' {
// ei → eː (えい/けい pattern — 京成 keisei → keːseː)
result.push('e');
result.push('ː');
i += 2;
// Skip a following long-vowel mark to avoid duplicate 'ː'
if i < len && (chars[i] == 'ː' || chars[i] == 'ー') {
i += 1;
}
continue;
for c in input.chars() {
let is_vowel = "aiɯeou".contains(c);
if is_vowel && prev_is_vowel {
result.push('.');
}
result.push(chars[i]);
i += 1;
result.push(c);
prev_is_vowel = is_vowel;
}

result
}

/// Return the IPA text of the first `Phoneme::Regular` entry in `phonemes`.
///
/// Entries of any other variant are skipped. Yields `None` when the slice
/// contains no `Regular` phoneme (including when it is empty).
fn find_next_regular(phonemes: &[Phoneme]) -> Option<&'static str> {
    for phoneme in phonemes {
        // Only `Regular` phonemes carry an IPA string; keep scanning past the rest.
        if let Phoneme::Regular(ipa) = phoneme {
            return Some(*ipa);
        }
    }
    None
}

#[cfg(test)]
mod tests {
use super::*;
Expand All @@ -428,7 +397,7 @@ mod tests {

#[test]
fn test_ueno() {
assert_eq!(ipa("ウエノ"), "ɯeno");
assert_eq!(ipa("ウエノ"), "ɯ.eno");
}

#[test]
Expand All @@ -444,14 +413,12 @@ mod tests {

#[test]
fn test_osaka() {
// オオ → oː
assert_eq!(ipa("オオサカ"), "oːsaka");
assert_eq!(ipa("オオサカ"), "o.osaka");
}

#[test]
fn test_kyoto() {
// キョウ → kʲoː (via kʲo + ウ → oɯ → oː)
assert_eq!(ipa("キョウト"), "kʲoːto");
assert_eq!(ipa("キョウト"), "kʲo.ɯto");
}

#[test]
Expand All @@ -476,8 +443,7 @@ mod tests {

#[test]
fn test_ryogoku() {
// リョウ → ɾʲoː (via ɾʲo + ウ → oɯ → oː)
assert_eq!(ipa("リョウゴク"), "ɾʲoːgokɯ");
assert_eq!(ipa("リョウゴク"), "ɾʲo.ɯgokɯ");
}

#[test]
Expand All @@ -488,41 +454,23 @@ mod tests {

#[test]
fn test_keisei() {
assert_eq!(ipa("ケイセイ"), "keːseː");
assert_eq!(ipa("ケイセイ"), "ke.ise.i");
}

#[test]
fn test_oshiage() {
assert_eq!(ipa("オシアゲ"), "oɕiage");
assert_eq!(ipa("オシアゲ"), "oɕi.age");
}

#[test]
fn test_meitetsu() {
// ツ is consistently t͡sɯ (affricate with tie bar)
assert_eq!(ipa("メイテツ"), "meːtet͡sɯ");
assert_eq!(ipa("メイテツ"), "me.itet͡sɯ");
}

#[test]
fn test_seibu() {
assert_eq!(ipa("セイブ"), "seːbɯ");
}

#[test]
fn test_ei_long_vowel_no_duplicate() {
// セイー should not produce "seːː"
assert_eq!(ipa("セイー"), "seː");
}

#[test]
fn test_ou_long_vowel_no_duplicate() {
// コウー should not produce "koːː"
assert_eq!(ipa("コウー"), "koː");
}

#[test]
fn test_oo_long_vowel_no_duplicate() {
// オオー should not produce "oːː"
assert_eq!(ipa("オオー"), "oː");
assert_eq!(ipa("セイブ"), "se.ibɯ");
}

#[test]
Expand All @@ -532,7 +480,7 @@ mod tests {

#[test]
fn test_fukiage() {
assert_eq!(ipa("フキアゲ"), "ɸɯkʲiage");
assert_eq!(ipa("フキアゲ"), "ɸɯkʲi.age");
}

#[test]
Expand All @@ -543,7 +491,7 @@ mod tests {
#[test]
fn test_inagekaigan() {
// ン at word end → ɴ
assert_eq!(ipa("イナゲカイガン"), "inagekaigaɴ");
assert_eq!(ipa("イナゲカイガン"), "inageka.igaɴ");
}

#[test]
Expand All @@ -553,12 +501,12 @@ mod tests {

#[test]
fn test_kire_uriwari() {
assert_eq!(ipa("キレウリワリ"), "kʲiɾeɯɾiwaɾi");
assert_eq!(ipa("キレウリワリ"), "kʲiɾe.ɯɾiwaɾi");
}

#[test]
fn test_yao() {
assert_eq!(ipa("ヤオ"), "jao");
assert_eq!(ipa("ヤオ"), "ja.o");
}

#[test]
Expand All @@ -578,32 +526,22 @@ mod tests {

#[test]
fn test_itchome() {
// ッチョウ → tt͡ɕoː
assert_eq!(ipa("イッチョウメ"), "itt͡ɕoːme");
assert_eq!(ipa("イッチョウメ"), "itt͡ɕo.ɯme");
}

#[test]
fn test_sanchome() {
assert_eq!(ipa("サンチョウメ"), "sant͡ɕoːme");
assert_eq!(ipa("サンチョウメ"), "sant͡ɕo.ɯme");
}

#[test]
fn test_koen() {
// コウエン: コ=ko, ウ→長音化でoː, エン=eɴ → koːeɴ
// Note: the original hardcoded value was "koeɴ" but phonologically "koːeɴ" is correct
assert_eq!(ipa("コウエン"), "koːeɴ");
}

#[test]
fn test_long_vowel_mark() {
// ー explicitly lengthens
assert_eq!(ipa("ラーメン"), "ɾaːmeɴ");
assert_eq!(ipa("コウエン"), "ko.ɯ.eɴ");
}

#[test]
fn test_tokyo() {
// トウキョウ: ト=to, ウ→oː, キョ=kʲo, ウ→oː
assert_eq!(ipa("トウキョウ"), "toːkʲoː");
assert_eq!(ipa("トウキョウ"), "to.ɯkʲo.ɯ");
}

#[test]
Expand All @@ -629,18 +567,6 @@ mod tests {
assert_eq!(ipa("シンヨコハマ"), "ɕiɲjokohama");
}

#[test]
fn test_geminate_ji() {
// ッジ → dʤi (voiced affricate gemination emits 'd')
assert_eq!(ipa("カッジ"), "kadʤi");
}

#[test]
fn test_geminate_ju() {
// ッジュ → ddʑɯ (voiced affricate gemination with digraph)
assert_eq!(ipa("カッジュ"), "kaddʑɯ");
}

#[test]
fn test_empty() {
assert_eq!(katakana_to_ipa(""), Some(String::new()));
Expand All @@ -652,18 +578,12 @@ mod tests {
assert_eq!(katakana_to_ipa("シブヤX"), None);
}

#[test]
fn test_geminate_palatalized() {
// ッキョ → kkʲo (only the base consonant 'k' is geminated, not 'kʲ')
assert_eq!(ipa("ニッキョウ"), "ɲikkʲoː");
}

#[test]
fn test_dokkyo_daigakumae_soka_matsubara() {
// Full-width space between words should be preserved
assert_eq!(
ipa("ドッキョウダイガクマエ ソウカマツバラ"),
"dokkʲoːdaigakɯmae soːkamat͡sɯbaɾa"
"dokkʲo.ɯda.igakɯma.e so.ɯkamat͡sɯbaɾa"
);
}

Expand All @@ -672,7 +592,7 @@ mod tests {
// Half-width (ASCII) space between words should also be accepted
assert_eq!(
ipa("ドッキョウダイガクマエ ソウカマツバラ"),
"dokkʲoːdaigakɯmae soːkamat͡sɯbaɾa"
"dokkʲo.ɯda.igakɯma.e so.ɯkamat͡sɯbaɾa"
);
}

Expand Down
12 changes: 9 additions & 3 deletions stationapi/src/use_case/dto/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,10 @@ mod tests {
line.line_name_k = "セイブイケブクロセン".to_string();
let grpc_line: GrpcLine = line.into();

assert_eq!(grpc_line.name_ipa, Some("seːbɯikebɯkɯɾo laɪn".to_string()));
assert_eq!(
grpc_line.name_ipa,
Some("se.ibɯ.ikebɯkɯɾo laɪn".to_string())
);
}

#[test]
Expand All @@ -394,7 +397,10 @@ mod tests {
line.line_name_k = "トウカイドウホンセン".to_string();
let grpc_line: GrpcLine = line.into();

assert_eq!(grpc_line.name_ipa, Some("toːkaidoː meɪn laɪn".to_string()));
assert_eq!(
grpc_line.name_ipa,
Some("to.ɯka.ido.ɯ meɪn laɪn".to_string())
);
}

#[test]
Expand All @@ -404,6 +410,6 @@ mod tests {
line.line_name_k = "トウホクシンカンセン".to_string();
let grpc_line: GrpcLine = line.into();

assert_eq!(grpc_line.name_ipa, Some("toːhokɯɕiŋkanseɴ".to_string()));
assert_eq!(grpc_line.name_ipa, Some("to.ɯhokɯɕiŋkanseɴ".to_string()));
}
}