Skip to content

Commit 8bb92f9

Browse files
jk and claude committed
Add Chinese (mecab-jieba) dictionary support to download_dic and lang
- download_dic("zh") downloads and compiles the mecab-jieba dictionary (584k entries, jieba word frequencies + CC-CEDICT enrichment)
- lang = "zh" now available in pos(), posParallel(), and set_dic()
- Source: https://github.com/lindera/mecab-jieba

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent cf67e46 commit 8bb92f9

7 files changed

Lines changed: 71 additions & 27 deletions

File tree

R/dic.R

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,37 @@
11
#' Download and install a MeCab dictionary
22
#'
33
#' Downloads and installs a MeCab system dictionary for the specified language.
4-
#' Japanese dictionaries are compiled from source using the built-in
4+
#' Japanese and Chinese dictionaries are compiled from source using the built-in
55
#' \code{mecab-dict-index}; Korean dictionaries are downloaded pre-compiled.
66
#' No system-level MeCab installation is required.
77
#'
88
#' Dictionaries are stored in the user data directory
99
#' (\code{tools::R_user_dir("RcppMeCab", "data")}).
1010
#'
1111
#' @param lang Character scalar. Language code: \code{"ja"} for Japanese
12-
#' (IPAdic) or \code{"ko"} for Korean (mecab-ko-dic).
12+
#' (IPAdic), \code{"ko"} for Korean (mecab-ko-dic), or \code{"zh"} for
13+
#' Chinese (mecab-jieba).
1314
#' @return Invisible path to the installed dictionary directory.
1415
#'
1516
#' @examples
1617
#' \dontrun{
1718
#' download_dic("ja")
1819
#' download_dic("ko")
20+
#' download_dic("zh")
1921
#' pos("some text", lang = "ja")
2022
#' }
2123
#'
2224
#' @export
2325
download_dic <- function(lang) {
24-
lang <- match.arg(lang, c("ja", "ko"))
26+
lang <- match.arg(lang, c("ja", "ko", "zh"))
2527
dic_dir <- file.path(tools::R_user_dir("RcppMeCab", "data"), lang)
2628

2729
if (lang == "ja") {
2830
.download_dic_ja(dic_dir)
29-
} else {
31+
} else if (lang == "ko") {
3032
.download_dic_ko(dic_dir)
33+
} else {
34+
.download_dic_zh(dic_dir)
3135
}
3236

3337
message("Dictionary installed: ", dic_dir)
@@ -65,10 +69,11 @@ list_dic <- function() {
6569
}
6670

6771
# User-installed dictionaries
68-
for (lang in c("ja", "ko")) {
72+
dic_names <- c(ja = "ipadic", ko = "mecab-ko-dic", zh = "mecab-jieba")
73+
for (lang in c("ja", "ko", "zh")) {
6974
dic_path <- file.path(user_dir, lang)
7075
if (file.exists(file.path(dic_path, "sys.dic"))) {
71-
name <- if (lang == "ja") "ipadic" else "mecab-ko-dic"
76+
name <- dic_names[[lang]]
7277
rows <- c(rows, list(data.frame(
7378
lang = lang, name = name,
7479
path = dic_path, active = (active == dic_path),
@@ -95,8 +100,9 @@ list_dic <- function() {
95100
#' \code{\link{posParallel}}. This is equivalent to calling
96101
#' \code{options(mecabSysDic = path)} but allows selection by language code.
97102
#'
98-
#' @param lang Character scalar. Language code (\code{"ja"} or \code{"ko"})
99-
#' or \code{"bundled"} to use the dictionary bundled with the package.
103+
#' @param lang Character scalar. Language code (\code{"ja"}, \code{"ko"}, or
104+
#' \code{"zh"}) or \code{"bundled"} to use the dictionary bundled with the
105+
#' package.
100106
#' @return Invisible path to the activated dictionary directory.
101107
#'
102108
#' @examples
@@ -110,7 +116,7 @@ list_dic <- function() {
110116
#'
111117
#' @export
112118
set_dic <- function(lang) {
113-
lang <- match.arg(lang, c("ja", "ko", "bundled"))
119+
lang <- match.arg(lang, c("ja", "ko", "zh", "bundled"))
114120
dic_path <- .resolve_dic(lang)
115121
options(mecabSysDic = dic_path)
116122
invisible(dic_path)
@@ -191,3 +197,38 @@ set_dic <- function(lang) {
191197

192198
message("Korean (mecab-ko-dic) dictionary installed.")
193199
}
200+
#' Download and compile Chinese mecab-jieba
#'
#' Fetches the pinned mecab-jieba source release, unpacks it into a temporary
#' directory, compiles it into \code{dic_dir} with the built-in
#' \code{mecab-dict-index} entry point, and copies the \code{dicrc}
#' configuration file next to the compiled output. Every step is checked so
#' that a failed download, extraction, compilation, or copy raises an error
#' instead of reporting a successful install.
#'
#' @param dic_dir Directory that receives the compiled dictionary. It is
#'   created (recursively) if it does not exist.
#' @return Called for its side effects (files written to \code{dic_dir} and
#'   progress messages).
#' @noRd
.download_dic_zh <- function(dic_dir) {
  src_url <- "https://github.com/lindera/mecab-jieba/archive/refs/tags/0.1.1.tar.gz"
  tmp_dir <- tempfile("mecab_zh_")
  dir.create(tmp_dir, recursive = TRUE)
  on.exit(unlink(tmp_dir, recursive = TRUE), add = TRUE)

  # download.file() honours options("timeout"), which defaults to 60 seconds;
  # raise it for the duration of this call and restore the user's value after.
  old_opts <- options(timeout = max(300, getOption("timeout", 60)))
  on.exit(options(old_opts), add = TRUE)

  tarball <- file.path(tmp_dir, "mecab-jieba.tar.gz")
  message("Downloading mecab-jieba source...")
  dl_status <- utils::download.file(src_url, tarball, mode = "wb", quiet = TRUE)
  # download.file() returns a non-zero code on failure for some methods
  # rather than signalling an error, so check it explicitly.
  if (isTRUE(dl_status != 0) || !file.exists(tarball)) {
    stop("Failed to download mecab-jieba source from ", src_url)
  }

  message("Extracting...")
  untar_status <- utils::untar(tarball, exdir = tmp_dir)
  if (isTRUE(untar_status != 0)) {
    stop("Failed to extract mecab-jieba archive (return code: ",
         untar_status, ")")
  }

  # The tarball is the only other entry in tmp_dir, so the first directory is
  # the archive root. Guard against an empty result so no NA path is built.
  extracted <- list.dirs(tmp_dir, recursive = FALSE)
  if (length(extracted) == 0) {
    stop("Could not find jieba.csv in archive")
  }
  jieba_dir <- extracted[1]
  if (!file.exists(file.path(jieba_dir, "jieba.csv"))) {
    stop("Could not find jieba.csv in archive")
  }

  # Fail before the (slow) compilation if the runtime config is missing.
  dicrc_src <- file.path(jieba_dir, "dicrc")
  if (!file.exists(dicrc_src)) {
    stop("Could not find dicrc in archive")
  }

  dir.create(dic_dir, recursive = TRUE, showWarnings = FALSE)

  message("Compiling dictionary (this may take a moment)...")
  args <- c("mecab-dict-index",
            "-d", normalizePath(jieba_dir, mustWork = TRUE),
            "-o", normalizePath(dic_dir, mustWork = TRUE),
            "-f", "utf-8",
            "-t", "utf-8")
  result <- dictIndexRcpp(args)
  if (result != 0) {
    stop("Dictionary compilation failed (return code: ", result, ")")
  }

  # file.copy() reports failure through its return value, not an error.
  copied <- file.copy(dicrc_src, dic_dir, overwrite = TRUE)
  if (!isTRUE(copied)) {
    stop("Failed to copy dicrc into ", dic_dir)
  }
  message("Chinese (mecab-jieba) dictionary installed.")
}

R/pos.r

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@
2121
#' @param sentence A character vector of any length. For analyzing multiple sentences, put them in one character vector.
2222
#' @param join A bool to decide the output format. The default value is TRUE. If FALSE, the function will return morphemes only, and tags put in the attribute. if \code{format="data.frame"}, then this will be ignored.
2323
#' @param format A data type for the result. The default value is "list". You can set this to "data.frame" to get a result as data frame format.
24-
#' @param lang Optional language code (\code{"ja"} or \code{"ko"}) to select
25-
#' a dictionary installed via \code{\link{download_dic}}. When specified, this
26-
#' overrides \code{sys_dic}.
24+
#' @param lang Optional language code (\code{"ja"}, \code{"ko"}, or \code{"zh"})
25+
#' to select a dictionary installed via \code{\link{download_dic}}. When
26+
#' specified, this overrides \code{sys_dic}.
2727
#' @param sys_dic A location of system MeCab dictionary. The default value is "".
2828
#' @param user_dic A location of user-specific MeCab dictionary. The default value is "".
2929
#' @return A string vector or a list of POS tagged morpheme will be returned in conjoined character
@@ -52,7 +52,7 @@ pos <- function(sentence, join = TRUE, format = c("list", "data.frame"), lang =
5252
}
5353

5454
if (!is.null(lang)) {
55-
sys_dic <- .resolve_dic(match.arg(lang, c("ja", "ko")))
55+
sys_dic <- .resolve_dic(match.arg(lang, c("ja", "ko", "zh")))
5656
} else if (!is.null(getOption("mecabSysDic")) && sys_dic == "") {
5757
sys_dic <- getOption("mecabSysDic")
5858
}

R/posParallel.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
#' @param sentence A character vector of any length. For analyzing multiple sentences, put them in one character vector.
2727
#' @param join A bool to decide the output format. The default value is TRUE. If FALSE, the function will return morphemes only, and tags put in the attribute. if \code{format="data.frame"}, then this will be ignored.
2828
#' @param format A data type for the result. The default value is "list". You can set this to "data.frame" to get a result as data frame format.
29-
#' @param lang Optional language code (\code{"ja"} or \code{"ko"}) to select
30-
#' a dictionary installed via \code{\link{download_dic}}. When specified, this
31-
#' overrides \code{sys_dic}.
29+
#' @param lang Optional language code (\code{"ja"}, \code{"ko"}, or \code{"zh"})
30+
#' to select a dictionary installed via \code{\link{download_dic}}. When
31+
#' specified, this overrides \code{sys_dic}.
3232
#' @param sys_dic A location of system MeCab dictionary. The default value is "".
3333
#' @param user_dic A location of user-specific MeCab dictionary. The default value is "".
3434
#' @return A string vector or a list of POS tagged morpheme will be returned in conjoined character
@@ -57,7 +57,7 @@ posParallel <- function(sentence, join = TRUE, format = c("list", "data.frame"),
5757
}
5858

5959
if (!is.null(lang)) {
60-
sys_dic <- .resolve_dic(match.arg(lang, c("ja", "ko")))
60+
sys_dic <- .resolve_dic(match.arg(lang, c("ja", "ko", "zh")))
6161
} else if (!is.null(getOption("mecabSysDic")) && sys_dic == "") {
6262
sys_dic <- getOption("mecabSysDic")
6363
}

man/download_dic.Rd

Lines changed: 4 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/pos.Rd

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/posParallel.Rd

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/set_dic.Rd

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)