/// Titles, honorifics and rank abbreviations that may precede a name.
///
/// Entries are lowercase and carry no trailing period; `contains_string`
/// normalizes a token the same way before looking it up here.
const NAME_PREFIXES: &[&str] = &[
    "1lt", "1st", "2lt", "2nd", "3rd",
    "admiral", "capt", "captain", "col", "cpt",
    "dr", "gen", "general", "lcdr", "lt",
    "ltc", "ltg", "ltjg", "maj", "major",
    "mg", "mr", "mrs", "ms", "pastor",
    "prof", "rep", "reverend", "rev", "sen",
];
13
/// Degrees, generational markers and roman numerals that may follow a name.
///
/// Entries are lowercase with any trailing period removed (inner periods are
/// kept, e.g. `"ph.d"`). Note that `"ms"` is also listed in `NAME_PREFIXES`.
const NAME_SUFFIXES: &[&str] = &[
    "b.a", "ba", "d.d.s", "dds",
    "i", "ii", "iii", "iv", "ix",
    "jr",
    "m.a", "m.d", "ma", "md", "ms",
    "ph.d", "phd",
    "sr",
    "v", "vi", "vii", "viii", "x",
];
18
/// Lowercase particles that are kept attached to the family name when they
/// directly precede it (see `extract_family_tokens`).
const FAMILY_NAME_PREFIXES: &[&str] = &[
    "d'", "de", "del", "der", "di",
    "la", "le", "mc",
    "san", "st", "ter",
    "van", "von",
];
22
/// Frequently seen two-character CJK surnames.
///
/// This is the default table used by `get_cjk_surname_length`; the larger
/// `KOREAN_MULTI_CHAR_SURNAMES` table is used instead for all-Hangul names
/// longer than three characters.
const COMMON_CJK_MULTI_CHAR_SURNAMES: &[&str] = &[
    // Hangul
    "남궁", "사공", "서문", "선우",
    "제갈", "황보", "독고", "망절",
    // Han, simplified forms
    "欧阳", "令狐", "皇甫", "上官", "司徒",
    "诸葛", "司马", "宇文", "呼延", "端木",
    // Han, traditional forms
    "張簡", "歐陽", "諸葛", "申屠",
    "尉遲", "司馬", "軒轅", "夏侯",
];
36
/// Two-syllable Korean surnames.
///
/// `get_cjk_surname_length` consults this table only when the name is written
/// entirely in Hangul and has more than three characters.
const KOREAN_MULTI_CHAR_SURNAMES: &[&str] = &[
    "강전", "남궁", "독고", "동방", "망절", "사공", "서문",
    "선우", "소봉", "어금", "장곡", "제갈", "황목", "황보",
];
43
/// Dot characters accepted as separators between name parts.
const MIDDLE_DOT: &[char] = &[
    '\u{30FB}', // KATAKANA MIDDLE DOT
    '\u{00B7}', // MIDDLE DOT
];
49
/// Inclusive code-point ranges whose characters count as CJK for the purposes
/// of `is_cjk_name`. Block names follow the Unicode block list.
const CJK_RANGE: &[(char, char)] = &[
    ('\u{1100}', '\u{11FF}'), // Hangul Jamo
    ('\u{3040}', '\u{309F}'), // Hiragana
    ('\u{30A0}', '\u{30FF}'), // Katakana
    ('\u{3105}', '\u{312C}'), // Bopomofo (subset of the block)
    ('\u{3130}', '\u{318F}'), // Hangul Compatibility Jamo
    ('\u{31F0}', '\u{31FF}'), // Katakana Phonetic Extensions
    ('\u{3200}', '\u{32FF}'), // Enclosed CJK Letters and Months
    ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
    ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
    ('\u{A960}', '\u{A97F}'), // Hangul Jamo Extended-A
    ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ('\u{D7B0}', '\u{D7FF}'), // Hangul Jamo Extended-B
    ('\u{FF00}', '\u{FFEF}'), // Halfwidth and Fullwidth Forms
];
65
/// Inclusive code-point ranges whose characters count as Hangul for the
/// purposes of `is_korean_name`. Block names follow the Unicode block list.
const HANGUL_RANGE: &[(char, char)] = &[
    ('\u{1100}', '\u{11FF}'), // Hangul Jamo
    ('\u{3130}', '\u{318F}'), // Hangul Compatibility Jamo
    ('\u{A960}', '\u{A97F}'), // Hangul Jamo Extended-A
    ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ('\u{D7B0}', '\u{D7FF}'), // Hangul Jamo Extended-B
];
73
/// A personal name broken into its components.
///
/// A component that is absent is stored as an empty string; `Default` yields
/// a value with all three components empty.
#[derive(Debug, Default, PartialEq)]
pub(crate) struct NameParts {
    /// Given (first) name; may hold several space-separated tokens.
    pub(crate) given: String,
    /// Middle name, or empty when there is none.
    pub(crate) middle: String,
    /// Family name, including any attached particles such as "van" or "de".
    pub(crate) family: String,
}
80
81fn is_name_separator(c: char) -> bool {
82 c.is_whitespace() || MIDDLE_DOT.contains(&c) || c == ','
83}
84
/// Reports whether `token` matches an entry of `set`.
///
/// Before comparison the token has all trailing periods removed and is
/// lowercased; the entries of `set` are compared as-is, so they are expected
/// to already be in that normalized form.
fn contains_string(set: &[&str], token: &str) -> bool {
    let normalized = token.trim_end_matches('.').to_lowercase();
    set.iter().any(|&entry| entry == normalized)
}
89
90fn strip_prefixes<'a>(name_tokens: &'a [&'a str]) -> &'a [&'a str] {
91 name_tokens
92 .iter()
93 .position(|token| !contains_string(NAME_PREFIXES, token))
94 .map_or(&[], |index| &name_tokens[index..])
95}
96
97fn strip_suffixes<'a>(name_tokens: &'a [&'a str]) -> &'a [&'a str] {
98 name_tokens
99 .iter()
100 .rposition(|token| !contains_string(NAME_SUFFIXES, token))
101 .map_or(&[], |index| &name_tokens[..=index])
102}
103
/// Returns `true` when `c` lies inside at least one of the inclusive
/// `(start, end)` pairs in `range`.
fn is_char_in_range(c: char, range: &[(char, char)]) -> bool {
    for &(low, high) in range {
        if (low..=high).contains(&c) {
            return true;
        }
    }
    false
}
107
108pub(crate) fn is_cjk_name(name: &str) -> bool {
109 if name.is_empty() || name.split_whitespace().count() > 2 {
110 return false;
111 }
112
113 name.split_whitespace().all(|part| {
114 part.chars()
115 .all(|c| MIDDLE_DOT.contains(&c) || is_char_in_range(c, CJK_RANGE))
116 })
117}
118
119fn is_korean_name(name: &str) -> bool {
120 if name.is_empty() {
121 return false;
122 }
123
124 name.split_whitespace()
125 .all(|part| part.chars().all(|c| is_char_in_range(c, HANGUL_RANGE)))
126}
127
128fn get_cjk_surname_length(name: &str) -> usize {
129 let surnames = if is_korean_name(name) && name.chars().count() > 3 {
130 KOREAN_MULTI_CHAR_SURNAMES
131 } else {
132 COMMON_CJK_MULTI_CHAR_SURNAMES
133 };
134
135 if surnames.iter().any(|&surname| name.starts_with(surname)) {
136 2
137 } else {
138 1
139 }
140}
141
142fn split_cjk_name(name_tokens: &[&str]) -> Option<NameParts> {
143 match name_tokens.len() {
144 1 => {
145 let name = name_tokens[0];
146 let surname_length = get_cjk_surname_length(name);
147 Some(NameParts {
148 given: name.chars().skip(surname_length).collect(),
149 family: name.chars().take(surname_length).collect(),
150 ..Default::default()
151 })
152 }
153 2 => Some(NameParts {
154 given: name_tokens[1].to_string(),
155 family: name_tokens[0].to_string(),
156 ..Default::default()
157 }),
158 _ => None,
159 }
160}
161
162fn handle_multiple_suffixes(suffixes: &[&str]) -> NameParts {
163 let mut suffixes = suffixes.to_vec(); let family_tokens = extract_family_tokens(&mut suffixes);
166 let family = family_tokens.join(" ");
167
168 let middle = if suffixes.len() >= 2 {
169 suffixes.pop().unwrap().to_string()
170 } else {
171 String::new()
172 };
173
174 let given = suffixes.join(" ");
175
176 NameParts {
177 given,
178 middle,
179 family,
180 }
181}
182
183fn extract_family_tokens(suffixes: &mut Vec<&str>) -> Vec<String> {
184 let mut family_tokens = vec![suffixes.pop().unwrap().to_string()];
185 while !suffixes.is_empty() && contains_string(FAMILY_NAME_PREFIXES, suffixes.last().unwrap()) {
186 family_tokens.insert(0, suffixes.pop().unwrap().to_string());
187 }
188 family_tokens
189}
190
191pub(crate) fn join_name_parts(name_parts: &NameParts) -> String {
192 if is_cjk_name(&name_parts.given)
193 && is_cjk_name(&name_parts.family)
194 && name_parts.middle.is_empty()
195 {
196 return format!("{}{}", name_parts.family, name_parts.given);
197 }
198
199 [
200 name_parts.given.as_str(),
201 name_parts.middle.as_str(),
202 name_parts.family.as_str(),
203 ]
204 .iter()
205 .filter(|&part| !part.is_empty())
206 .cloned()
207 .collect::<Vec<&str>>()
208 .join(" ")
209}
210
211pub(crate) fn split_name(name: &str) -> NameParts {
212 if name.is_empty() {
213 return NameParts::default();
214 }
215
216 let name_tokens: Vec<&str> = name
217 .trim()
218 .split(is_name_separator)
219 .filter(|s| !s.is_empty())
220 .collect();
221
222 let stripped_prefixes = strip_prefixes(&name_tokens);
223
224 if is_cjk_name(name) {
225 if let Some(cjk_parts) = split_cjk_name(stripped_prefixes) {
226 return cjk_parts;
227 }
228 }
229
230 let stripped_suffixes = if name_tokens.len() > 2 {
231 strip_suffixes(stripped_prefixes)
232 } else {
233 stripped_prefixes
234 };
235
236 match stripped_suffixes {
237 [] => NameParts {
238 given: name.to_string(),
239 ..Default::default()
240 },
241 [given] => NameParts {
242 given: given.to_string(),
243 ..Default::default()
244 },
245 _ => handle_multiple_suffixes(stripped_suffixes),
246 }
247}