autofill/sync/address/
name_utils.rs

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

// This code is a port of the FormAutofillNameUtils.sys.mjs file from mozilla-central:
// https://searchfox.org/mozilla-central/rev/2a867dd1ab015c3ef24b774a57709fb3b3dc4961/toolkit/components/formautofill/shared/FormAutofillNameUtils.sys.mjs
/// Honorifics and titles that `strip_prefixes` removes from the front of a
/// tokenized name.
///
/// Entries are lowercase and carry no trailing dot because `contains_string`
/// lowercases each token and trims its trailing dots before looking it up.
const NAME_PREFIXES: &[&str] = &[
    "1lt", "1st", "2lt", "2nd", "3rd", "admiral", "capt", "captain", "col", "cpt", "dr", "gen",
    "general", "lcdr", "lt", "ltc", "ltg", "ltjg", "maj", "major", "mg", "mr", "mrs", "ms",
    "pastor", "prof", "rep", "reverend", "rev", "sen",
];
13
/// Generational and academic suffixes that `strip_suffixes` removes from the
/// end of a tokenized name.
///
/// Entries are lowercase with no trailing dot (inner dots are kept, e.g.
/// "ph.d") to match the normalisation done by `contains_string`. Note that
/// "ms" also appears in `NAME_PREFIXES`.
const NAME_SUFFIXES: &[&str] = &[
    "b.a", "ba", "d.d.s", "dds", "i", "ii", "iii", "iv", "ix", "jr", "m.a", "m.d", "ma", "md",
    "ms", "ph.d", "phd", "sr", "v", "vi", "vii", "viii", "x",
];
18
/// Particles that belong to the family name when they directly precede it
/// (for example "van" in "van Beethoven"); consulted by
/// `extract_family_tokens`.
///
/// Entries are lowercase with no trailing dot to match the normalisation done
/// by `contains_string`.
const FAMILY_NAME_PREFIXES: &[&str] = &[
    "d'", "de", "del", "der", "di", "la", "le", "mc", "san", "st", "ter", "van", "von",
];
22
/// The common and non-ambiguous CJK surnames (last names) that have more than
/// one character.
///
/// `get_cjk_surname_length` checks a name against this list by prefix.
const COMMON_CJK_MULTI_CHAR_SURNAMES: &[&str] = &[
    // Korean, taken from the list of surnames:
    // https://ko.wikipedia.org/wiki/%ED%95%9C%EA%B5%AD%EC%9D%98_%EC%84%B1%EC%94%A8_%EB%AA%A9%EB%A1%9D
    "남궁", "사공", "서문", "선우", "제갈", "황보", "독고", "망절",
    // Chinese, taken from the top 10 Chinese 2-character surnames:
    // https://zh.wikipedia.org/wiki/%E8%A4%87%E5%A7%93#.E5.B8.B8.E8.A6.8B.E7.9A.84.E8.A4.87.E5.A7.93
    // Simplified Chinese (mostly mainland China)
    "欧阳", "令狐", "皇甫", "上官", "司徒", "诸葛", "司马", "宇文", "呼延", "端木",
    // Traditional Chinese (mostly Taiwan)
    "張簡", "歐陽", "諸葛", "申屠", "尉遲", "司馬", "軒轅", "夏侯",
];
36
/// All Korean surnames that have more than one character, even the
/// rare/ambiguous ones.
///
/// `get_cjk_surname_length` only consults this list for all-Hangul names of
/// more than three characters.
const KOREAN_MULTI_CHAR_SURNAMES: &[&str] = &[
    "강전", "남궁", "독고", "동방", "망절", "사공", "서문", "선우", "소봉", "어금", "장곡", "제갈",
    "황목", "황보",
];
43
/// The middle dot is used as a separator for foreign names in Japanese.
///
/// Both code points are treated as name separators by `is_name_separator` and
/// are tolerated inside CJK names by `is_cjk_name`.
const MIDDLE_DOT: &[char] = &[
    '\u{30FB}', // KATAKANA MIDDLE DOT
    '\u{00B7}', // MIDDLE DOT, a (common?) typo for "KATAKANA MIDDLE DOT"
];
49
/// Inclusive `(first, last)` code-point ranges whose characters `is_cjk_name`
/// accepts as CJK.
const CJK_RANGE: &[(char, char)] = &[
    ('\u{1100}', '\u{11FF}'), // Hangul Jamo
    ('\u{3040}', '\u{309F}'), // Hiragana
    ('\u{30A0}', '\u{30FF}'), // Katakana
    ('\u{3105}', '\u{312C}'), // Bopomofo
    ('\u{3130}', '\u{318F}'), // Hangul Compatibility Jamo
    ('\u{31F0}', '\u{31FF}'), // Katakana Phonetic Extensions
    ('\u{3200}', '\u{32FF}'), // Enclosed CJK Letters and Months
    ('\u{3400}', '\u{4DBF}'), // CJK Unified Ideographs Extension A
    ('\u{4E00}', '\u{9FFF}'), // CJK Unified Ideographs
    ('\u{A960}', '\u{A97F}'), // Hangul Jamo Extended-A
    ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ('\u{D7B0}', '\u{D7FF}'), // Hangul Jamo Extended-B
    ('\u{FF00}', '\u{FFEF}'), // Halfwidth and Fullwidth Forms
];
65
/// Inclusive `(first, last)` code-point ranges of Hangul characters, used by
/// `is_korean_name`. Every range here also appears in `CJK_RANGE`.
const HANGUL_RANGE: &[(char, char)] = &[
    ('\u{1100}', '\u{11FF}'), // Hangul Jamo
    ('\u{3130}', '\u{318F}'), // Hangul Compatibility Jamo
    ('\u{A960}', '\u{A97F}'), // Hangul Jamo Extended-A
    ('\u{AC00}', '\u{D7AF}'), // Hangul Syllables
    ('\u{D7B0}', '\u{D7FF}'), // Hangul Jamo Extended-B
];
73
/// The components of a personal name, as produced by `split_name` and
/// consumed by `join_name_parts`.
///
/// A component that is absent is represented by an empty string, which is
/// also what `Default` yields for every field.
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub(crate) struct NameParts {
    /// Given (first) name; may contain several space-separated words.
    pub(crate) given: String,
    /// Middle name, or empty when there is none.
    pub(crate) middle: String,
    /// Family (last) name, including any family-name particles.
    pub(crate) family: String,
}
80
81fn is_name_separator(c: char) -> bool {
82    c.is_whitespace() || MIDDLE_DOT.contains(&c) || c == ','
83}
84
/// Reports whether `token` is a member of `set` after normalisation.
///
/// The token is stripped of all trailing dots and lowercased before the
/// lookup, so `"Dr."` matches the entry `"dr"`. Entries of `set` are compared
/// verbatim and are therefore expected to already be in that normalised form.
fn contains_string(set: &[&str], token: &str) -> bool {
    let normalized = token.trim_end_matches('.').to_lowercase();
    set.iter().any(|&entry| entry == normalized)
}
89
90fn strip_prefixes<'a>(name_tokens: &'a [&'a str]) -> &'a [&'a str] {
91    name_tokens
92        .iter()
93        .position(|token| !contains_string(NAME_PREFIXES, token))
94        .map_or(&[], |index| &name_tokens[index..])
95}
96
97fn strip_suffixes<'a>(name_tokens: &'a [&'a str]) -> &'a [&'a str] {
98    name_tokens
99        .iter()
100        .rposition(|token| !contains_string(NAME_SUFFIXES, token))
101        .map_or(&[], |index| &name_tokens[..=index])
102}
103
/// Returns `true` when `c` falls inside at least one of the inclusive
/// `(first, last)` ranges in `range`.
fn is_char_in_range(c: char, range: &[(char, char)]) -> bool {
    range
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
107
108pub(crate) fn is_cjk_name(name: &str) -> bool {
109    if name.is_empty() || name.split_whitespace().count() > 2 {
110        return false;
111    }
112
113    name.split_whitespace().all(|part| {
114        part.chars()
115            .all(|c| MIDDLE_DOT.contains(&c) || is_char_in_range(c, CJK_RANGE))
116    })
117}
118
119fn is_korean_name(name: &str) -> bool {
120    if name.is_empty() {
121        return false;
122    }
123
124    name.split_whitespace()
125        .all(|part| part.chars().all(|c| is_char_in_range(c, HANGUL_RANGE)))
126}
127
128fn get_cjk_surname_length(name: &str) -> usize {
129    let surnames = if is_korean_name(name) && name.chars().count() > 3 {
130        KOREAN_MULTI_CHAR_SURNAMES
131    } else {
132        COMMON_CJK_MULTI_CHAR_SURNAMES
133    };
134
135    if surnames.iter().any(|&surname| name.starts_with(surname)) {
136        2
137    } else {
138        1
139    }
140}
141
142fn split_cjk_name(name_tokens: &[&str]) -> Option<NameParts> {
143    match name_tokens.len() {
144        1 => {
145            let name = name_tokens[0];
146            let surname_length = get_cjk_surname_length(name);
147            Some(NameParts {
148                given: name.chars().skip(surname_length).collect(),
149                family: name.chars().take(surname_length).collect(),
150                ..Default::default()
151            })
152        }
153        2 => Some(NameParts {
154            given: name_tokens[1].to_string(),
155            family: name_tokens[0].to_string(),
156            ..Default::default()
157        }),
158        _ => None,
159    }
160}
161
162fn handle_multiple_suffixes(suffixes: &[&str]) -> NameParts {
163    let mut suffixes = suffixes.to_vec(); // Convert to Vec<&str> if mutation is needed
164
165    let family_tokens = extract_family_tokens(&mut suffixes);
166    let family = family_tokens.join(" ");
167
168    let middle = if suffixes.len() >= 2 {
169        suffixes.pop().unwrap().to_string()
170    } else {
171        String::new()
172    };
173
174    let given = suffixes.join(" ");
175
176    NameParts {
177        given,
178        middle,
179        family,
180    }
181}
182
183fn extract_family_tokens(suffixes: &mut Vec<&str>) -> Vec<String> {
184    let mut family_tokens = vec![suffixes.pop().unwrap().to_string()];
185    while !suffixes.is_empty() && contains_string(FAMILY_NAME_PREFIXES, suffixes.last().unwrap()) {
186        family_tokens.insert(0, suffixes.pop().unwrap().to_string());
187    }
188    family_tokens
189}
190
191pub(crate) fn join_name_parts(name_parts: &NameParts) -> String {
192    if is_cjk_name(&name_parts.given)
193        && is_cjk_name(&name_parts.family)
194        && name_parts.middle.is_empty()
195    {
196        return format!("{}{}", name_parts.family, name_parts.given);
197    }
198
199    [
200        name_parts.given.as_str(),
201        name_parts.middle.as_str(),
202        name_parts.family.as_str(),
203    ]
204    .iter()
205    .filter(|&part| !part.is_empty())
206    .cloned()
207    .collect::<Vec<&str>>()
208    .join(" ")
209}
210
211pub(crate) fn split_name(name: &str) -> NameParts {
212    if name.is_empty() {
213        return NameParts::default();
214    }
215
216    let name_tokens: Vec<&str> = name
217        .trim()
218        .split(is_name_separator)
219        .filter(|s| !s.is_empty())
220        .collect();
221
222    let stripped_prefixes = strip_prefixes(&name_tokens);
223
224    if is_cjk_name(name) {
225        if let Some(cjk_parts) = split_cjk_name(stripped_prefixes) {
226            return cjk_parts;
227        }
228    }
229
230    let stripped_suffixes = if name_tokens.len() > 2 {
231        strip_suffixes(stripped_prefixes)
232    } else {
233        stripped_prefixes
234    };
235
236    match stripped_suffixes {
237        [] => NameParts {
238            given: name.to_string(),
239            ..Default::default()
240        },
241        [given] => NameParts {
242            given: given.to_string(),
243            ..Default::default()
244        },
245        _ => handle_multiple_suffixes(stripped_suffixes),
246    }
247}