From 2bbd722ecd411b3620a69482a50819eb9dfd2782 Mon Sep 17 00:00:00 2001 From: Arsen Musayelyan Date: Mon, 4 Oct 2021 19:07:54 -0700 Subject: [PATCH] Add korean transliteration --- README.md | 7 +- go.mod | 2 +- itd.toml | 2 +- notifs.go | 4 +- socket.go | 4 +- translit/korean.go | 452 +++++++++++++++++++++++++++++++++++++++++++ translit/translit.go | 1 + 7 files changed, 463 insertions(+), 9 deletions(-) create mode 100644 translit/korean.go diff --git a/README.md b/README.md index 6eed76f..0e3ca2e 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ The various request types and their data requirements can be seen in `internal/t ### Transliteration -Since the PineTime does not have enough space to store all unicode glyphs, it only stores the ASCII space and Cyrillic. Therefore, this daemon can transliterate unsupported characters into supported ones. Since some languages have different transliterations, the maps to be used must be specified in the config. Here are the available maps: +Since the PineTime does not have enough space to store all unicode glyphs, it only stores the ASCII space and Cyrillic. Therefore, this daemon can transliterate unsupported characters into supported ones. Since some languages have different transliterations, the transliterators to be used must be specified in the config. Here are the available transliterators: - eASCII - Scandinavian @@ -57,12 +57,13 @@ Since the PineTime does not have enough space to store all unicode glyphs, it on - Czeck - French - Armenian +- Korean - Emoji -Place the desired map names in an array as `notifs.translit.maps.use`. They will be evaluated in order. You can also put custom transliterations in `notifs.translit.maps.custom`. These take priority over any other maps. The `notifs.translit.maps` config section should look like this: +Place the desired map names in an array as `notifs.translit.use`. They will be evaluated in order. You can also put custom transliterations in `notifs.translit.custom`. 
These take priority over any other maps. The `notifs.translit` config section should look like this: ```toml -[notifs.translit.maps] +[notifs.translit] use = ["eASCII", "Russian", "Emoji"] custom = [ "test", "replaced" diff --git a/go.mod b/go.mod index 2922acc..b782fd2 100644 --- a/go.mod +++ b/go.mod @@ -22,5 +22,5 @@ require ( github.com/spf13/viper v1.8.1 go.arsenm.dev/infinitime v0.0.0-20210825051734-745b4bd37cf4 golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf // indirect - golang.org/x/text v0.3.7 // indirect + golang.org/x/text v0.3.7 ) diff --git a/itd.toml b/itd.toml index e99af30..d5990c0 100644 --- a/itd.toml +++ b/itd.toml @@ -15,7 +15,7 @@ cfg.version = 2 notify = true setTime = true -[notifs.translit.maps] +[notifs.translit] use = ["eASCII", "Russian", "Emoji"] [notifs.ignore] diff --git a/notifs.go b/notifs.go index 52c04d5..5a06e0d 100644 --- a/notifs.go +++ b/notifs.go @@ -72,8 +72,8 @@ func initNotifRelay(dev *infinitime.Device) error { continue } - maps := viper.GetStringSlice("notifs.translit.maps.use") - translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.maps.custom")) + maps := viper.GetStringSlice("notifs.translit.use") + translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.custom")) sender = translit.Transliterate(sender, maps...) summary = translit.Transliterate(summary, maps...) body = translit.Transliterate(body, maps...) 
diff --git a/socket.go b/socket.go
index 32c5fc8..250c73f 100644
--- a/socket.go
+++ b/socket.go
@@ -170,8 +170,8 @@ func handleConnection(conn net.Conn, dev *infinitime.Device) {
 				connErr(conn, err, "Error decoding request data")
 				break
 			}
-			maps := viper.GetStringSlice("notifs.translit.maps.use")
-			translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.maps.custom"))
+			maps := viper.GetStringSlice("notifs.translit.use")
+			translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.custom"))
 			title := translit.Transliterate(reqData.Title, maps...)
 			body := translit.Transliterate(reqData.Body, maps...)
 			// Send notification to watch
diff --git a/translit/korean.go b/translit/korean.go
new file mode 100644
index 0000000..280802d
--- /dev/null
+++ b/translit/korean.go
@@ -0,0 +1,445 @@
+package translit
+
+import (
+	"strings"
+	"unicode"
+
+	"golang.org/x/text/unicode/norm"
+)
+
+// jamoBlock is the modern Hangul Jamo block (U+1100 to U+11FF).
+//
+// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
+var jamoBlock = &unicode.RangeTable{
+	R16: []unicode.Range16{{
+		Lo:     0x1100,
+		Hi:     0x11FF,
+		Stride: 1,
+	}},
+}
+
+// syllablesBlock is the precomposed Hangul Syllables block (U+AC00 to U+D7A3).
+//
+// https://en.wikipedia.org/wiki/Hangul_Syllables
+var syllablesBlock = &unicode.RangeTable{
+	R16: []unicode.Range16{{
+		Lo:     0xAC00,
+		Hi:     0xD7A3,
+		Stride: 1,
+	}},
+}
+
+// compatJamoBlock is the Hangul Compatibility Jamo range (U+3131 to U+318E).
+//
+// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
+var compatJamoBlock = &unicode.RangeTable{
+	R16: []unicode.Range16{{
+		Lo:     0x3131,
+		Hi:     0x318E,
+		Stride: 1,
+	}},
+}
+
+// KoreanTranslit implements transliteration for Korean.
+//
+// This was translated to Go from the code in https://codeberg.org/Freeyourgadget/Gadgetbridge
+type KoreanTranslit struct{}
+
+// decompatJamo maps a compatibility jamo to its modern equivalent where possible and
+// returns every other character unmodified.
+//
+// User input consisting of isolated jamo is usually mapped to the KS X 1001 compatibility
+// block, but jamo resulting from decomposed syllables are mapped to the modern one.
+//
+// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
+// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
+func decompatJamo(jamo rune) rune {
+	// KS X 1001 Hangul filler, not used in modern Unicode. A useful landmark in the
+	// compatibility jamo block.
+	// https://en.wikipedia.org/wiki/KS_X_1001#Hangul_Filler
+	var hangulFiller rune = 0x3164
+
+	// Ignore characters outside compatibility jamo block
+	if !unicode.In(jamo, compatJamoBlock) {
+		return jamo
+	}
+
+	// Vowels are contiguous, in the same order, and unambiguous so it's a simple offset.
+	if jamo >= 0x314F && jamo < hangulFiller {
+		return jamo - 0x1FEE
+	}
+
+	// Consonants are organized differently. No clean way to do this.
+	// The compatibility jamo block doesn't distinguish between Choseong (leading) and Jongseong
+	// (final) positions, but the modern block does. We map to Choseong here.
+	switch jamo {
+	case 0x3131:
+		return 0x1100 // ㄱ
+	case 0x3132:
+		return 0x1101 // ㄲ
+	case 0x3134:
+		return 0x1102 // ㄴ
+	case 0x3137:
+		return 0x1103 // ㄷ
+	case 0x3138:
+		return 0x1104 // ㄸ
+	case 0x3139:
+		return 0x1105 // ㄹ
+	case 0x3141:
+		return 0x1106 // ㅁ
+	case 0x3142:
+		return 0x1107 // ㅂ
+	case 0x3143:
+		return 0x1108 // ㅃ
+	case 0x3145:
+		return 0x1109 // ㅅ
+	case 0x3146:
+		return 0x110A // ㅆ
+	case 0x3147:
+		return 0x110B // ㅇ
+	case 0x3148:
+		return 0x110C // ㅈ
+	case 0x3149:
+		return 0x110D // ㅉ
+	case 0x314A:
+		return 0x110E // ㅊ
+	case 0x314B:
+		return 0x110F // ㅋ
+	case 0x314C:
+		return 0x1110 // ㅌ
+	case 0x314D:
+		return 0x1111 // ㅍ
+	case 0x314E:
+		return 0x1112 // ㅎ
+	}
+
+	// The rest of the compatibility block consists of archaic compounds that are
+	// unlikely to be encountered in modern systems. Just leave them alone.
+	return jamo
+}
+
+// translitSingleJamo romanizes one jamo at a time. Compatibility jamo are first mapped to
+// their modern equivalents; anything that still isn't a known modern jamo is returned as is.
+func translitSingleJamo(jamo rune) string {
+	jamo = decompatJamo(jamo)
+
+	switch jamo {
+	// Choseong (leading position consonants)
+	case 0x1100:
+		return "g" // ㄱ
+	case 0x1101:
+		return "kk" // ㄲ
+	case 0x1102:
+		return "n" // ㄴ
+	case 0x1103:
+		return "d" // ㄷ
+	case 0x1104:
+		return "tt" // ㄸ
+	case 0x1105:
+		return "r" // ㄹ
+	case 0x1106:
+		return "m" // ㅁ
+	case 0x1107:
+		return "b" // ㅂ
+	case 0x1108:
+		return "pp" // ㅃ
+	case 0x1109:
+		return "s" // ㅅ
+	case 0x110A:
+		return "ss" // ㅆ
+	case 0x110B:
+		return "" // ㅇ
+	case 0x110C:
+		return "j" // ㅈ
+	case 0x110D:
+		return "jj" // ㅉ
+	case 0x110E:
+		return "ch" // ㅊ
+	case 0x110F:
+		return "k" // ㅋ
+	case 0x1110:
+		return "t" // ㅌ
+	case 0x1111:
+		return "p" // ㅍ
+	case 0x1112:
+		return "h" // ㅎ
+	// Jungseong (vowels)
+	case 0x1161:
+		return "a" // ㅏ
+	case 0x1162:
+		return "ae" // ㅐ
+	case 0x1163:
+		return "ya" // ㅑ
+	case 0x1164:
+		return "yae" // ㅒ
+	case 0x1165:
+		return "eo" // ㅓ
+	case 0x1166:
+		return "e" // ㅔ
+	case 0x1167:
+		return "yeo" // ㅕ
+	case 0x1168:
+		return "ye" // ㅖ
+	case 0x1169:
+		return "o" // ㅗ
+	case 0x116A:
+		return "wa" // ㅘ
+	case 0x116B:
+		return "wae" // ㅙ
+	case 0x116C:
+		return "oe" // ㅚ
+	case 0x116D:
+		return "yo" // ㅛ
+	case 0x116E:
+		return "u" // ㅜ
+	case 0x116F:
+		return "wo" // ㅝ
+	case 0x1170:
+		return "we" // ㅞ
+	case 0x1171:
+		return "wi" // ㅟ
+	case 0x1172:
+		return "yu" // ㅠ
+	case 0x1173:
+		return "eu" // ㅡ
+	case 0x1174:
+		return "ui" // ㅢ
+	case 0x1175:
+		return "i" // ㅣ
+	// Jongseong (final position consonants)
+	case 0x11A8:
+		return "k" // ㄱ
+	case 0x11A9:
+		return "k" // ㄲ
+	case 0x11AB:
+		return "n" // ㄴ
+	case 0x11AE:
+		return "t" // ㄷ
+	case 0x11AF:
+		return "l" // ㄹ
+	case 0x11B7:
+		return "m" // ㅁ
+	case 0x11B8:
+		return "p" // ㅂ
+	case 0x11BA:
+		return "t" // ㅅ
+	case 0x11BB:
+		return "t" // ㅆ
+	case 0x11BC:
+		return "ng" // ㅇ
+	case 0x11BD:
+		return "t" // ㅈ
+	case 0x11BE:
+		return "t" // ㅊ
+	case 0x11BF:
+		return "k" // ㅋ
+	case 0x11C0:
+		return "t" // ㅌ
+	case 0x11C1:
+		return "p" // ㅍ
+	case 0x11C2:
+		return "t" // ㅎ
+	}
+
+	return string(jamo)
+}
+
+// translitSpecialProvisions romanizes the boundary between two syllables.
+//
+// Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
+// irregularly. These exceptions are called "special provisions". In cases where multiple
+// romanizations are permitted, we use the one that's least commonly used elsewhere.
+//
+// Returns empty string and false if either character is not in the modern jamo block,
+// or if there is no special provision for that pair of jamo.
+func translitSpecialProvisions(previousEnding rune, nextInitial rune) (string, bool) {
+	// Return false if previousEnding not in modern jamo block
+	if !unicode.In(previousEnding, jamoBlock) {
+		return "", false
+	}
+	// Return false if nextInitial not in modern jamo block
+	if !unicode.In(nextInitial, jamoBlock) {
+		return "", false
+	}
+
+	// Jongseong (final position) ㅎ has a number of special provisions.
+	if previousEnding == 0x11C2 {
+		switch nextInitial {
+		case 0x110B:
+			return "h", true // ㅇ
+		case 0x1100:
+			return "k", true // ㄱ
+		case 0x1102:
+			return "nn", true // ㄴ
+		case 0x1103:
+			return "t", true // ㄷ
+		case 0x1105:
+			return "nn", true // ㄹ
+		case 0x1106:
+			return "nm", true // ㅁ
+		case 0x1107:
+			return "p", true // ㅂ
+		case 0x1109:
+			return "hs", true // ㅅ
+		case 0x110C:
+			return "ch", true // ㅈ
+		case 0x1112:
+			return "t", true // ㅎ
+		default:
+			return "", false
+		}
+	}
+
+	// Otherwise, special provisions are denser when grouped by the second jamo.
+	// Go cases never fall through implicitly, so endings that share a romanization
+	// are listed together in a single case.
+	switch nextInitial {
+	case 0x1100: // ㄱ
+		switch previousEnding {
+		case 0x11AB:
+			return "n-g", true // ㄴ
+		default:
+			return "", false
+		}
+	case 0x1102: // ㄴ
+		switch previousEnding {
+		case 0x11A8:
+			return "ngn", true // ㄱ
+		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
+			return "nn", true
+		case 0x11AF:
+			return "ll", true // ㄹ
+		case 0x11B8:
+			return "mn", true // ㅂ
+		default:
+			return "", false
+		}
+	case 0x1105: // ㄹ
+		switch previousEnding {
+		case 0x11A8, 0x11AB, 0x11AF: // ㄱ, ㄴ, ㄹ
+			return "ll", true
+		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
+			return "nn", true
+		case 0x11B7, 0x11B8: // ㅁ, ㅂ
+			return "mn", true
+		case 0x11BC:
+			return "ngn", true // ㅇ
+		default:
+			return "", false
+		}
+	case 0x1106: // ㅁ
+		switch previousEnding {
+		case 0x11A8:
+			return "ngm", true // ㄱ
+		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
+			return "nm", true
+		case 0x11B8:
+			return "mm", true // ㅂ
+		default:
+			return "", false
+		}
+	case 0x110B: // ㅇ
+		switch previousEnding {
+		case 0x11A8:
+			return "g", true // ㄱ
+		case 0x11AE:
+			return "d", true // ㄷ
+		case 0x11AF:
+			return "r", true // ㄹ
+		case 0x11B8:
+			return "b", true // ㅂ
+		case 0x11BA:
+			return "s", true // ㅅ
+		case 0x11BC:
+			return "ng-", true // ㅇ
+		case 0x11BD:
+			return "j", true // ㅈ
+		case 0x11BE:
+			return "ch", true // ㅊ
+		default:
+			return "", false
+		}
+	case 0x110F: // ㅋ
+		switch previousEnding {
+		case 0x11A8:
+			return "k-k", true // ㄱ
+		default:
+			return "", false
+		}
+	case 0x1110: // ㅌ
+		switch previousEnding {
+		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
+			return "t-t", true
+		default:
+			return "", false
+		}
+	case 0x1111: // ㅍ
+		switch previousEnding {
+		case 0x11B8:
+			return "p-p", true // ㅂ
+		default:
+			return "", false
+		}
+	default:
+		return "", false
+	}
+}
+
+// decompose splits a precomposed syllable into its jamo using Unicode canonical
+// decomposition (NFD). Characters with no canonical decomposition are returned unchanged.
+func decompose(syllable rune) string {
+	return norm.NFD.String(string(syllable))
+}
+
+// Transliterate romanizes any Hangul in the given string.
+// Leaves any non-Hangul characters unmodified.
+func (kt *KoreanTranslit) Transliterate(s string) string {
+	if len(s) == 0 {
+		return s
+	}
+
+	// Work on runes rather than bytes: Hangul is multi-byte in UTF-8, so byte offsets
+	// cannot be used to find the next character or the last jamo of a syllable.
+	syllables := []rune(s)
+
+	builder := &strings.Builder{}
+
+	nextInitialJamoConsumed := false
+
+	for i, syllable := range syllables {
+		// If character not in blocks, leave it unmodified
+		if !unicode.In(syllable, jamoBlock, syllablesBlock, compatJamoBlock) {
+			builder.WriteRune(syllable)
+			continue
+		}
+
+		jamo := []rune(decompose(syllable))
+		for j, char := range jamo {
+			// If we already transliterated the first jamo of this syllable as part of a special
+			// provision, skip it.
+			if j == 0 && nextInitialJamoConsumed {
+				nextInitialJamoConsumed = false
+				continue
+			}
+
+			// If this is the last jamo of this syllable and not the last syllable of the
+			// string, check for special provisions. If the next char is whitespace or not
+			// Hangul, translitSpecialProvisions() reports that none applies.
+			if j == len(jamo)-1 && i < len(syllables)-1 {
+				nextJamo := []rune(decompose(syllables[i+1]))[0]
+
+				// Attempt to handle special provision
+				specialProvision, ok := translitSpecialProvisions(char, nextJamo)
+				if ok {
+					builder.WriteString(specialProvision)
+					nextInitialJamoConsumed = true
+					continue
+				}
+			}
+
+			// Not a special provision, transliterate normally
+			builder.WriteString(translitSingleJamo(char))
+		}
+	}
+	return builder.String()
+}
diff --git a/translit/translit.go b/translit/translit.go
index a116d87..4703383 100644
--- a/translit/translit.go
+++ b/translit/translit.go
@@ -464,4 +464,5 @@ var Maps = map[string]Transliterator{
 		"😴", ":zzz:",
 		"💤", ":zzz:",
 	},
+	"Korean": &KoreanTranslit{},
 }