Add Korean transliteration

This commit is contained in:
Elara 2021-10-04 19:07:54 -07:00
parent 73f16fcfef
commit 2bbd722ecd
7 changed files with 463 additions and 9 deletions

View File

@ -39,7 +39,7 @@ The various request types and their data requirements can be seen in `internal/t
### Transliteration
Since the PineTime does not have enough space to store all unicode glyphs, it only stores the ASCII space and Cyrillic. Therefore, this daemon can transliterate unsupported characters into supported ones. Since some languages have different transliterations, the maps to be used must be specified in the config. Here are the available maps:
Since the PineTime does not have enough space to store all unicode glyphs, it only stores the ASCII space and Cyrillic. Therefore, this daemon can transliterate unsupported characters into supported ones. Since some languages have different transliterations, the transliterators to be used must be specified in the config. Here are the available transliterators:
- eASCII
- Scandinavian
@ -57,12 +57,13 @@ Since the PineTime does not have enough space to store all unicode glyphs, it on
- Czeck
- French
- Armenian
- Korean
- Emoji
Place the desired map names in an array as `notifs.translit.maps.use`. They will be evaluated in order. You can also put custom transliterations in `notifs.translit.maps.custom`. These take priority over any other maps. The `notifs.translit.maps` config section should look like this:
Place the desired transliterator names in an array as `notifs.translit.use`. They will be evaluated in order. You can also put custom transliterations in `notifs.translit.custom`. These take priority over any other transliterators. The `notifs.translit` config section should look like this:
```toml
[notifs.translit.maps]
[notifs.translit]
use = ["eASCII", "Russian", "Emoji"]
custom = [
"test", "replaced"

2
go.mod
View File

@ -22,5 +22,5 @@ require (
github.com/spf13/viper v1.8.1
go.arsenm.dev/infinitime v0.0.0-20210825051734-745b4bd37cf4
golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf // indirect
golang.org/x/text v0.3.7 // indirect
golang.org/x/text v0.3.7
)

View File

@ -15,7 +15,7 @@ cfg.version = 2
notify = true
setTime = true
[notifs.translit.maps]
[notifs.translit]
use = ["eASCII", "Russian", "Emoji"]
[notifs.ignore]

View File

@ -72,8 +72,8 @@ func initNotifRelay(dev *infinitime.Device) error {
continue
}
maps := viper.GetStringSlice("notifs.translit.maps.use")
translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.maps.custom"))
maps := viper.GetStringSlice("notifs.translit.use")
translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.custom"))
sender = translit.Transliterate(sender, maps...)
summary = translit.Transliterate(summary, maps...)
body = translit.Transliterate(body, maps...)

View File

@ -170,8 +170,8 @@ func handleConnection(conn net.Conn, dev *infinitime.Device) {
connErr(conn, err, "Error decoding request data")
break
}
maps := viper.GetStringSlice("notifs.translit.maps.use")
translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.maps.custom"))
maps := viper.GetStringSlice("notifs.translit.use")
translit.Maps["custom"] = translit.Map(viper.GetStringSlice("notifs.translit.custom"))
title := translit.Transliterate(reqData.Title, maps...)
body := translit.Transliterate(reqData.Body, maps...)
// Send notification to watch

452
translit/korean.go Normal file
View File

@ -0,0 +1,452 @@
package translit
import (
"strings"
"unicode"
"golang.org/x/text/unicode/norm"
)
// jamoBlock is the range table for the modern Hangul Jamo block, U+1100 through U+11FF.
//
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
var jamoBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x1100, Hi: 0x11FF, Stride: 1},
	},
}
// syllablesBlock is the range table for precomposed Hangul syllables, U+AC00 through U+D7A3.
//
// https://en.wikipedia.org/wiki/Hangul_Syllables
var syllablesBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0xAC00, Hi: 0xD7A3, Stride: 1},
	},
}
// compatJamoBlock is the range table for the Hangul Compatibility Jamo code points
// U+3131 through U+318E.
//
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
var compatJamoBlock = &unicode.RangeTable{
	R16: []unicode.Range16{
		{Lo: 0x3131, Hi: 0x318E, Stride: 1},
	},
}
// KoreanTranslit implements transliteration for Korean.
//
// The type carries no state; all of the work happens in its Transliterate method.
//
// This was translated to Go from the code in https://codeberg.org/Freeyourgadget/Gadgetbridge
type KoreanTranslit struct{}
// decompatJamo converts a KS X 1001 compatibility jamo into its counterpart in the
// modern Hangul Jamo block and hands back every other rune untouched.
//
// Isolated jamo typed by a user usually arrive as compatibility jamo, whereas jamo
// obtained by decomposing a syllable come from the modern block, so the two need to
// be unified before transliteration.
//
// https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
// https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
func decompatJamo(jamo rune) rune {
	const (
		// First vowel (ㅏ) of the compatibility block.
		firstCompatVowel rune = 0x314F
		// KS X 1001 Hangul filler, not used in modern Unicode. It sits right after
		// the last compatibility vowel, which makes it a handy upper bound.
		// https://en.wikipedia.org/wiki/KS_X_1001#Hangul_Filler
		hangulFiller rune = 0x3164
		// Distance between a compatibility vowel and the matching modern vowel.
		vowelShift rune = 0x1FEE
	)

	// Anything outside the compatibility block is already in its final form.
	if !unicode.In(jamo, compatJamoBlock) {
		return jamo
	}

	// Vowels are contiguous, identically ordered and unambiguous in both blocks,
	// so a constant shift is all that is needed.
	if firstCompatVowel <= jamo && jamo < hangulFiller {
		return jamo - vowelShift
	}

	// Consonants are laid out differently in the two blocks, so they are mapped one
	// by one. The compatibility block has no notion of Choseong (leading) versus
	// Jongseong (final) position; the Choseong form is chosen here.
	//
	// Whatever is not listed is an archaic compound that is unlikely to show up on
	// modern systems and is passed through as is.
	modern := jamo
	switch jamo {
	case 0x3131: // ㄱ
		modern = 0x1100
	case 0x3132: // ㄲ
		modern = 0x1101
	case 0x3134: // ㄴ
		modern = 0x1102
	case 0x3137: // ㄷ
		modern = 0x1103
	case 0x3138: // ㄸ
		modern = 0x1104
	case 0x3139: // ㄹ
		modern = 0x1105
	case 0x3141: // ㅁ
		modern = 0x1106
	case 0x3142: // ㅂ
		modern = 0x1107
	case 0x3143: // ㅃ
		modern = 0x1108
	case 0x3145: // ㅅ
		modern = 0x1109
	case 0x3146: // ㅆ
		modern = 0x110A
	case 0x3147: // ㅇ
		modern = 0x110B
	case 0x3148: // ㅈ
		modern = 0x110C
	case 0x3149: // ㅉ
		modern = 0x110D
	case 0x314A: // ㅊ
		modern = 0x110E
	case 0x314B: // ㅋ
		modern = 0x110F
	case 0x314C: // ㅌ
		modern = 0x1110
	case 0x314D: // ㅍ
		modern = 0x1111
	case 0x314E: // ㅎ
		modern = 0x1112
	}
	return modern
}
// Bounds of the two contiguous runs of modern jamo that are romanized by table lookup.
const (
	choseongFirst  = 0x1100 // ㄱ
	choseongLast   = 0x1112 // ㅎ
	jungseongFirst = 0x1161 // ㅏ
	jungseongLast  = 0x1175 // ㅣ
)

// choseongRoman romanizes the Choseong (leading position consonants), indexed by the
// distance of the jamo from choseongFirst.
var choseongRoman = [...]string{
	0x1100 - choseongFirst: "g",  // ㄱ
	0x1101 - choseongFirst: "kk", // ㄲ
	0x1102 - choseongFirst: "n",  // ㄴ
	0x1103 - choseongFirst: "d",  // ㄷ
	0x1104 - choseongFirst: "tt", // ㄸ
	0x1105 - choseongFirst: "r",  // ㄹ
	0x1106 - choseongFirst: "m",  // ㅁ
	0x1107 - choseongFirst: "b",  // ㅂ
	0x1108 - choseongFirst: "pp", // ㅃ
	0x1109 - choseongFirst: "s",  // ㅅ
	0x110A - choseongFirst: "ss", // ㅆ
	0x110B - choseongFirst: "",   // ㅇ
	0x110C - choseongFirst: "j",  // ㅈ
	0x110D - choseongFirst: "jj", // ㅉ
	0x110E - choseongFirst: "ch", // ㅊ
	0x110F - choseongFirst: "k",  // ㅋ
	0x1110 - choseongFirst: "t",  // ㅌ
	0x1111 - choseongFirst: "p",  // ㅍ
	0x1112 - choseongFirst: "h",  // ㅎ
}

// jungseongRoman romanizes the Jungseong (vowels), indexed by the distance of the jamo
// from jungseongFirst.
var jungseongRoman = [...]string{
	0x1161 - jungseongFirst: "a",   // ㅏ
	0x1162 - jungseongFirst: "ae",  // ㅐ
	0x1163 - jungseongFirst: "ya",  // ㅑ
	0x1164 - jungseongFirst: "yae", // ㅒ
	0x1165 - jungseongFirst: "eo",  // ㅓ
	0x1166 - jungseongFirst: "e",   // ㅔ
	0x1167 - jungseongFirst: "yeo", // ㅕ
	0x1168 - jungseongFirst: "ye",  // ㅖ
	0x1169 - jungseongFirst: "o",   // ㅗ
	0x116A - jungseongFirst: "wa",  // ㅘ
	0x116B - jungseongFirst: "wae", // ㅙ
	0x116C - jungseongFirst: "oe",  // ㅚ
	0x116D - jungseongFirst: "yo",  // ㅛ
	0x116E - jungseongFirst: "u",   // ㅜ
	0x116F - jungseongFirst: "wo",  // ㅝ
	0x1170 - jungseongFirst: "we",  // ㅞ
	0x1171 - jungseongFirst: "wi",  // ㅟ
	0x1172 - jungseongFirst: "yu",  // ㅠ
	0x1173 - jungseongFirst: "eu",  // ㅡ
	0x1174 - jungseongFirst: "ui",  // ㅢ
	0x1175 - jungseongFirst: "i",   // ㅣ
}

// translitSingleJamo romanizes one jamo at a time.
//
// Compatibility jamo are first converted with decompatJamo. A rune that has no entry
// below is returned as a one-character string without modification.
func translitSingleJamo(jamo rune) string {
	modern := decompatJamo(jamo)

	// Leading consonants and vowels each form a gap-free run, so they are looked up
	// by offset.
	switch {
	case modern >= choseongFirst && modern <= choseongLast:
		return choseongRoman[modern-choseongFirst]
	case modern >= jungseongFirst && modern <= jungseongLast:
		return jungseongRoman[modern-jungseongFirst]
	}

	// Jongseong (final position consonants), grouped by their romanization.
	switch modern {
	case 0x11A8, 0x11A9, 0x11BF: // ㄱ, ㄲ, ㅋ
		return "k"
	case 0x11AB: // ㄴ
		return "n"
	case 0x11AE, 0x11BA, 0x11BB, 0x11BD, 0x11BE, 0x11C0, 0x11C2: // ㄷ, ㅅ, ㅆ, ㅈ, ㅊ, ㅌ, ㅎ
		return "t"
	case 0x11AF: // ㄹ
		return "l"
	case 0x11B7: // ㅁ
		return "m"
	case 0x11B8, 0x11C1: // ㅂ, ㅍ
		return "p"
	case 0x11BC: // ㅇ
		return "ng"
	}

	return string(modern)
}
// Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
// irregularly. These exceptions are called "special provisions". In cases where multiple
// romanizations are permitted, we use the one that's least commonly used elsewhere.
//
// previousEnding is the last jamo of a syllable and nextInitial is the first jamo of the
// syllable that follows it. On a match, the returned string is the romanization of BOTH
// jamo together, so the caller must not transliterate nextInitial again.
//
// Returns empty string and false if either character is not in the modern jamo block,
// or if there is no special provision for that pair of jamo.
func translitSpecialProvisions(previousEnding rune, nextInitial rune) (string, bool) {
	// Only jamo from the modern block can take part in a special provision.
	if !unicode.In(previousEnding, jamoBlock) || !unicode.In(nextInitial, jamoBlock) {
		return "", false
	}

	// Jongseong (final position) ㅎ has a number of special provisions.
	if previousEnding == 0x11C2 {
		switch nextInitial {
		case 0x110B: // ㅇ
			return "h", true
		case 0x1100: // ㄱ
			return "k", true
		case 0x1102: // ㄴ
			return "nn", true
		case 0x1103: // ㄷ
			return "t", true
		case 0x1105: // ㄹ
			return "nn", true
		case 0x1106: // ㅁ
			return "nm", true
		case 0x1107: // ㅂ
			return "p", true
		case 0x1109: // ㅅ
			return "hs", true
		case 0x110C: // ㅈ
			return "ch", true
		case 0x1112: // ㅎ
			return "t", true
		}
		return "", false
	}

	// Otherwise, special provisions are denser when grouped by the second jamo.
	//
	// Go switch cases never fall through implicitly, so endings that share a
	// romanization must be listed in a single comma-separated case; stacking empty
	// cases would silently drop every ending but the last one of each group.
	switch nextInitial {
	case 0x1100: // ㄱ
		switch previousEnding {
		case 0x11AB: // ㄴ
			return "n-g", true
		}
	case 0x1102: // ㄴ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "ngn", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nn", true
		case 0x11AF: // ㄹ
			return "ll", true
		case 0x11B8: // ㅂ
			return "mn", true
		}
	case 0x1105: // ㄹ
		switch previousEnding {
		case 0x11A8, 0x11AB, 0x11AF: // ㄱ, ㄴ, ㄹ
			return "ll", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nn", true
		case 0x11B7, 0x11B8: // ㅁ, ㅂ
			return "mn", true
		case 0x11BC: // ㅇ
			return "ngn", true
		}
	case 0x1106: // ㅁ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "ngm", true
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "nm", true
		case 0x11B8: // ㅂ
			return "mm", true
		}
	case 0x110B: // ㅇ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "g", true
		case 0x11AE: // ㄷ
			return "d", true
		case 0x11AF: // ㄹ
			return "r", true
		case 0x11B8: // ㅂ
			return "b", true
		case 0x11BA: // ㅅ
			return "s", true
		case 0x11BC: // ㅇ
			return "ng-", true
		case 0x11BD: // ㅈ
			return "j", true
		case 0x11BE: // ㅊ
			return "ch", true
		}
	case 0x110F: // ㅋ
		switch previousEnding {
		case 0x11A8: // ㄱ
			return "k-k", true
		}
	case 0x1110: // ㅌ
		switch previousEnding {
		case 0x11AE, 0x11BA, 0x11BD, 0x11BE, 0x11C0: // ㄷ, ㅅ, ㅈ, ㅊ, ㅌ
			return "t-t", true
		}
	case 0x1111: // ㅍ
		switch previousEnding {
		case 0x11B8: // ㅂ
			return "p-p", true
		}
	}

	// No special provision applies to this pair.
	return "", false
}
// decompose splits a syllable into its individual jamo by running it through Unicode
// NFD normalization. The result is whatever the normalizer yields for that single
// character, so a rune it cannot break apart is handed back as a one-rune string.
func decompose(syllable rune) string {
	composed := string(syllable)
	return norm.NFD.String(composed)
}
// Transliterate romanizes any Hangul in the given string.
// Leaves any non-Hangul characters unmodified.
//
// Every Hangul character is decomposed into jamo and each jamo is romanized on its own,
// except when the last jamo of one character and the first jamo of the next form a
// "special provision" pair, in which case the pair is romanized as a unit.
func (kt *KoreanTranslit) Transliterate(s string) string {
	if len(s) == 0 {
		return s
	}

	// Work on runes, not bytes. Hangul takes several bytes in UTF-8, so the byte
	// offsets produced by ranging over a string can neither address the following
	// character nor tell which jamo is the last one of a syllable.
	runes := []rune(s)

	builder := &strings.Builder{}
	nextInitialJamoConsumed := false
	for i, syllable := range runes {
		// If character not in blocks, leave it unmodified
		if !unicode.In(syllable, jamoBlock, syllablesBlock, compatJamoBlock) {
			builder.WriteRune(syllable)
			continue
		}

		jamo := []rune(decompose(syllable))
		for j, char := range jamo {
			// If the first jamo of this syllable was already written as part of a
			// special provision, skip it.
			if j == 0 && nextInitialJamoConsumed {
				nextInitialJamoConsumed = false
				continue
			}

			// If this is the last jamo of this syllable and not the last character of
			// the string, check for a special provision with the first jamo of the
			// next character. When the next character is whitespace or otherwise not
			// Hangul, translitSpecialProvisions() reports no match.
			if j == len(jamo)-1 && i < len(runes)-1 {
				nextJamo := []rune(decompose(runes[i+1]))
				if len(nextJamo) > 0 {
					if specialProvision, ok := translitSpecialProvisions(char, nextJamo[0]); ok {
						builder.WriteString(specialProvision)
						nextInitialJamoConsumed = true
						continue
					}
				}
			}

			// Transliterate normally
			builder.WriteString(translitSingleJamo(char))
		}
	}

	return builder.String()
}

View File

@ -464,4 +464,5 @@ var Maps = map[string]Transliterator{
"😴", ":zzz:",
"💤", ":zzz:",
},
"Korean": &KoreanTranslit{},
}