pluralize.go - taldir - Directory service to resolve wallet mailboxes by messenger addresses

pluralize.go (12715B)
      1 package pluralize
      2 
      3 import (
      4 	"fmt"
      5 	"regexp"
      6 	"strconv"
      7 	"strings"
      8 )
      9 
// Rule -- pluralize rule expression and replacement value.
// A rule is selected when expression matches the word; replacement is the
// template substituted for the match and may reference capture groups as
// $0, $1, ... (expanded by Client.interpolate).
type Rule struct {
	expression  *regexp.Regexp // compiled pattern tested against the word
	replacement string         // replacement template, may contain $N group references
}
     15 
// Client -- pluralize client.
// It holds the rule tables consulted by Plural, Singular, IsPlural and
// IsSingular. All map keys are stored lower-cased.
type Client struct {
	pluralRules      []Rule            // singular -> plural rules, scanned from last to first
	singularRules    []Rule            // plural -> singular rules, scanned from last to first
	uncountables     map[string]bool   // lower-cased words that are returned unchanged
	irregularSingles map[string]string // lower-cased singular -> plural
	irregularPlurals map[string]string // lower-cased plural -> singular
	interpolateExpr  *regexp.Regexp    // matches $N group references in replacement templates
}
     25 
     26 // NewClient - pluralization client factory method.
     27 func NewClient() *Client {
     28 	client := Client{}
     29 	client.init()
     30 
     31 	return &client
     32 }
     33 
     34 func (c *Client) init() {
     35 	c.pluralRules = make([]Rule, 0)
     36 	c.singularRules = make([]Rule, 0)
     37 	c.uncountables = make(map[string]bool)
     38 	c.irregularSingles = make(map[string]string)
     39 	c.irregularPlurals = make(map[string]string)
     40 
     41 	c.loadIrregularRules()
     42 	c.loadPluralizationRules()
     43 	c.loadSingularizationRules()
     44 	c.loadUncountableRules()
     45 	c.interpolateExpr = regexp.MustCompile(`\$(\d{1,2})`)
     46 }
     47 
     48 // Pluralize -- Pluralize or singularize a word based on the passed in count.
     49 // 	word: the word to pluralize
     50 // 	count: how many of the word exist
     51 // 	inclusive: whether to prefix with the number (e.g. 3 ducks)
     52 func (c *Client) Pluralize(word string, count int, inclusive bool) string {
     53 	pluralized := func() func(string) string {
     54 		if count == 1 {
     55 			return c.Singular
     56 		}
     57 
     58 		return c.Plural
     59 	}
     60 
     61 	if inclusive {
     62 		return fmt.Sprintf("%d %s", count, pluralized()(word))
     63 	}
     64 
     65 	return pluralized()(word)
     66 }
     67 
     68 // Plural -- Pluralize a word.
     69 func (c *Client) Plural(word string) string {
     70 	return c.replaceWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
     71 }
     72 
     73 // IsPlural -- Check if a word is plural.
     74 func (c *Client) IsPlural(word string) bool {
     75 	return c.checkWord(c.irregularSingles, c.irregularPlurals, c.pluralRules)(word)
     76 }
     77 
     78 // Singular -- Singularize a word.
     79 func (c *Client) Singular(word string) string {
     80 	return c.replaceWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
     81 }
     82 
     83 // IsSingular -- Check if a word is singular.
     84 func (c *Client) IsSingular(word string) bool {
     85 	return c.checkWord(c.irregularPlurals, c.irregularSingles, c.singularRules)(word)
     86 }
     87 
     88 // AddPluralRule -- Add a pluralization rule to the collection.
     89 func (c *Client) AddPluralRule(rule string, replacement string) {
     90 	c.pluralRules = append(c.pluralRules, Rule{sanitizeRule(rule), replacement})
     91 }
     92 
     93 // AddSingularRule -- Add a singularization rule to the collection.
     94 func (c *Client) AddSingularRule(rule string, replacement string) {
     95 	c.singularRules = append(c.singularRules, Rule{sanitizeRule(rule), replacement})
     96 }
     97 
     98 // AddUncountableRule -- Add an uncountable word rule.
     99 func (c *Client) AddUncountableRule(word string) {
    100 	if !isExpr(word) {
    101 		c.uncountables[strings.ToLower(word)] = true
    102 		return
    103 	}
    104 
    105 	c.AddPluralRule(word, `$0`)
    106 	c.AddSingularRule(word, `$0`)
    107 }
    108 
    109 // AddIrregularRule -- Add an irregular word definition.
    110 func (c *Client) AddIrregularRule(single string, plural string) {
    111 	p := strings.ToLower(plural)
    112 	s := strings.ToLower(single)
    113 
    114 	c.irregularSingles[s] = p
    115 	c.irregularPlurals[p] = s
    116 }
    117 
    118 func (c *Client) replaceWord(replaceMap map[string]string, keepMap map[string]string, rules []Rule) func(w string) string { //nolint:lll
    119 	f := func(word string) string {
    120 		// Get the correct token and case restoration functions.
    121 		var token = strings.ToLower(word)
    122 
    123 		// Check against the keep object map.
    124 		if _, ok := keepMap[token]; ok {
    125 			return restoreCase(word, token)
    126 		}
    127 
    128 		// Check against the replacement map for a direct word replacement.
    129 		if replaceToken, ok := replaceMap[token]; ok {
    130 			return restoreCase(word, replaceToken)
    131 		}
    132 
    133 		// Run all the rules against the word.
    134 		return c.sanitizeWord(token, word, rules)
    135 	}
    136 
    137 	return f
    138 }
    139 
    140 func (c *Client) checkWord(replaceMap map[string]string, keepMap map[string]string, rules []Rule) func(w string) bool {
    141 	f := func(word string) bool {
    142 		var token = strings.ToLower(word)
    143 
    144 		if _, ok := keepMap[token]; ok {
    145 			return true
    146 		}
    147 
    148 		if _, ok := replaceMap[token]; ok {
    149 			return false
    150 		}
    151 
    152 		return c.sanitizeWord(token, token, rules) == token
    153 	}
    154 
    155 	return f
    156 }
    157 
    158 func (c *Client) interpolate(str string, args []string) string {
    159 	lookup := map[string]string{}
    160 
    161 	for _, submatch := range c.interpolateExpr.FindAllStringSubmatch(str, -1) {
    162 		element, _ := strconv.Atoi(submatch[1])
    163 		lookup[submatch[0]] = args[element]
    164 	}
    165 
    166 	result := c.interpolateExpr.ReplaceAllStringFunc(str, func(repl string) string {
    167 		return lookup[repl]
    168 	})
    169 
    170 	return result
    171 }
    172 
    173 func (c *Client) replace(word string, rule Rule) string {
    174 	return rule.expression.ReplaceAllStringFunc(word, func(w string) string {
    175 		match := rule.expression.FindString(word)
    176 		index := rule.expression.FindStringIndex(word)[0]
    177 		args := rule.expression.FindAllStringSubmatch(word, -1)[0]
    178 
    179 		result := c.interpolate(rule.replacement, args)
    180 
    181 		if match == `` {
    182 			return restoreCase(word[index-1:index], result)
    183 		}
    184 		return restoreCase(match, result)
    185 	})
    186 }
    187 
    188 func (c *Client) sanitizeWord(token string, word string, rules []Rule) string {
    189 	// If empty string
    190 	if len(token) == 0 {
    191 		return word
    192 	}
    193 	// If does not need fixup
    194 	if _, ok := c.uncountables[token]; ok {
    195 		return word
    196 	}
    197 
    198 	// Iterate over the sanitization rules and use the first one to match.
    199 	// NOTE: iterate rules array in reverse order specific => general rules
    200 	for i := len(rules) - 1; i >= 0; i-- {
    201 		if rules[i].expression.MatchString(word) {
    202 			return c.replace(word, rules[i])
    203 		}
    204 	}
    205 
    206 	return word
    207 }
    208 
    209 func sanitizeRule(rule string) *regexp.Regexp {
    210 	if isExpr(rule) {
    211 		return regexp.MustCompile(rule)
    212 	}
    213 
    214 	return regexp.MustCompile(`(?i)^` + rule + `$`)
    215 }
    216 
    217 func restoreCase(word string, token string) string {
    218 	// Tokens are an exact match.
    219 	if word == token {
    220 		return token
    221 	}
    222 
    223 	// Lower cased words. E.g. "hello".
    224 	if word == strings.ToLower(word) {
    225 		return strings.ToLower(token)
    226 	}
    227 
    228 	// Upper cased words. E.g. "WHISKY".
    229 	if word == strings.ToUpper(word) {
    230 		return strings.ToUpper(token)
    231 	}
    232 
    233 	// Title cased words. E.g. "Title".
    234 	if word[:1] == strings.ToUpper(word[:1]) {
    235 		return strings.ToUpper(token[:1]) + strings.ToLower(token[1:])
    236 	}
    237 
    238 	// Lower cased words. E.g. "test".
    239 	return strings.ToLower(token)
    240 }
    241 
    242 // isExpr -- helper to detect if string represents an expression by checking first character to be `(`.
    243 func isExpr(s string) bool {
    244 	return s[:1] == `(`
    245 }
    246 
// loadIrregularRules -- register the built-in irregular singular/plural pairs.
// Entries are added in table order through AddIrregularRule, which overwrites
// existing keys: when several singulars share one plural (e.g. `he`/`she` ->
// `they`), the entry listed last is the one the plural maps back to.
func (c *Client) loadIrregularRules() { //nolint:funlen
	var irregularRules = []struct {
		single string
		plural string
	}{
		// Pronouns.
		{`I`, `we`},
		{`me`, `us`},
		{`he`, `they`},
		{`she`, `they`},
		{`them`, `them`},
		{`myself`, `ourselves`},
		{`yourself`, `yourselves`},
		{`itself`, `themselves`},
		{`herself`, `themselves`},
		{`himself`, `themselves`},
		{`themself`, `themselves`},
		{`is`, `are`},
		{`was`, `were`},
		{`has`, `have`},
		{`this`, `these`},
		{`that`, `those`},
		{`my`, `our`},
		{`its`, `their`},
		{`his`, `their`},
		{`her`, `their`},
		// Words ending with a consonant and `o`.
		{`echo`, `echoes`},
		{`dingo`, `dingoes`},
		{`volcano`, `volcanoes`},
		{`tornado`, `tornadoes`},
		{`torpedo`, `torpedoes`},
		// Ends with `us`.
		{`genus`, `genera`},
		{`viscus`, `viscera`},
		// Ends with `ma`.
		{`stigma`, `stigmata`},
		{`stoma`, `stomata`},
		{`dogma`, `dogmata`},
		{`lemma`, `lemmata`},
		{`schema`, `schemata`},
		{`anathema`, `anathemata`},
		// Other irregular rules.
		{`ox`, `oxen`},
		{`axe`, `axes`},
		{`die`, `dice`},
		{`yes`, `yeses`},
		{`foot`, `feet`},
		{`eave`, `eaves`},
		{`goose`, `geese`},
		{`tooth`, `teeth`},
		{`quiz`, `quizzes`},
		{`human`, `humans`},
		{`proof`, `proofs`},
		{`carve`, `carves`},
		{`valve`, `valves`},
		{`looey`, `looies`},
		{`thief`, `thieves`},
		{`groove`, `grooves`},
		{`pickaxe`, `pickaxes`},
		{`passerby`, `passersby`},
		{`canvas`, `canvases`},
		{`sms`, `sms`},
	}

	for _, r := range irregularRules {
		c.AddIrregularRule(r.single, r.plural)
	}
}
    316 
// loadPluralizationRules -- register the built-in singular -> plural rules.
// Order is significant: sanitizeWord scans the rule list from last to first
// and applies the first match, so entries further down the table take
// precedence and the first entry (`s?$`) is the catch-all fallback.
// Entries not starting with `(` (here `thou`) are wrapped by sanitizeRule
// into a case-insensitive whole-word match.
func (c *Client) loadPluralizationRules() {
	var pluralizationRules = []struct {
		rule        string
		replacement string
	}{
		{`(?i)s?$`, `s`},
		{`(?i)[^[:ascii:]]$`, `$0`},
		{`(?i)([^aeiou]ese)$`, `$1`},
		{`(?i)(ax|test)is$`, `$1es`},
		{`(?i)(alias|[^aou]us|t[lm]as|gas|ris)$`, `$1es`},
		{`(?i)(e[mn]u)s?$`, `$1s`},
		{`(?i)([^l]ias|[aeiou]las|[ejzr]as|[iu]am)$`, `$1`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1i`}, //nolint:lll,misspell
		{`(?i)(alumn|alg|vertebr)(?:a|ae)$`, `$1ae`},
		{`(?i)(seraph|cherub)(?:im)?$`, `$1im`},
		{`(?i)(her|at|gr)o$`, `$1oes`},
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|automat|quor)(?:a|um)$`, `$1a`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)(?:a|on)$`, `$1a`},
		{`(?i)sis$`, `ses`},
		{`(?i)(?:(kni|wi|li)fe|(ar|l|ea|eo|oa|hoo)f)$`, `$1$2ves`},
		{`(?i)([^aeiouy]|qu)y$`, `$1ies`},
		{`(?i)([^ch][ieo][ln])ey$`, `$1ies`},
		{`(?i)(x|ch|ss|sh|zz)$`, `$1es`},
		{`(?i)(matr|cod|mur|sil|vert|ind|append)(?:ix|ex)$`, `$1ices`},
		{`(?i)\b((?:tit)?m|l)(?:ice|ouse)$`, `$1ice`},
		{`(?i)(pe)(?:rson|ople)$`, `$1ople`},
		{`(?i)(child)(?:ren)?$`, `$1ren`},
		{`(?i)eaux$`, `$0`},
		{`(?i)m[ae]n$`, `men`},
		{`thou`, `you`},
	}

	for _, r := range pluralizationRules {
		c.AddPluralRule(r.rule, r.replacement)
	}
}
    353 
// loadSingularizationRules -- register the built-in plural -> singular rules.
// Order is significant: sanitizeWord scans the rule list from last to first
// and applies the first match, so entries further down the table take
// precedence and the first entry (`s$`) is the catch-all fallback.
func (c *Client) loadSingularizationRules() {
	var singularizationRules = []struct {
		rule        string
		replacement string
	}{
		{`(?i)s$`, ``},
		{`(?i)(ss)$`, `$1`},
		{`(?i)(wi|kni|(?:after|half|high|low|mid|non|night|[^\w]|^)li)ves$`, `$1fe`},
		{`(?i)(ar|(?:wo|[ae])l|[eo][ao])ves$`, `$1f`},
		{`(?i)ies$`, `y`},
		{`(?i)(dg|ss|ois|lk|ok|wn|mb|th|ch|ec|oal|is|ck|ix|sser|ts|wb)ies$`, `$1ie`},
		{`(?i)\b(l|(?:neck|cross|hog|aun)?t|coll|faer|food|gen|goon|group|hipp|junk|vegg|(?:pork)?p|charl|calor|cut)ies$`, `$1ie`}, //nolint:lll
		{`(?i)\b(mon|smil)ies$`, `$1ey`},
		{`(?i)\b((?:tit)?m|l)ice$`, `$1ouse`},
		{`(?i)(seraph|cherub)im$`, `$1`},
		{`(?i)(x|ch|ss|sh|zz|tto|go|cho|alias|[^aou]us|t[lm]as|gas|(?:her|at|gr)o|[aeiou]ris)(?:es)?$`, `$1`},
		{`(?i)(analy|diagno|parenthe|progno|synop|the|empha|cri|ne)(?:sis|ses)$`, `$1sis`},
		{`(?i)(movie|twelve|abuse|e[mn]u)s$`, `$1`},
		{`(?i)(test)(?:is|es)$`, `$1is`},
		{`(?i)(alumn|syllab|vir|radi|nucle|fung|cact|stimul|termin|bacill|foc|uter|loc|strat)(?:us|i)$`, `$1us`},              //nolint:lll,misspell
		{`(?i)(agend|addend|millenni|dat|extrem|bacteri|desiderat|strat|candelabr|errat|ov|symposi|curricul|quor)a$`, `$1um`}, //nolint:lll,misspell
		{`(?i)(apheli|hyperbat|periheli|asyndet|noumen|phenomen|criteri|organ|prolegomen|hedr|automat)a$`, `$1on`},
		{`(?i)(alumn|alg|vertebr)ae$`, `$1a`},
		{`(?i)(cod|mur|sil|vert|ind)ices$`, `$1ex`},
		{`(?i)(matr|append)ices$`, `$1ix`},
		{`(?i)(pe)(rson|ople)$`, `$1rson`},
		{`(?i)(child)ren$`, `$1`},
		{`(?i)(eau)x?$`, `$1`},
		{`(?i)men$`, `man`},
	}

	for _, r := range singularizationRules {
		c.AddSingularRule(r.rule, r.replacement)
	}
}
    389 
// loadUncountableRules -- register the built-in uncountable words.
// Each entry goes through AddUncountableRule: plain words are stored in the
// uncountables set, while entries starting with `(` are added as plural and
// singular rules that replace the match with itself (`$0`).
func (c *Client) loadUncountableRules() { //nolint:funlen
	var uncountableRules = []string{
		// Singular words with no plurals.
		`adulthood`,
		`advice`,
		`agenda`,
		`aid`,
		`aircraft`,
		`alcohol`,
		`ammo`,
		`analytics`,
		`anime`,
		`athletics`,
		`audio`,
		`bison`,
		`blood`,
		`bream`,
		`buffalo`,
		`butter`,
		`carp`,
		`cash`,
		`chassis`,
		`chess`,
		`clothing`,
		`cod`,
		`commerce`,
		`cooperation`,
		`corps`,
		`debris`,
		`diabetes`,
		`digestion`,
		`elk`,
		`energy`,
		`equipment`,
		`excretion`,
		`expertise`,
		`firmware`,
		`flounder`,
		`fun`,
		`gallows`,
		`garbage`,
		`graffiti`,
		`hardware`,
		`headquarters`,
		`health`,
		`herpes`,
		`highjinks`,
		`homework`,
		`housework`,
		`information`,
		`jeans`,
		`justice`,
		`kudos`,
		`labour`,
		`literature`,
		`machinery`,
		`mackerel`,
		`mail`,
		`media`,
		`mews`,
		`moose`,
		`music`,
		`mud`,
		`manga`,
		`news`,
		`only`,
		`personnel`,
		`pike`,
		`plankton`,
		`pliers`,
		`police`,
		`pollution`,
		`premises`,
		`rain`,
		`research`,
		`rice`,
		`salmon`,
		`scissors`,
		`series`,
		`sewage`,
		`shambles`,
		`shrimp`,
		`software`,
		`staff`,
		`swine`,
		`tennis`,
		`traffic`,
		`transportation`,
		`trout`,
		`tuna`,
		`wealth`,
		`welfare`,
		`whiting`,
		`wildebeest`,
		`wildlife`,
		`you`,
		// Regexes.
		`(?i)pok[eé]mon$`,  // "pokemon", "pokémon"
		`(?i)[^aeiou]ese$`, // "chinese", "japanese"
		`(?i)deer$`,        // "deer", "reindeer"
		`(?i)(fish)$`,      // "fish", "blowfish", "angelfish"
		`(?i)measles$`,     // "measles"
		`(?i)o[iu]s$`,      // "carnivorous"
		`(?i)pox$`,         // "chickenpox", "smallpox"
		`(?i)sheep$`,       // "sheep"
	}

	for _, w := range uncountableRules {
		c.AddUncountableRule(w)
	}
}
	taldir Directory service to resolve wallet mailboxes by messenger addresses
	Log \| Files \| Refs \| Submodules \| README \| LICENSE