Add language-specific casing rules (e.g. for Turkish) to UTF8String

fix utf8string casing
This commit is contained in:
Lars Doucet
2018-10-02 13:15:12 -05:00
parent fc3bba8c38
commit 7c9b3e657f

View File

@@ -209,22 +209,35 @@ abstract UTF8String(String) from String to String {
Returns a String where all characters of `this` String are lower case. Returns a String where all characters of `this` String are lower case.
Affects the characters `A-Z`. Other characters remain unchanged. Affects the characters `A-Z`. Other characters remain unchanged.
If `language` is specified, language-specific casing rules will be followed.
**/ **/
public function toLowerCase ():String { public function toLowerCase (language:Language=null):String {
if(language == null) language = STANDARD;
#if sys #if sys
if (lowercaseMap == null) { if (lowercaseMap == null) {
lowercaseMap = new Map<Int, Int> (); lowercaseMap = new Map<Int, Int> ();
Utf8Ext.fillUpperToLowerMap (uppercaseMap); Utf8Ext.fillUpperToLowerMap (lowercaseMap);
} }
var r = new Utf8 (); var r = new Utf8 ();
Utf8.iter (this, function (v) { Utf8.iter (this, function (v) {
if(language != STANDARD)
{
var v2 = toLowerCaseLanguageFixes(v,language);
if(v2 != v)
{
r.addChar(v2);
return;
}
}
r.addChar (lowercaseMap.exists (v) ? lowercaseMap[v] : v); r.addChar (lowercaseMap.exists (v) ? lowercaseMap[v] : v);
}); });
@@ -239,6 +252,19 @@ abstract UTF8String(String) from String to String {
} }
/**
	Applies language-specific lower-casing overrides to a single character.

	@param v Unicode code point of the character (not its UTF-8 byte sequence).
	@param language Language whose special casing rules should be applied.
	@return The overridden lower-case code point, or `v` unchanged when
	`language` defines no special rule for it. A result different from `v`
	takes precedence over the general lower-case map in the caller.
**/
private static function toLowerCaseLanguageFixes(v:Int,language:Language):Int
{
	return switch(language)
	{
		case TURKISH:
			switch(v)
			{
				// Turkish pairs dotless I/ı and dotted İ/i, so plain `I` must NOT become `i`.
				case 0x49: 0x131; //I-->ı (capital I to small dotless ı)
				// U+0130 is the code point of İ; 0xC4B0 would be its UTF-8 byte sequence (C4 B0), which never matches here.
				case 0x130: 0x69; //İ-->i (capital dotted İ to small i)
				default: v;
			}
		default: v;
	}
}
/** /**
Returns the String itself. Returns the String itself.
@@ -254,22 +280,35 @@ abstract UTF8String(String) from String to String {
Returns a String where all characters of `this` String are upper case. Returns a String where all characters of `this` String are upper case.
Affects the characters `a-z`. Other characters remain unchanged. Affects the characters `a-z`. Other characters remain unchanged.
If `language` is specified, language-specific casing rules will be followed.
**/ **/
public function toUpperCase ():String { public function toUpperCase (language:Language=null):String {
if(language == null) language = STANDARD;
#if sys #if sys
if (uppercaseMap == null) { if (uppercaseMap == null) {
uppercaseMap = new Map<Int, Int> (); uppercaseMap = new Map<Int, Int> ();
Utf8Ext.fillLowerToUpperMap (uppercaseMap); Utf8Ext.fillLowerToUpperMap (uppercaseMap);
} }
var r = new Utf8 (); var r = new Utf8 ();
Utf8.iter (this, function(v) { Utf8.iter (this, function(v) {
if(language != STANDARD)
{
var v2 = toUpperCaseLanguageFixes(v,language);
if(v2 != v)
{
r.addChar(v2);
return;
}
}
r.addChar (uppercaseMap.exists (v) ? uppercaseMap[v] : v); r.addChar (uppercaseMap.exists (v) ? uppercaseMap[v] : v);
}); });
@@ -283,6 +322,20 @@ abstract UTF8String(String) from String to String {
#end #end
} }
/**
	Applies language-specific upper-casing overrides to a single character.

	@param v Unicode code point of the character (not its UTF-8 byte sequence).
	@param language Language whose special casing rules should be applied.
	@return The overridden upper-case code point, or `v` unchanged when
	`language` defines no special rule for it. A result different from `v`
	takes precedence over the general upper-case map in the caller.
**/
private static function toUpperCaseLanguageFixes(v:Int,language:Language):Int
{
	return switch(language)
	{
		case TURKISH:
			switch(v)
			{
				// U+0130 is the code point of İ; 0xC4B0 is its UTF-8 byte sequence (C4 B0)
				// and, used as a code point, would emit an unrelated Hangul syllable.
				case 0x69: 0x130; //i-->İ (small i to capital dotted İ)
				// NOTE(review): ı (U+0131) --> I is presumably covered by the general upper-case map; confirm.
				default: v;
			}
		default: v;
	}
}
@:op(A == B) private static function equals (a:UTF8String, b:UTF8String):Bool { @:op(A == B) private static function equals (a:UTF8String, b:UTF8String):Bool {
@@ -748,4 +801,11 @@ for (i in 0...51) map[0x10CC0+i] = 0x10C80+i;
for (i in 0...32) map[0x118C0+i] = 0x118A0+i; for (i in 0...32) map[0x118C0+i] = 0x118A0+i;
for (i in 0...34) map[0x1E922+i] = 0x1E900+i; for (i in 0...34) map[0x1E922+i] = 0x1E900+i;
} }
}
/**
	Selects the casing rule set used by `toLowerCase` and `toUpperCase`.
	Passing `null` to those functions is treated as `STANDARD`.
**/
enum Language
{
STANDARD; //default: no language-specific overrides are applied, only the general case maps
TURKISH; //Turkish: small `i` upper-cases to dotted capital `İ` rather than `I`
//add more special-case languages as necessary
} }