Merge UTF8String, unifill

This commit is contained in:
Joshua Granick
2017-06-05 16:54:57 -07:00
parent 10326ec249
commit af965d9b04
3 changed files with 346 additions and 357 deletions

View File

@@ -72,6 +72,9 @@ The typed array implementation was developed in collaboration with Sven Bergstr
and the hxtypedarray project, which is available under an "MIT" license.
For details, see https://github.com/underscorediscovery/hxtypedarray
The unifill project, which is available under an "MIT" license, is included in the
[lime/text/unifill/](lime/text/unifill/) directory. For details, see https://github.com/mandel59/unifill
lime/project/Version is adapted from thx.semver project, which is available under
an "MIT" license. For details, see https://github.com/fponticelli/thx.semver

View File

@@ -1,8 +1,348 @@
package lime.text;
// Use org.zamedev.lib.tools.CaseMapsGenerator to generate this file
class Utf8ExtInternal {
import haxe.Utf8;
import lime.text.unifill.Unifill;
import lime.text.unifill.CodePoint;
/**
	A `String` wrapper whose length, indexing, searching and comparison are
	routed through `lime.text.unifill.Unifill` and `haxe.Utf8` instead of the
	native `String` methods. It converts implicitly from and to `String`.
**/
abstract UTF8String(String) from String to String {

	#if sys
	// Lazily built code point -> code point tables used by toLowerCase/toUpperCase.
	private static var lowercaseMap:Map<Int, Int>;
	private static var uppercaseMap:Map<Int, Int>;
	#end

	/**
		The number of characters in `this` String, as reported by `Utf8.length`.
	**/
	public var length (get, never):Int;

	/**
		Creates a copy from a given String.
		@param	str	The String to copy.
	**/
	public function new (str:String) {
		this = new String (str);
	}

	/**
		Returns the character at position `index` of `this` String.
		If `index` is negative or exceeds `this.length`, the empty String `""`
		is returned.
		@param	index	Character position to read.
		@return	The character at `index`.
	**/
	public function charAt (index:Int):String {
		return Unifill.uCharAt (this, index);
	}

	/**
		Returns the character code at position `index` of `this` String.
		If `index` is negative or exceeds `this.length`, `null` is returned.
		To obtain the character code of a single character, `"x".code` can be
		used instead to inline the character code at compile time. Note that
		this only works on String literals of length 1.
		NOTE(review): the out-of-range result is whatever `Utf8.charCodeAt`
		yields on the current target -- confirm it matches the contract above.
		@param	index	Character position to read.
		@return	The character code at `index`.
	**/
	public function charCodeAt (index:Int):Null<Int> {
		return Utf8.charCodeAt (this, index);
	}

	/**
		Returns the String corresponding to the character code `code`.
		If `code` is negative or has another invalid value, the result is
		unspecified.
		@param	code	Character code, converted with `CodePoint.fromInt`.
		@return	The resulting String.
	**/
	public static function fromCharCode (code:Int):String {
		return CodePoint.fromInt (code);
	}

	/**
		Returns the String corresponding to the array of character codes `codes`.
		Each code is converted with `CodePoint.fromInt` and the results are
		concatenated in array order. An empty array yields `""`.
		@param	codes	Character codes to convert.
		@return	The concatenated String.
	**/
	public static function fromCharCodes (codes:Array<Int>):String {
		var s = "";
		for (code in codes) {
			s += CodePoint.fromInt (code);
		}
		return s;
	}

	/**
		Returns the position of the leftmost occurrence of `str` within `this`
		String.
		If `startIndex` is given, the search is performed within the substring
		of `this` String starting from `startIndex`. Otherwise the search is
		performed within `this` String. In either case, the returned position
		is relative to the beginning of `this` String.
		If `str` cannot be found, -1 is returned.
		@param	str	The String to look for.
		@param	startIndex	Position at which the search starts.
		@return	The position of the match, or -1.
	**/
	public function indexOf (str:String, startIndex:Int = 0):Int {
		return Unifill.uIndexOf (this, str, startIndex);
	}

	/**
		Returns the position of the rightmost occurrence of `str` within `this`
		String.
		If `startIndex` is given, the search is performed within the substring
		of `this` String from 0 to `startIndex`. Otherwise the search is
		performed within `this` String. In either case, the returned position
		is relative to the beginning of `this` String.
		If `str` cannot be found, -1 is returned.
		@param	str	The String to look for.
		@param	startIndex	Optional upper bound for the search.
		@return	The position of the match, or -1.
	**/
	public function lastIndexOf (str:String, ?startIndex:Int):Int {
		return Unifill.uLastIndexOf (this, str, startIndex);
	}

	/**
		Splits `this` String at each occurrence of `delimiter`.
		If `this` String is the empty String `""`, the result is not consistent
		across targets and may either be `[]` (on Js, Cpp) or `[""]`.
		If `delimiter` is the empty String `""`, `this` String is split into an
		Array of `this.length` elements, where the elements correspond to the
		characters of `this` String.
		If `delimiter` is not found within `this` String, the result is an Array
		with one element, which equals `this` String.
		If `delimiter` is null, the result is unspecified.
		Otherwise, `this` String is split into parts at each occurrence of
		`delimiter`. If `this` String starts (or ends) with `delimiter`, the
		result `Array` contains a leading (or trailing) empty String `""` element.
		Two subsequent delimiters also result in an empty String `""` element.
		@param	delimiter	The separator to split at.
		@return	The parts of `this` String.
	**/
	public function split (delimiter:String):Array<String> {
		return Unifill.uSplit (this, delimiter);
	}

	/**
		Returns `len` characters of `this` String, starting at position `pos`.
		If `len` is omitted, all characters from position `pos` to the end of
		`this` String are included.
		If `pos` is negative, its value is calculated from the end of `this`
		String by `this.length + pos`. If this yields a negative value, 0 is
		used instead.
		If the calculated position + `len` exceeds `this.length`, the characters
		from that position to the end of `this` String are returned.
		If `len` is negative, the result is unspecified.
		@param	pos	Start position, may be negative.
		@param	len	Optional number of characters to return.
		@return	The requested part of `this` String.
	**/
	public function substr (pos:Int, ?len:Int):String {
		// `Utf8.sub` takes a plain `Int` length, so an omitted `len` must be
		// resolved here; the start position is normalised the same way so that
		// every target receives arguments that are already in range.
		var total = Utf8.length (this);
		if (pos < 0) {
			pos = total + pos;
			if (pos < 0) pos = 0;
		}
		if (pos >= total) return "";
		var count = total - pos;
		// A negative `len` is passed through unchanged (documented as unspecified).
		if (len != null && len < count) count = len;
		return Utf8.sub (this, pos, count);
	}

	/**
		Returns the part of `this` String from `startIndex` to but not including `endIndex`.
		If `startIndex` or `endIndex` are negative, 0 is used instead.
		If `startIndex` exceeds `endIndex`, they are swapped.
		If the (possibly swapped) `endIndex` is omitted or exceeds
		`this.length`, `this.length` is used instead.
		If the (possibly swapped) `startIndex` exceeds `this.length`, the empty
		String `""` is returned.
		@param	startIndex	First position to include.
		@param	endIndex	Optional first position to exclude.
		@return	The requested part of `this` String.
	**/
	public function substring (startIndex:Int, ?endIndex:Int):String {
		return Unifill.uSubstring (this, startIndex, endIndex);
	}

	/**
		Returns a String where all characters of `this` String are lower case.
		On `sys` targets every character code that has an entry in the table
		filled by `Utf8Ext.fillUpperToLowerMap` is replaced by its mapped code;
		all other codes are copied unchanged. Other targets defer to
		`String.toLowerCase`.
		@return	The lower-cased String.
	**/
	public function toLowerCase ():String {
		#if sys
		if (lowercaseMap == null) {
			lowercaseMap = new Map<Int, Int> ();
			// Fill the map that was just created (not `uppercaseMap`).
			Utf8Ext.fillUpperToLowerMap (lowercaseMap);
		}
		var r = new Utf8 ();
		Utf8.iter (this, function (v) {
			r.addChar (lowercaseMap.exists (v) ? lowercaseMap[v] : v);
		});
		return r.toString ();
		#else
		return this.toLowerCase ();
		#end
	}

	/**
		Returns the String itself.
	**/
	public function toString ():String {
		return this;
	}

	/**
		Returns a String where all characters of `this` String are upper case.
		On `sys` targets every character code that has an entry in the table
		filled by `Utf8Ext.fillLowerToUpperMap` is replaced by its mapped code;
		all other codes are copied unchanged. Other targets defer to
		`String.toUpperCase`.
		@return	The upper-cased String.
	**/
	public function toUpperCase ():String {
		#if sys
		if (uppercaseMap == null) {
			uppercaseMap = new Map<Int, Int> ();
			Utf8Ext.fillLowerToUpperMap (uppercaseMap);
		}
		var r = new Utf8 ();
		Utf8.iter (this, function (v) {
			r.addChar (uppercaseMap.exists (v) ? uppercaseMap[v] : v);
		});
		return r.toString ();
		#else
		return this.toUpperCase ();
		#end
	}

	// Comparison operators. All of them interpret the result of
	// `Unifill.uCompare`; the checks below assume it returns exactly
	// -1, 0 or 1 -- TODO confirm against the unifill sources.

	/** `a == b`: true when `Unifill.uCompare` reports 0. **/
	@:op(A == B) private static function equals (a:UTF8String, b:UTF8String):Bool {
		return Unifill.uCompare (a, b) == 0;
	}

	/** `a < b`: true when `Unifill.uCompare` reports -1. **/
	@:op(A < B) private static function lt (a:UTF8String, b:UTF8String):Bool {
		return Unifill.uCompare (a, b) == -1;
	}

	/** `a > b`: true when `Unifill.uCompare` reports 1. **/
	@:op(A > B) private static function gt (a:UTF8String, b:UTF8String):Bool {
		return Unifill.uCompare (a, b) == 1;
	}

	/** `a <= b`: true unless `Unifill.uCompare` reports 1. **/
	@:op(A <= B) private static function lteq (a:UTF8String, b:UTF8String):Bool {
		return Unifill.uCompare (a, b) != 1;
	}

	/** `a >= b`: true unless `Unifill.uCompare` reports -1. **/
	@:op(A >= B) private static function gteq (a:UTF8String, b:UTF8String):Bool {
		return Unifill.uCompare (a, b) != -1;
	}

	/** `a + b`: concatenates both operands through a `StringBuf`. **/
	@:op(A + B) private static function plus (a:UTF8String, b:UTF8String):UTF8String {
		var sb = new StringBuf ();
		sb.add (Std.string (a));
		sb.add (Std.string (b));
		return sb.toString ();
	}

	// Get & Set Methods

	/** Getter for `length`; delegates to `Utf8.length`. **/
	private function get_length ():Int {
		return Utf8.length (this);
	}

}
// generated from org.zamedev.lib.tools.CaseMapsGenerator
private class Utf8Ext {
public static function fillUpperToLowerMap(map : Map<Int, Int>) : Void {
var i = 0;
for (i in 0...26) map[0x41+i] = 0x61+i;
@@ -381,4 +721,4 @@ for (i in 0...51) map[0x10CC0+i] = 0x10C80+i;
for (i in 0...32) map[0x118C0+i] = 0x118A0+i;
for (i in 0...34) map[0x1E922+i] = 0x1E900+i;
}
}
}

View File

@@ -1,354 +0,0 @@
package lime.text;
import haxe.Utf8;
#if unifill
import lime.text.unifill.Unifill;
import lime.text.unifill.CodePoint;
#end
/**
 * A `String` wrapper that converts implicitly from and to `String`.
 * When the `unifill` define is set, its operations are routed through
 * `lime.text.unifill.Unifill` and `haxe.Utf8`; otherwise every method
 * falls back to the native `String` implementation.
 */
abstract UTFString(String) from String to String
{
	#if (unifill && (neko || php || cpp))
	// Case tables are built once by initialize() and then reused.
	static var inited:Bool = false;
	static var lcaseMap:Map<Int, Int>;
	static var ucaseMap:Map<Int, Int>;
	#end

	/**
		The number of characters in `this` String.
	**/
	public var length(get, never) : Int;

	/**
		Creates a copy from a given String.
		@param	str	The String to copy.
	**/
	public function new(str:String)
	{
		this = new String(str);
	}

	/**
		Builds and caches the upper-to-lower and lower-to-upper character maps.
		Does nothing unless `unifill` is defined on neko, php or cpp.
	**/
	static function initialize() : Void
	{
		#if (unifill && (neko || php || cpp))
		lcaseMap = new Map<Int, Int>();
		ucaseMap = new Map<Int, Int>();
		Utf8ExtInternal.fillUpperToLowerMap(lcaseMap);
		Utf8ExtInternal.fillLowerToUpperMap(ucaseMap);
		inited = true;
		#end
	}

	/**
		Returns a String where all characters of `this` String are upper case.
		With `unifill` on neko, php or cpp, every character code that has an
		entry in the cached lower-to-upper map is replaced by its mapped code
		and all other codes are copied unchanged. Otherwise this defers to
		`String.toUpperCase`.
		@return	The upper-cased String.
	**/
	public function toUpperCase() : String
	{
		#if (unifill && (neko || php || cpp))
		if (!inited) initialize();
		var r = new Utf8();
		Utf8.iter(this, function(v) {
			r.addChar(ucaseMap.exists(v) ? ucaseMap[v] : v);
		});
		return r.toString();
		#else
		return this.toUpperCase();
		#end
	}

	/**
		Returns a String where all characters of `this` String are lower case.
		With `unifill` on neko, php or cpp, every character code that has an
		entry in the cached upper-to-lower map is replaced by its mapped code
		and all other codes are copied unchanged. Otherwise this defers to
		`String.toLowerCase`.
		@return	The lower-cased String.
	**/
	public function toLowerCase() : String
	{
		#if (unifill && (neko || php || cpp))
		if (!inited) initialize();
		var r = new Utf8();
		Utf8.iter(this, function(v) {
			r.addChar(lcaseMap.exists(v) ? lcaseMap[v] : v);
		});
		return r.toString();
		#else
		return this.toLowerCase();
		#end
	}

	/**
		Returns the character at position `index` of `this` String.
		If `index` is negative or exceeds `this.length`, the empty String `""`
		is returned.
		@param	index	Character position to read.
		@return	The character at `index`.
	**/
	public function charAt(index : Int) : String
	{
		#if unifill
		return Unifill.uCharAt(this, index);
		#else
		return this.charAt(index);
		#end
	}

	/**
		Returns the character code at position `index` of `this` String.
		If `index` is negative or exceeds `this.length`, `null` is returned.
		To obtain the character code of a single character, `"x".code` can be
		used instead to inline the character code at compile time. Note that
		this only works on String literals of length 1.
		NOTE(review): with `unifill` the out-of-range result is whatever
		`Utf8.charCodeAt` yields on the current target -- confirm.
		@param	index	Character position to read.
		@return	The character code at `index`.
	**/
	public function charCodeAt(index : Int) : Null<Int>
	{
		#if unifill
		return Utf8.charCodeAt(this, index);
		#else
		return this.charCodeAt(index);
		#end
	}

	/**
		Returns the position of the leftmost occurrence of `str` within `this`
		String.
		If `startIndex` is given, the search is performed within the substring
		of `this` String starting from `startIndex`. Otherwise the search is
		performed within `this` String. In either case, the returned position
		is relative to the beginning of `this` String.
		If `str` cannot be found, -1 is returned.
		@param	str	The String to look for.
		@param	startIndex	Position at which the search starts.
		@return	The position of the match, or -1.
	**/
	public function indexOf(str : String, ?startIndex : Int = 0) : Int
	{
		#if unifill
		return Unifill.uIndexOf(this, str, startIndex);
		#else
		return this.indexOf(str, startIndex);
		#end
	}

	/**
		Returns the position of the rightmost occurrence of `str` within `this`
		String.
		If `startIndex` is given, the search is performed within the substring
		of `this` String from 0 to `startIndex`. Otherwise the search is
		performed within `this` String. In either case, the returned position
		is relative to the beginning of `this` String.
		If `str` cannot be found, -1 is returned.
		@param	str	The String to look for.
		@param	startIndex	Optional upper bound for the search.
		@return	The position of the match, or -1.
	**/
	public function lastIndexOf(str : String, ?startIndex : Int) : Int
	{
		#if unifill
		return Unifill.uLastIndexOf(this, str, startIndex);
		#else
		return this.lastIndexOf(str, startIndex);
		#end
	}

	/**
		Splits `this` String at each occurrence of `delimiter`.
		If `this` String is the empty String `""`, the result is not consistent
		across targets and may either be `[]` (on Js, Cpp) or `[""]`.
		If `delimiter` is the empty String `""`, `this` String is split into an
		Array of `this.length` elements, where the elements correspond to the
		characters of `this` String.
		If `delimiter` is not found within `this` String, the result is an Array
		with one element, which equals `this` String.
		If `delimiter` is null, the result is unspecified.
		Otherwise, `this` String is split into parts at each occurrence of
		`delimiter`. If `this` String starts (or ends) with `delimiter`, the
		result `Array` contains a leading (or trailing) empty String `""` element.
		Two subsequent delimiters also result in an empty String `""` element.
		@param	delimiter	The separator to split at.
		@return	The parts of `this` String.
	**/
	public function split(delimiter : String) : Array<String>
	{
		#if unifill
		return Unifill.uSplit(this, delimiter);
		#else
		return this.split(delimiter);
		#end
	}

	/**
		Returns `len` characters of `this` String, starting at position `pos`.
		If `len` is omitted, all characters from position `pos` to the end of
		`this` String are included.
		If `pos` is negative, its value is calculated from the end of `this`
		String by `this.length + pos`. If this yields a negative value, 0 is
		used instead.
		If the calculated position + `len` exceeds `this.length`, the characters
		from that position to the end of `this` String are returned.
		If `len` is negative, the result is unspecified.
		@param	pos	Start position, may be negative.
		@param	len	Optional number of characters to return.
		@return	The requested part of `this` String.
	**/
	public function substr(pos : Int, ?len : Int) : String
	{
		#if unifill
		// `Utf8.sub` takes a plain `Int` length, so an omitted `len` must be
		// resolved here; the start position is normalised the same way so that
		// every target receives arguments that are already in range.
		var total = Utf8.length(this);
		if (pos < 0)
		{
			pos = total + pos;
			if (pos < 0) pos = 0;
		}
		if (pos >= total) return "";
		var count = total - pos;
		// A negative `len` is passed through unchanged (documented as unspecified).
		if (len != null && len < count) count = len;
		return Utf8.sub(this, pos, count);
		#else
		return this.substr(pos, len);
		#end
	}

	/**
		Returns the part of `this` String from `startIndex` to but not including `endIndex`.
		If `startIndex` or `endIndex` are negative, 0 is used instead.
		If `startIndex` exceeds `endIndex`, they are swapped.
		If the (possibly swapped) `endIndex` is omitted or exceeds
		`this.length`, `this.length` is used instead.
		If the (possibly swapped) `startIndex` exceeds `this.length`, the empty
		String `""` is returned.
		@param	startIndex	First position to include.
		@param	endIndex	Optional first position to exclude.
		@return	The requested part of `this` String.
	**/
	public function substring(startIndex : Int, ?endIndex : Int) : String
	{
		#if unifill
		return Unifill.uSubstring(this, startIndex, endIndex);
		#else
		return this.substring(startIndex, endIndex);
		#end
	}

	/**
		Returns the String itself.
	**/
	public function toString() : String
	{
		return this;
	}

	/**
		Returns the String corresponding to the character code `code`.
		If `code` is negative or has another invalid value, the result is
		unspecified.
		@param	code	Character code to convert.
		@return	The resulting String.
	**/
	public static function fromCharCode(code : Int) : String
	{
		#if unifill
		return CodePoint.fromInt(code);
		#else
		return String.fromCharCode(code);
		#end
	}

	/**
		Returns the String corresponding to the array of character codes `codes`.
		If `unifill` is defined, each code is converted with `CodePoint.fromInt`,
		otherwise `String.fromCharCode` is used for each code. The results are
		concatenated in array order.
		@param	codes	Character codes to convert.
		@return	The concatenated String.
	**/
	public static function fromCharCodes(codes : Array<Int>) : String
	{
		var s = "";
		for (code in codes)
		{
			#if unifill
			s += CodePoint.fromInt(code);
			#else
			s += String.fromCharCode(code);
			#end
		}
		return s;
	}

	/**********PRIVATE*************/

	// Comparison operators. With `unifill` they interpret the result of
	// `Unifill.uCompare`; the checks assume it returns exactly -1, 0 or 1
	// -- TODO confirm. Without it they compare the underlying Strings.

	/** `a == b` **/
	@:op(A == B) static function equals(a:UTFString, b:UTFString) : Bool
	{
		#if unifill
		return Unifill.uCompare(a, b) == 0;
		#else
		return Std.string(a) == Std.string(b);
		#end
	}

	/** `a < b` **/
	@:op(A < B) static function lt(a:UTFString, b:UTFString) : Bool
	{
		#if unifill
		return Unifill.uCompare(a, b) == -1;
		#else
		return Std.string(a) < Std.string(b);
		#end
	}

	/** `a > b` **/
	@:op(A > B) static function gt(a:UTFString, b:UTFString) : Bool
	{
		#if unifill
		return Unifill.uCompare(a, b) == 1;
		#else
		return Std.string(a) > Std.string(b);
		#end
	}

	/** `a <= b` **/
	@:op(A <= B) static function lteq(a:UTFString, b:UTFString) : Bool
	{
		#if unifill
		return Unifill.uCompare(a, b) != 1;
		#else
		return Std.string(a) <= Std.string(b);
		#end
	}

	/** `a >= b` **/
	@:op(A >= B) static function gteq(a:UTFString, b:UTFString) : Bool
	{
		#if unifill
		return Unifill.uCompare(a, b) != -1;
		#else
		return Std.string(a) >= Std.string(b);
		#end
	}

	/** `a + b`: concatenates both operands. **/
	@:op(A + B) static function plus(a:UTFString, b:UTFString) : String
	{
		#if unifill
		var sb = new StringBuf();
		sb.add(Std.string(a));
		sb.add(Std.string(b));
		return sb.toString();
		#else
		return Std.string(a) + Std.string(b);
		#end
	}

	/** Getter for `length`. **/
	private function get_length() : Int
	{
		#if unifill
		return Utf8.length(this);
		#else
		return this.length;
		#end
	}
}