Files
lime/lime/text/unifill/Utf8.hx
2017-06-05 16:22:31 -07:00

368 lines
9.2 KiB
Haxe
Executable File

package lime.text.unifill;
import haxe.io.Bytes;
import haxe.io.BytesBuffer;
abstract Utf8(StringU8) {
/**
Converts the code point `code` to a character as a Utf8 string.
**/
public static inline function fromCodePoint(codePoint : Int) : Utf8 {
var buf = new BytesBuffer();
Utf8Impl.encode_code_point(function(c) buf.addByte(c), codePoint);
return new Utf8(StringU8.ofBytes(buf.getBytes()));
}
/**
Converts `codePoints` to a Utf8 string.
**/
public static inline function fromCodePoints(codePoints : Iterable<Int>) : Utf8 {
var buf = new BytesBuffer();
for (c in codePoints) {
Utf8Impl.encode_code_point(function(c) buf.addByte(c), c);
}
return new Utf8(StringU8.ofBytes(buf.getBytes()));
}
public static inline function fromString(s : String) : Utf8 {
return new Utf8(StringU8.fromString(s));
}
public static inline function fromBytes(b : Bytes) : Utf8 {
return new Utf8(StringU8.fromBytes(b));
}
public static inline function encodeWith(f : Int -> Void, c : Int) : Void {
Utf8Impl.encode_code_point(f, c);
}
public var length(get, never) : Int;
/**
Returns the UTF-8 code unit at position `index` of `this`.
**/
public inline function codeUnitAt(index : Int) : Int {
return this.codeUnitAt(index);
}
/**
Returns the Unicode code point at position `index` of
`this`.
**/
public function codePointAt(index : Int) : Int {
return Utf8Impl.decode_code_point(length, function(i) return codeUnitAt(i), index);
}
/**
Returns the character as a String at position `index` of
`this`.
**/
public inline function charAt(index : Int) : Utf8 {
return new Utf8(this.substr(index, codePointWidthAt(index)));
}
/**
Returns the number of Unicode code points from `beginIndex`
to `endIndex` in `this`.
**/
public function codePointCount(beginIndex : Int, endIndex : Int) : Int {
var index = beginIndex;
var i = 0;
while (index < endIndex) {
index += codePointWidthAt(index);
++i;
}
return i;
}
/**
Returns the number of units of the code point at position
`index` of `this`.
**/
public inline function codePointWidthAt(index : Int) : Int {
var c = codeUnitAt(index);
return Utf8Impl.code_point_width(c);
}
/**
Returns the number of units of the code point before
position `index` of `this`.
**/
public inline function codePointWidthBefore(index : Int) : Int {
return Utf8Impl.find_prev_code_point(function(i) return codeUnitAt(i), index);
}
/**
Returns the index within `this` that is offset from
position `index` by `codePointOffset` code points.
**/
public inline function offsetByCodePoints(index : Int, codePointOffset : Int) : Int {
return if (codePointOffset >= 0) {
forward_offset_by_code_points(index, codePointOffset);
} else {
backward_offset_by_code_points(index, -codePointOffset);
}
}
/**
Returns `len` code units of `this`, starting at position pos.
**/
public inline function substr(index : Int, ?len : Int) : Utf8 {
return new Utf8(this.substr(index, len));
}
/**
Validates `this` Utf8 string.
If the code unit sequence of `this` is invalid,
`Exception.InvalidCodeUnitSequence` is throwed.
**/
public function validate() : Void {
var len = this.length;
var accessor = function(i) return codeUnitAt(i);
var i = 0;
while (i < len) {
Utf8Impl.decode_code_point(len, accessor, i);
i += codePointWidthAt(i);
}
}
public inline function toString() : String {
return this.toString();
}
public inline function toBytes() : Bytes {
return this.toBytes();
}
inline function new(s : StringU8) {
this = s;
}
inline function get_length() : Int {
return this.length;
}
inline function forward_offset_by_code_points(index : Int, codePointOffset : Int) : Int {
var len = this.length;
var i = 0;
while (i < codePointOffset && index < len) {
index += codePointWidthAt(index);
++i;
}
return index;
}
inline function backward_offset_by_code_points(index : Int, codePointOffset : Int) : Int {
var count = 0;
while (count < codePointOffset && 0 < index) {
index -= codePointWidthBefore(index);
++count;
}
return index;
}
}
private class Utf8Impl {
public static inline function code_point_width(c : Int) : Int {
return (c < 0xC0) ? 1 : (c < 0xE0) ? 2 : (c < 0xF0) ? 3 : (c < 0xF8) ? 4 : 1;
}
public static inline function find_prev_code_point(accessor : Int -> Int, index : Int) : Int {
var c1 = accessor(index - 1);
return (c1 < 0x80 || c1 >= 0xC0) ? 1
: (accessor(index - 2) & 0xE0 == 0xC0) ? 2
: (accessor(index - 3) & 0xF0 == 0xE0) ? 3
: (accessor(index - 4) & 0xF8 == 0xF0) ? 4
: 1;
}
public static inline function encode_code_point(addUnit : Int -> Void, codePoint : Int) : Void {
if (codePoint <= 0x7F) {
addUnit(codePoint);
} else if (codePoint <= 0x7FF) {
addUnit(0xC0 | (codePoint >> 6));
addUnit(0x80 | (codePoint & 0x3F));
} else if (codePoint <= 0xFFFF) {
addUnit(0xE0 | (codePoint >> 12));
addUnit(0x80 | ((codePoint >> 6) & 0x3F));
addUnit(0x80 | (codePoint & 0x3F));
} else if (codePoint <= 0x10FFFF) {
addUnit(0xF0 | (codePoint >> 18));
addUnit(0x80 | ((codePoint >> 12) & 0x3F));
addUnit(0x80 | ((codePoint >> 6) & 0x3F));
addUnit(0x80 | (codePoint & 0x3F));
} else {
throw new Exception.InvalidCodePoint(codePoint);
}
}
public static function decode_code_point(len : Int, accessor : Int -> Int, index : Int) : Int {
var i = index;
if (i < 0 || len <= i)
throw new Exception.InvalidCodeUnitSequence(index);
var c1 = accessor(i);
if (c1 < 0x80) {
return c1;
}
if (c1 < 0xC0) {
throw new Exception.InvalidCodeUnitSequence(index);
}
++i;
if (i < 0 || len <= i)
throw new Exception.InvalidCodeUnitSequence(index);
var c2 = accessor(i);
if (c1 < 0xE0) {
if ((c1 & 0x1E != 0) && (c2 & 0xC0 == 0x80))
return ((c1 & 0x3F) << 6) | (c2 & 0x7F);
else
throw new Exception.InvalidCodeUnitSequence(index);
}
++i;
if (i < 0 || len <= i)
throw new Exception.InvalidCodeUnitSequence(index);
var c3 = accessor(i);
if (c1 < 0xF0) {
if (((c1 & 0x0F != 0) || (c2 & 0x20 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80)
&& !(c1 == 0xED && 0xA0 <= c2 && c2 <= 0xBF))
return ((c1 & 0x1F) << 12) | ((c2 & 0x7F) << 6) | (c3 & 0x7F);
else
throw new Exception.InvalidCodeUnitSequence(index);
}
++i;
if (i < 0 || len <= i)
throw new Exception.InvalidCodeUnitSequence(index);
var c4 = accessor(i);
if (c1 < 0xF8) {
if (((c1 & 0x07 != 0) || (c2 & 0x30 != 0)) && (c2 & 0xC0 == 0x80) && (c3 & 0xC0 == 0x80) && (c4 & 0xC0 == 0x80)
&& !((c1 == 0xF4 && c2 > 0x8F) || c1 > 0xF4))
return ((c1 & 0x0F) << 18) | ((c2 & 0x7F) << 12) | ((c3 & 0x7F) << 6) | (c4 & 0x7F);
else
throw new Exception.InvalidCodeUnitSequence(index);
}
throw new Exception.InvalidCodeUnitSequence(index);
}
}
#if (neko || php || cpp || lua || macro)
private abstract StringU8(String) {
public static inline function fromString(s : String) : StringU8 {
return new StringU8(s);
}
public static inline function ofBytes(b : Bytes) : StringU8 {
return new StringU8(b.toString());
}
public static inline function fromBytes(b : Bytes) : StringU8 {
return new StringU8(b.toString());
}
public var length(get, never) : Int;
public inline function codeUnitAt(index : Int) : Int {
return StringTools.fastCodeAt(this, index);
}
public inline function substr(index : Int, ?len : Int) : StringU8 {
return new StringU8(this.substr(index, len));
}
public inline function toString() : String {
return this;
}
public inline function toBytes() : Bytes {
return Bytes.ofString(this);
}
inline function new(s : String) {
this = s;
}
inline function get_length() : Int {
return this.length;
}
}
#else
private abstract StringU8(Bytes) {
public static function fromString(s : String) : StringU8 {
var buf = new BytesBuffer();
var addUnit = buf.addByte;
for (i in new InternalEncodingIter(s, 0, s.length)) {
var c = InternalEncoding.codePointAt(s, i);
Utf8Impl.encode_code_point(addUnit, c);
}
return new StringU8(buf.getBytes());
}
public static inline function ofBytes(b : Bytes) : StringU8 {
return new StringU8(b);
}
public static inline function fromBytes(b : Bytes) : StringU8 {
var nb = clone_bytes(b);
return new StringU8(nb);
}
public var length(get, never) : Int;
public inline function codeUnitAt(index : Int) : Int {
return this.get(index);
}
public inline function substr(index : Int, ?len : Int) : StringU8 {
if (index < 0) {
index += this.length;
if (index < 0) index = 0;
}
if (len == null) len = this.length - index;
var b = this.sub(index, len);
return new StringU8(b);
}
public function toString() : String {
var buf = new StringBuf();
var i = 0;
var len = this.length;
var cua = function (i) return this.get(i);
while (i < len) {
var u = Utf8Impl.decode_code_point(len, cua, i);
buf.add(InternalEncoding.fromCodePoint(u));
i += Utf8Impl.code_point_width(codeUnitAt(i));
}
return buf.toString();
}
public inline function toBytes() : Bytes {
return clone_bytes(this);
}
static inline function clone_bytes(b : Bytes) : Bytes {
var nb = Bytes.alloc(b.length);
nb.blit(0, b, 0, b.length);
return nb;
}
inline function new(b : Bytes) {
this = b;
}
inline function get_length() : Int {
return this.length;
}
}
#end