OpenJDK / jdk / jdk
changeset 57620:f1007d3e1907
8174270: Consolidate ICU sources in one location
Reviewed-by: srl, joehw
line wrap: on
line diff
--- a/src/java.base/share/classes/java/net/IDN.java Mon Jan 13 16:56:21 2020 +0100 +++ b/src/java.base/share/classes/java/net/IDN.java Mon Jan 13 08:05:59 2020 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -29,9 +29,9 @@ import java.security.AccessController; import java.security.PrivilegedAction; -import sun.net.idn.StringPrep; -import sun.net.idn.Punycode; -import sun.text.normalizer.UCharacterIterator; +import jdk.internal.icu.impl.Punycode; +import jdk.internal.icu.text.StringPrep; +import jdk.internal.icu.text.UCharacterIterator; /** * Provides methods to convert internationalized domain names (IDNs) between @@ -226,7 +226,7 @@ InputStream stream = null; try { - final String IDN_PROFILE = "uidna.spp"; + final String IDN_PROFILE = "/sun/net/idn/uidna.spp"; if (System.getSecurityManager() != null) { stream = AccessController.doPrivileged(new PrivilegedAction<>() { public InputStream run() {
--- a/src/java.base/share/classes/java/text/Bidi.java Mon Jan 13 16:56:21 2020 +0100 +++ b/src/java.base/share/classes/java/text/Bidi.java Mon Jan 13 08:05:59 2020 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -35,7 +35,7 @@ package java.text; -import sun.text.bidi.BidiBase; +import jdk.internal.icu.text.BidiBase; /** * This class implements the Unicode Bidirectional Algorithm.
--- a/src/java.base/share/classes/java/text/CollationElementIterator.java Mon Jan 13 16:56:21 2020 +0100 +++ b/src/java.base/share/classes/java/text/CollationElementIterator.java Mon Jan 13 08:05:59 2020 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1996, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1996, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -41,7 +41,7 @@ import java.lang.Character; import java.util.Vector; import sun.text.CollatorUtilities; -import sun.text.normalizer.NormalizerBase; +import jdk.internal.icu.text.NormalizerBase; /** * The {@code CollationElementIterator} class is used as an iterator
--- a/src/java.base/share/classes/java/text/Normalizer.java Mon Jan 13 16:56:21 2020 +0100 +++ b/src/java.base/share/classes/java/text/Normalizer.java Mon Jan 13 08:05:59 2020 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2005, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -37,7 +37,7 @@ package java.text; -import sun.text.normalizer.NormalizerBase; +import jdk.internal.icu.text.NormalizerBase; /** * This class provides the method {@code normalize} which transforms Unicode
--- a/src/java.base/share/classes/java/text/RBTableBuilder.java Mon Jan 13 16:56:21 2020 +0100 +++ b/src/java.base/share/classes/java/text/RBTableBuilder.java Mon Jan 13 08:05:59 2020 -0800 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1999, 2019, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -42,8 +42,7 @@ import sun.text.UCompactIntArray; import sun.text.IntHashtable; import sun.text.ComposedCharIter; -import sun.text.CollatorUtilities; -import sun.text.normalizer.NormalizerImpl; +import jdk.internal.icu.impl.NormalizerImpl; /** * This class contains all the code to parse a RuleBasedCollator pattern
// Changeset diff header for this hunk: "--- /dev/null" -> "+++ b/src/java.base/share/classes/jdk/internal/icu/impl/BMPSet.java" (file added).
/*
 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 ******************************************************************************
 *
 * Copyright (C) 2009-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 ******************************************************************************
 */

package jdk.internal.icu.impl;

import jdk.internal.icu.text.UnicodeSet.SpanCondition;
import jdk.internal.icu.util.OutputInt;

/**
 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
 *
 * Latin-1: Look up bytes.
 * 2-byte characters: Bits organized vertically.
 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
 * Supplementary characters: Call contains() on the parent set.
 *
 * <p>The lookup tables are filled once by the constructor from the parent set's inversion list.
 * The inversion list array itself is kept by reference (it is not copied), and is consulted for
 * mixed 64-code-point blocks and for supplementary code points.
 */
public final class BMPSet {

    /**
     * One boolean ('true' or 'false') per Latin-1 character.
     */
    private boolean[] latin1Contains;

    /**
     * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
     * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
     * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
     *
     * Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
     * runtime.
     */
    private int[] table7FF;

    /**
     * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
     * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
     * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
     * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
     * and set.contains(c) must be called.
     *
     * Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
     * validity checking at runtime.
     */
    private int[] bmpBlockBits;

    /**
     * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
     * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
     * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
     */
    private int[] list4kStarts;

    /**
     * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
     * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
     */
    private final int[] list;
    private final int listLength; // length used; list may be longer to minimize reallocs

    /**
     * Builds the lookup tables for the set described by an inversion list.
     *
     * @param parentList       the parent set's inversion list; stored by reference, not copied.
     *                         Per the field documentation it is expected to end with
     *                         {@code parentList[parentListLength - 1] == 0x110000}.
     * @param parentListLength number of entries of {@code parentList} that are in use
     */
    public BMPSet(final int[] parentList, int parentListLength) {
        list = parentList;
        listLength = parentListLength;
        latin1Contains = new boolean[0x100];
        table7FF = new int[64];
        bmpBlockBits = new int[64];
        list4kStarts = new int[18];

        /*
         * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
         * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
         * indexes is for finding supplementary code points.
         */
        list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
        int i;
        for (i = 1; i <= 0x10; ++i) {
            // Each search starts at the previous result, so the indexes are non-decreasing.
            list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
        }
        list4kStarts[0x11] = listLength - 1;

        initBits();
    }

    /**
     * Tests whether a code point is in the set.
     *
     * <p>Lookup strategy by range: U+0000..U+00FF uses {@code latin1Contains}; U+0100..U+07FF uses
     * {@code table7FF}; other BMP non-surrogate code points use {@code bmpBlockBits} and fall back to a
     * restricted binary search for mixed blocks; surrogates and supplementary code points always use the
     * binary search.
     *
     * @param c the code point to test. Values above 0x10ffff yield {@code false}.
     *          NOTE(review): a negative {@code c} satisfies {@code c <= 0xff} and would be used as an
     *          array index; callers are presumably expected to pass {@code c >= 0} - confirm.
     * @return {@code true} if the set contains {@code c}
     */
    public boolean contains(int c) {
        if (c <= 0xff) {
            return (latin1Contains[c]);
        } else if (c <= 0x7ff) {
            return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
        } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
            int lead = c >> 12;
            // Mask 0x10001 extracts bit 'lead' (all-in/all-out flag) and bit 'lead+16' (mixed flag).
            int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
            if (twoBits <= 1) {
                // All 64 code points with the same bits 15..6
                // are either in the set or not.
                return (0 != twoBits);
            } else {
                // Look up the code point in its 4k block of code points.
                return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
            }
        } else if (c <= 0x10ffff) {
            // surrogate or supplementary code point
            return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
        } else {
            // Out-of-range code points get false, consistent with long-standing
            // behavior of UnicodeSet.contains(c).
            return false;
        }
    }

    /**
     * Span the initial substring for which each character c has spanCondition==contains(c). It must be
     * spanCondition==0 or 1.
     *
     * <p>Only {@code SpanCondition.NOT_CONTAINED} selects the "span not" loop; every other condition value
     * is handled by the "span" (contained) loop. Scanning always runs up to {@code s.length()}.
     *
     * @param s The string to scan
     * @param start The start index
     * @param spanCondition Whether to span contained or not-contained characters
     * @param outCount If not null: Receives the number of code points in the span.
     * @return the limit (exclusive end) of the span
     *
     * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
     * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
     * as usual in ICU.
     */
    public final int span(CharSequence s, int start, SpanCondition spanCondition,
            OutputInt outCount) {
        char c, c2;
        int i = start;
        int limit = s.length();
        int numSupplementary = 0;
        if (SpanCondition.NOT_CONTAINED != spanCondition) {
            // span
            while (i < limit) {
                c = s.charAt(i);
                if (c <= 0xff) {
                    if (!latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
                    // Reached for any unit that is not a lead surrogate followed by a trail surrogate,
                    // i.e. ordinary BMP units and unpaired surrogates.
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits == 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    ++numSupplementary;
                    ++i; // skip the trail unit; the shared ++i below skips the lead unit
                }
                ++i;
            }
        } else {
            // span not
            while (i < limit) {
                c = s.charAt(i);
                if (c <= 0xff) {
                    if (latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits != 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    ++numSupplementary;
                    ++i;
                }
                ++i;
            }
        }
        if (outCount != null) {
            // Each supplementary code point occupied two code units, so subtract one per pair.
            int spanLength = i - start;
            outCount.value = spanLength - numSupplementary; // number of code points
        }
        return i;
    }

    /**
     * Symmetrical with span().
     * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
     * limit and spanCondition==0 or 1.
     *
     * <p>The first statement of either loop reads {@code s.charAt(--limit)} before any bounds test, so
     * {@code limit} must be greater than 0 on entry. In the surrogate test, {@code c < 0xdc00} already
     * covers {@code c < 0xd800}: any unit below 0xdc00 cannot be a trail surrogate and is handled as a
     * single code unit.
     *
     * @param s The string to scan backwards
     * @param limit The exclusive end index of the text to scan
     * @param spanCondition Whether to span contained or not-contained characters; only
     *        {@code SpanCondition.NOT_CONTAINED} selects the "span not" loop
     * @return The string index which starts the span (i.e. inclusive).
     */
    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
        char c, c2;

        if (SpanCondition.NOT_CONTAINED != spanCondition) {
            // span
            for (;;) {
                c = s.charAt(--limit);
                if (c <= 0xff) {
                    if (!latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits == 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    --limit; // also step over the lead unit
                }
                if (0 == limit) {
                    // Every unit down to index 0 matched: the span starts at the beginning.
                    return 0;
                }
            }
        } else {
            // span not
            for (;;) {
                c = s.charAt(--limit);
                if (c <= 0xff) {
                    if (latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits != 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    --limit;
                }
                if (0 == limit) {
                    return 0;
                }
            }
        }
        // 'limit' is the index of the last code unit read when the loop broke; the span starts after it.
        return limit + 1;
    }

    /**
     * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
     *
     * @param table the 64-word table to update in place (asserted to have length 64)
     * @param start first value of the range (inclusive); split into lead=start>>6 and trail=start&0x3f
     * @param limit end of the range (exclusive)
     */
    private static void set32x64Bits(int[] table, int start, int limit) {
        assert (64 == table.length);
        int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
        int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.

        // Set one bit indicating an all-one block.
        int bits = 1 << lead;
        if ((start + 1) == limit) { // Single-character shortcut.
            table[trail] |= bits;
            return;
        }

        int limitLead = limit >> 6;
        int limitTrail = limit & 0x3f;

        if (lead == limitLead) {
            // Partial vertical bit column.
            while (trail < limitTrail) {
                table[trail++] |= bits;
            }
        } else {
            // Partial vertical bit column,
            // followed by a bit rectangle,
            // followed by another partial vertical bit column.
            if (trail > 0) {
                do {
                    table[trail++] |= bits;
                } while (trail < 64);
                ++lead;
            }
            if (lead < limitLead) {
                // Bits lead..31, then trimmed to lead..limitLead-1 unless limitLead is 32.
                bits = ~((1 << lead) - 1);
                if (limitLead < 0x20) {
                    bits &= (1 << limitLead) - 1;
                }
                for (trail = 0; trail < 64; ++trail) {
                    table[trail] |= bits;
                }
            }
            // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
            // In that case, bits=1<<limitLead == 1<<0 == 1
            // (because Java << uses only the lower 5 bits of the shift operand)
            // but the bits value is not used because trail<limitTrail is already false.
            bits = 1 << limitLead;
            for (trail = 0; trail < limitTrail; ++trail) {
                table[trail] |= bits;
            }
        }
    }

    /**
     * Fills {@code latin1Contains}, {@code table7FF} and {@code bmpBlockBits} from the inversion list.
     *
     * <p>The list is consumed as consecutive (start, limit) pairs through a single running index that is
     * shared by the three phases below; when a start has no following entry within {@code listLength},
     * the limit defaults to 0x110000.
     */
    private void initBits() {
        int start, limit;
        int listIndex = 0;

        // Set latin1Contains[].
        do {
            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
            if (start >= 0x100) {
                break;
            }
            do {
                latin1Contains[start++] = true;
            } while (start < limit && start < 0x100);
        } while (limit <= 0x100);

        // Set table7FF[].
        // 'start' and 'limit' carry over from the Latin-1 phase, so a range that straddles U+0100
        // continues here from where the previous loop stopped.
        while (start < 0x800) {
            set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
            if (limit > 0x800) {
                start = 0x800;
                break;
            }

            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
        }

        // Set bmpBlockBits[].
        // minStart is the lowest code point not yet covered by a block already marked as mixed.
        int minStart = 0x800;
        while (start < 0x10000) {
            if (limit > 0x10000) {
                limit = 0x10000;
            }

            if (start < minStart) {
                start = minStart;
            }
            if (start < limit) { // Else: Another range entirely in a known mixed-value block.
                if (0 != (start & 0x3f)) {
                    // Mixed-value block of 64 code points.
                    start >>= 6;
                    bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
                    start = (start + 1) << 6; // Round up to the next block boundary.
                    minStart = start; // Ignore further ranges in this block.
                }
                if (start < limit) {
                    if (start < (limit & ~0x3f)) {
                        // Multiple all-ones blocks of 64 code points each.
                        set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
                    }

                    if (0 != (limit & 0x3f)) {
                        // Mixed-value block of 64 code points.
                        limit >>= 6;
                        bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
                        limit = (limit + 1) << 6; // Round up to the next block boundary.
                        minStart = limit; // Ignore further ranges in this block.
                    }
                }
            }

            if (limit == 0x10000) {
                break;
            }

            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
        }
    }

    /**
     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
     * points in a certain range.
     *
     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
     *
     * @param c
     *            a character in a subrange of MIN_VALUE..MAX_VALUE
     * @param lo
     *            The lowest index to be returned.
     * @param hi
     *            The highest index to be returned.
     * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
     */
    private int findCodePoint(int c, int lo, int hi) {
        /* Examples:
                                           findCodePoint(c)
           set              list[]         c=0 1 3 4 7 8
           ===              ==============   ===========
           []               [110000]         0 0 0 0 0 0
           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
           [:Any:]          [0, 110000]      1 1 1 1 1 1
         */

        // Return the smallest i such that c < list[i]. Assume
        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
        if (c < list[lo])
            return lo;
        // High runner test. c is often after the last range, so an
        // initial check for this condition pays off.
        if (lo >= hi || c >= list[hi - 1])
            return hi;
        // invariant: c >= list[lo]
        // invariant: c < list[hi]
        for (;;) {
            int i = (lo + hi) >>> 1;
            if (i == lo) {
                break; // Found!
            } else if (c < list[i]) {
                hi = i;
            } else {
                lo = i;
            }
        }
        return hi;
    }

    /**
     * Tests containment with a restricted binary search over the inversion list.
     *
     * <p>As the examples in findCodePoint() show, an odd result index means {@code c} lies inside a range
     * of the set and an even index means it lies outside.
     *
     * @param c the code point to test
     * @param lo the lowest list index to consider
     * @param hi the highest list index to consider
     * @return {@code true} if the index returned by findCodePoint(c, lo, hi) is odd
     */
    private final boolean containsSlow(int c, int lo, int hi) {
        return (0 != (findCodePoint(c, lo, hi) & 1));
    }
}
// Changeset diff header for this hunk: "--- /dev/null" -> "+++ b/src/java.base/share/classes/jdk/internal/icu/impl/CharTrie.java" (file added).
/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 ******************************************************************************
 * Copyright (C) 1996-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 ******************************************************************************
 */

package jdk.internal.icu.impl;

import jdk.internal.icu.text.UTF16;

import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;

/**
 * A {@link Trie} whose data values are 16-bit {@code char}s.
 * The index and the data share a single {@code char} array.
 * @author synwee
 * @see com.ibm.icu.impl.Trie
 * @since release 2.1, Jan 01 2002
 */

 // note that i need to handle the block calculations later, since chartrie
 // in icu4c uses the same index array.
public class CharTrie extends Trie
{
    // public constructors ---------------------------------------------

    /**
     * <p>Builds a char trie by unserializing the 32-bit-aligned input
     * stream; the superclass constructor performs the reading.</p>
     * @param inputStream file input stream to an ICU data file, containing
     *                    the trie
     * @param dataManipulate object which provides methods to parse the char
     *                       data
     * @throws IOException thrown when data reading fails
     * @throws IllegalArgumentException if the data read is not a char trie
     * @draft 2.1
     */
    public CharTrie(InputStream inputStream,
                    DataManipulate dataManipulate) throws IOException
    {
        super(inputStream, dataManipulate);

        final boolean holdsCharData = isCharTrie();
        if (!holdsCharData) {
            throw new IllegalArgumentException(
                               "Data given does not belong to a char trie.");
        }
    }

    // public methods --------------------------------------------------

    /**
     * Looks up the value stored for a code point.
     * When the offset lookup reports an error (a negative offset), the
     * trie's initial value is returned instead.
     * @param ch codepoint
     * @return the value stored for {@code ch}, or the initial value
     */
    public final char getCodePointValue(int ch)
    {
        // Fast path for U+0000..U+D7FF: inlined copy of getRawOffset().
        final boolean belowSurrogates =
            ch >= 0 && ch < UTF16.LEAD_SURROGATE_MIN_VALUE;
        if (belowSurrogates) {
            final int blockStart =
                m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
            return m_data_[blockStart + (ch & INDEX_STAGE_3_MASK_)];
        }

        // U+D800..U+10FFFF go through the general offset computation,
        // which signals an error with a negative offset.
        final int generalOffset = getCodePointOffset(ch);
        if (generalOffset < 0) {
            return m_initialValue_;
        }
        return m_data_[generalOffset];
    }

    /**
     * Looks up the data value that a lead surrogate points to.
     * The value may carry folding offset information for the trailing
     * surrogate that follows.
     * This method does not guarantee correct results for trail surrogates.
     * @param ch lead surrogate character
     * @return data value
     */
    public final char getLeadValue(char ch)
    {
        final int leadOffset = getLeadOffset(ch);
        return m_data_[leadOffset];
    }

    // protected methods -----------------------------------------------

    /**
     * <p>Reads {@code m_dataOffset_ + m_dataLength_} chars from the stream
     * into one array that serves as both the index and the data array, and
     * records the initial value found at {@code m_dataOffset_}.</p>
     * @param inputStream data input stream containing trie data
     * @exception IOException thrown when data reading fails
     */
    protected final void unserialize(InputStream inputStream)
                                                throws IOException
    {
        final DataInputStream reader = new DataInputStream(inputStream);
        final int unitCount = m_dataOffset_ + m_dataLength_;

        m_index_ = new char[unitCount];
        int position = 0;
        while (position < unitCount) {
            m_index_[position] = reader.readChar();
            position++;
        }

        // Index and data live in the same array.
        m_data_ = m_index_;
        m_initialValue_ = m_data_[m_dataOffset_];
    }

    /**
     * Computes the data offset for a surrogate pair.
     * @param lead lead surrogate
     * @param trail trailing surrogate
     * @return offset to data, or -1 when the folding offset for the lead
     *         surrogate is not positive
     * @draft 2.1
     */
    protected final int getSurrogateOffset(char lead, char trail)
    {
        if (m_dataManipulate_ == null) {
            throw new NullPointerException(
                             "The field DataManipulate in this Trie is null");
        }

        // Folding position for the trail surrogate that follows this lead.
        final int foldingOffset =
            m_dataManipulate_.getFoldingOffset(getLeadValue(lead));

        if (foldingOffset <= 0) {
            // Error marker; callers substitute m_initialValue_.
            return -1;
        }

        // Resolve the real data position from the folded lead/trail units.
        return getRawOffset(foldingOffset, (char)(trail & SURROGATE_MASK_));
    }

    // private data members --------------------------------------------

    /**
     * Default value
     */
    private char m_initialValue_;
    /**
     * Array of char data
     */
    private char[] m_data_;
}
// Changeset diff header for this hunk: "--- /dev/null" -> "+++ b/src/java.base/share/classes/jdk/internal/icu/impl/CharacterIteratorWrapper.java" (file added).
/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation. Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package jdk.internal.icu.impl;

import java.text.CharacterIterator;

import jdk.internal.icu.text.UCharacterIterator;

/**
 * This class is a wrapper around CharacterIterator and implements the
 * UCharacterIterator protocol.
 *
 * <p>{@link #current()}, {@link #next()} and {@link #previous()} translate
 * the {@code CharacterIterator.DONE} sentinel into this class's {@code DONE}.
 * Because {@code CharacterIterator.DONE} is the char value U+FFFF, a U+FFFF
 * that actually occurs in the text is reported as {@code DONE} by those three
 * methods. {@link #getText(char[], int)} does not depend on the sentinel.
 * @author ram
 */

public class CharacterIteratorWrapper extends UCharacterIterator {

    /** The wrapped iterator; every operation is delegated to it. */
    private CharacterIterator iterator;

    /**
     * Wraps the given iterator. The iterator is used directly, not cloned.
     * @param iter the iterator to wrap
     * @throws IllegalArgumentException if {@code iter} is null
     */
    public CharacterIteratorWrapper(CharacterIterator iter){
        if(iter==null){
            throw new IllegalArgumentException();
        }
        iterator = iter;
    }

    /**
     * Returns the code unit at the current index without moving.
     * @return the current code unit, or DONE at the end of the text
     * @see UCharacterIterator#current()
     */
    public int current() {
        int c = iterator.current();
        if(c==CharacterIterator.DONE){
            return DONE;
        }
        return c;
    }

    /**
     * Returns the number of code units in the wrapped iterator's range.
     * @return end index minus begin index
     * @see UCharacterIterator#getLength()
     */
    public int getLength() {
        return (iterator.getEndIndex() - iterator.getBeginIndex());
    }

    /**
     * Returns the wrapped iterator's current index.
     * @see UCharacterIterator#getIndex()
     */
    public int getIndex() {
        return iterator.getIndex();
    }

    /**
     * Returns the code unit at the current index and then advances by one.
     * @return the code unit that was current before advancing, or DONE
     * @see UCharacterIterator#next()
     */
    public int next() {
        // Post-increment semantics: read first, then move the wrapped iterator.
        int i = iterator.current();
        iterator.next();
        if(i==CharacterIterator.DONE){
            return DONE;
        }
        return i;
    }

    /**
     * Moves back by one and returns the code unit at the new index.
     * @return the previous code unit, or DONE at the start of the text
     * @see UCharacterIterator#previous()
     */
    public int previous() {
        int i = iterator.previous();
        if(i==CharacterIterator.DONE){
            return DONE;
        }
        return i;
    }

    /**
     * Sets the wrapped iterator's index.
     * @param index the new index, passed unchanged to the wrapped iterator
     * @see UCharacterIterator#setIndex(int)
     */
    public void setIndex(int index) {
        iterator.setIndex(index);
    }

    /**
     * Copies the whole text of the wrapped iterator into {@code fillIn},
     * starting at {@code offset}, and restores the iterator's index afterwards.
     *
     * @param fillIn the destination array
     * @param offset the index in {@code fillIn} of the first copied code unit
     * @return the number of code units copied (end index minus begin index)
     * @throws IndexOutOfBoundsException if {@code offset} is negative or the
     *         text does not fit into {@code fillIn} at {@code offset}
     * @see UCharacterIterator#getText(char[])
     */
    public int getText(char[] fillIn, int offset){
        int end = iterator.getEndIndex();
        int length = end - iterator.getBeginIndex();
        int currentIndex = iterator.getIndex();
        if(offset < 0 || offset + length > fillIn.length){
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }

        // Stop on the iterator index rather than on CharacterIterator.DONE.
        // DONE is the char U+FFFF, so a sentinel test would end the copy at the
        // first U+FFFF in the text while this method still reported 'length'
        // code units as copied. next() never moves the index past 'end'.
        char ch = iterator.first();
        while (iterator.getIndex() < end) {
            fillIn[offset++] = ch;
            ch = iterator.next();
        }
        iterator.setIndex(currentIndex);

        return length;
    }

    /**
     * Creates a clone of this iterator. Clones the underlying character iterator.
     * @return the clone, or {@code null} if cloning is not supported
     * @see UCharacterIterator#clone()
     */
    @Override
    public Object clone(){
        try {
            CharacterIteratorWrapper result = (CharacterIteratorWrapper) super.clone();
            result.iterator = (CharacterIterator)this.iterator.clone();
            return result;
        } catch (CloneNotSupportedException e) {
            return null; // only invoked if bad underlying character iterator
        }
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/ICUBinary.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.DataInputStream; +import java.io.InputStream; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Arrays; +import java.security.AccessController; +import java.security.PrivilegedAction; + +import jdk.internal.icu.util.VersionInfo; + +public final class ICUBinary { + + private static final class IsAcceptable implements Authenticate { + @Override + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 1; + } + } + + // public inner interface ------------------------------------------------ + + /** + * Special interface for data authentication + */ + public static interface Authenticate + { + /** + * Method used in ICUBinary.readHeader() to provide data format + * authentication. + * @param version version of the current data + * @return true if dataformat is an acceptable version, false otherwise + */ + public boolean isDataVersionAcceptable(byte version[]); + } + + // public methods -------------------------------------------------------- + + /** + * Loads an ICU binary data file and returns it as a ByteBuffer. + * The buffer contents is normally read-only, but its position etc. can be modified. + * + * @param itemPath Relative ICU data item path, for example "root.res" or "coll/ucadata.icu". + * @return The data as a read-only ByteBuffer. + */ + public static ByteBuffer getRequiredData(String itemPath) { + final Class<ICUBinary> root = ICUBinary.class; + + try (InputStream is = AccessController.doPrivileged(new PrivilegedAction<InputStream>() { + public InputStream run() { + return root.getResourceAsStream(itemPath); + } + })) { + + // is.available() may return 0, or 1, or the total number of bytes in the stream, + // or some other number. + // Do not try to use is.available() == 0 to find the end of the stream! 
+ byte[] bytes; + int avail = is.available(); + if (avail > 32) { + // There are more bytes available than just the ICU data header length. + // With luck, it is the total number of bytes. + bytes = new byte[avail]; + } else { + bytes = new byte[128]; // empty .res files are even smaller + } + // Call is.read(...) until one returns a negative value. + int length = 0; + for(;;) { + if (length < bytes.length) { + int numRead = is.read(bytes, length, bytes.length - length); + if (numRead < 0) { + break; // end of stream + } + length += numRead; + } else { + // See if we are at the end of the stream before we grow the array. + int nextByte = is.read(); + if (nextByte < 0) { + break; + } + int capacity = 2 * bytes.length; + if (capacity < 128) { + capacity = 128; + } else if (capacity < 0x4000) { + capacity *= 2; // Grow faster until we reach 16kB. + } + bytes = Arrays.copyOf(bytes, capacity); + bytes[length++] = (byte) nextByte; + } + } + return ByteBuffer.wrap(bytes, 0, length); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + /** + * Same as readHeader(), but returns a VersionInfo rather than a compact int. 
+ */ + public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes, + int dataFormat, + Authenticate authenticate) + throws IOException { + return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate)); + } + + private static final byte BIG_ENDIAN_ = 1; + public static final byte[] readHeader(InputStream inputStream, + byte dataFormatIDExpected[], + Authenticate authenticate) + throws IOException + { + DataInputStream input = new DataInputStream(inputStream); + char headersize = input.readChar(); + int readcount = 2; + //reading the header format + byte magic1 = input.readByte(); + readcount ++; + byte magic2 = input.readByte(); + readcount ++; + if (magic1 != MAGIC1 || magic2 != MAGIC2) { + throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); + } + + input.readChar(); // reading size + readcount += 2; + input.readChar(); // reading reserved word + readcount += 2; + byte bigendian = input.readByte(); + readcount ++; + byte charset = input.readByte(); + readcount ++; + byte charsize = input.readByte(); + readcount ++; + input.readByte(); // reading reserved byte + readcount ++; + + byte dataFormatID[] = new byte[4]; + input.readFully(dataFormatID); + readcount += 4; + byte dataVersion[] = new byte[4]; + input.readFully(dataVersion); + readcount += 4; + byte unicodeVersion[] = new byte[4]; + input.readFully(unicodeVersion); + readcount += 4; + if (headersize < readcount) { + throw new IOException("Internal Error: Header size error"); + } + input.skipBytes(headersize - readcount); + + if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_ + || charsize != CHAR_SIZE_ + || !Arrays.equals(dataFormatIDExpected, dataFormatID) + || (authenticate != null + && !authenticate.isDataVersionAcceptable(dataVersion))) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_); + } + return unicodeVersion; + } + + /** + * Reads an ICU data header, checks the data format, and returns the data version. 
+ * + * <p>Assumes that the ByteBuffer position is 0 on input. + * The buffer byte order is set according to the data. + * The buffer position is advanced past the header (including UDataInfo and comment). + * + * <p>See C++ ucmndata.h and unicode/udata.h. + * + * @return dataVersion + * @throws IOException if this is not a valid ICU data item of the expected dataFormat + */ + public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate) + throws IOException { + assert bytes.position() == 0; + byte magic1 = bytes.get(2); + byte magic2 = bytes.get(3); + if (magic1 != MAGIC1 || magic2 != MAGIC2) { + throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_); + } + + byte isBigEndian = bytes.get(8); + byte charsetFamily = bytes.get(9); + byte sizeofUChar = bytes.get(10); + if (isBigEndian < 0 || 1 < isBigEndian || + charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_); + } + bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN); + + int headerSize = bytes.getChar(0); + int sizeofUDataInfo = bytes.getChar(4); + if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) { + throw new IOException("Internal Error: Header size error"); + } + // TODO: Change Authenticate to take int major, int minor, int milli, int micro + // to avoid array allocation. 
+ byte[] formatVersion = new byte[] { + bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19) + }; + if (bytes.get(12) != (byte)(dataFormat >> 24) || + bytes.get(13) != (byte)(dataFormat >> 16) || + bytes.get(14) != (byte)(dataFormat >> 8) || + bytes.get(15) != (byte)dataFormat || + (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) { + throw new IOException(HEADER_AUTHENTICATION_FAILED_ + + String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d", + bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15), + formatVersion[0] & 0xff, formatVersion[1] & 0xff, + formatVersion[2] & 0xff, formatVersion[3] & 0xff)); + } + + bytes.position(headerSize); + return // dataVersion + ((int)bytes.get(20) << 24) | + ((bytes.get(21) & 0xff) << 16) | + ((bytes.get(22) & 0xff) << 8) | + (bytes.get(23) & 0xff); + } + + public static void skipBytes(ByteBuffer bytes, int skipLength) { + if (skipLength > 0) { + bytes.position(bytes.position() + skipLength); + } + } + + public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) { + byte[] dest = new byte[length]; + bytes.get(dest); + if (additionalSkipLength > 0) { + skipBytes(bytes, additionalSkipLength); + } + return dest; + } + + public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) { + CharSequence cs = bytes.asCharBuffer(); + String s = cs.subSequence(0, length).toString(); + skipBytes(bytes, length * 2 + additionalSkipLength); + return s; + } + + public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) { + char[] dest = new char[length]; + bytes.asCharBuffer().get(dest); + skipBytes(bytes, length * 2 + additionalSkipLength); + return dest; + } + + public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) { + int[] dest = new int[length]; + bytes.asIntBuffer().get(dest); + skipBytes(bytes, length * 4 + additionalSkipLength); + return dest; + } + + /** + * Returns a 
VersionInfo for the bytes in the compact version integer. + */ + public static VersionInfo getVersionInfoFromCompactInt(int version) { + return VersionInfo.getInstance( + version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff); + } + + // private variables ------------------------------------------------- + + /** + * Magic numbers to authenticate the data file + */ + private static final byte MAGIC1 = (byte)0xda; + private static final byte MAGIC2 = (byte)0x27; + + /** + * File format authentication values + */ + private static final byte CHAR_SET_ = 0; + private static final byte CHAR_SIZE_ = 2; + + /** + * Error messages + */ + private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ = + "ICUBinary data file error: Magic number authentication failed"; + private static final String HEADER_AUTHENTICATION_FAILED_ = + "ICUBinary data file error: Header authentication failed"; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/Norm2AllModes.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; + +import jdk.internal.icu.text.Normalizer2; +import jdk.internal.icu.util.VersionInfo; + +public final class Norm2AllModes { + // Public API dispatch via Normalizer2 subclasses -------------------------- *** + + // Normalizer2 implementation for the old UNORM_NONE. + public static final class NoopNormalizer2 extends Normalizer2 { + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest!=src) { + dest.setLength(0); + return dest.append(src); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest!=src) { + try { + return dest.append(src); + } catch(IOException e) { + throw new InternalError(e.toString(), e); + } + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + if(first!=second) { + return first.append(second); + } else { + throw new IllegalArgumentException(); + } + } + + @Override + public String getDecomposition(int c) { + return null; + } + + // No need to override the default getRawDecomposition(). + @Override + public boolean isNormalized(CharSequence s) { return true; } + + @Override + public int spanQuickCheckYes(CharSequence s) { return s.length(); } + + @Override + public boolean hasBoundaryBefore(int c) { return true; } + } + + // Intermediate class: + // Has NormalizerImpl and does boilerplate argument checking and setup. 
+ public abstract static class Normalizer2WithImpl extends Normalizer2 { + public Normalizer2WithImpl(NormalizerImpl ni) { + impl=ni; + } + + // normalize + @Override + public StringBuilder normalize(CharSequence src, StringBuilder dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + dest.setLength(0); + normalize(src, new NormalizerImpl.ReorderingBuffer(impl, dest, src.length())); + return dest; + } + + @Override + public Appendable normalize(CharSequence src, Appendable dest) { + if(dest==src) { + throw new IllegalArgumentException(); + } + NormalizerImpl.ReorderingBuffer buffer= + new NormalizerImpl.ReorderingBuffer(impl, dest, src.length()); + normalize(src, buffer); + buffer.flush(); + return dest; + } + + protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer); + + // normalize and append + @Override + public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, true); + } + + @Override + public StringBuilder append(StringBuilder first, CharSequence second) { + return normalizeSecondAndAppend(first, second, false); + } + + public StringBuilder normalizeSecondAndAppend( + StringBuilder first, CharSequence second, boolean doNormalize) { + if(first==second) { + throw new IllegalArgumentException(); + } + normalizeAndAppend( + second, doNormalize, + new NormalizerImpl.ReorderingBuffer(impl, first, first.length()+second.length())); + return first; + } + + protected abstract void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer); + + @Override + public String getDecomposition(int c) { + return impl.getDecomposition(c); + } + + @Override + public int getCombiningClass(int c) { + return impl.getCC(impl.getNorm16(c)); + } + + // quick checks + @Override + public boolean isNormalized(CharSequence s) { + return s.length()==spanQuickCheckYes(s); + } + + public final NormalizerImpl impl; + } 
+ + public static final class DecomposeNormalizer2 extends Normalizer2WithImpl { + public DecomposeNormalizer2(NormalizerImpl ni) { + super(ni); + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.decompose(src, 0, src.length(), buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.decomposeAndAppend(src, doNormalize, buffer); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.decompose(s, 0, s.length(), null); + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); } + } + + public static final class ComposeNormalizer2 extends Normalizer2WithImpl { + public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) { + super(ni); + onlyContiguous=fcc; + } + + @Override + protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) { + impl.compose(src, 0, src.length(), onlyContiguous, true, buffer); + } + + @Override + protected void normalizeAndAppend( + CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) { + impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer); + } + + @Override + public boolean isNormalized(CharSequence s) { + // 5: small destCapacity for substring normalization + return impl.compose(s, 0, s.length(), + onlyContiguous, false, + new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5)); + } + + @Override + public int spanQuickCheckYes(CharSequence s) { + return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true)>>>1; + } + + @Override + public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); } + + private final boolean onlyContiguous; + } + + // instance cache ---------------------------------------------------------- *** + + private Norm2AllModes(NormalizerImpl ni) { + impl=ni; + comp=new ComposeNormalizer2(ni, false); 
+ decomp=new DecomposeNormalizer2(ni); + } + + public final NormalizerImpl impl; + public final ComposeNormalizer2 comp; + public final DecomposeNormalizer2 decomp; + + private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) { + if(singleton.exception!=null) { + throw singleton.exception; + } + return singleton.allModes; + } + + public static Norm2AllModes getNFCInstance() { + return getInstanceFromSingleton(NFCSingleton.INSTANCE); + } + + public static Norm2AllModes getNFKCInstance() { + return getInstanceFromSingleton(NFKCSingleton.INSTANCE); + } + + public static final NoopNormalizer2 NOOP_NORMALIZER2=new NoopNormalizer2(); + + private static final class Norm2AllModesSingleton { + private Norm2AllModesSingleton(String name) { + try { + @SuppressWarnings("deprecation") + String DATA_FILE_NAME = "/jdk/internal/icu/impl/data/icudt" + + VersionInfo.ICU_DATA_VERSION_PATH + "/" + name + ".nrm"; + NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME); + allModes=new Norm2AllModes(impl); + } catch (RuntimeException e) { + exception=e; + } + } + + private Norm2AllModes allModes; + private RuntimeException exception; + } + + private static final class NFCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfc"); + } + + private static final class NFKCSingleton { + private static final Norm2AllModesSingleton INSTANCE=new Norm2AllModesSingleton("nfkc"); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/NormalizerImpl.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,2193 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. 
+ ******************************************************************************* + */ +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; + +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.text.Normalizer2; +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.util.CodePointTrie; +import jdk.internal.icu.util.VersionInfo; + +// Original filename in ICU4J: Normalizer2Impl.java +public final class NormalizerImpl { + public static final class Hangul { + /* Korean Hangul and Jamo constants */ + public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ + public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ + public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ + + public static final int HANGUL_BASE=0xac00; + public static final int HANGUL_END=0xd7a3; + + public static final int JAMO_L_COUNT=19; + public static final int JAMO_V_COUNT=21; + public static final int JAMO_T_COUNT=28; + + public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; + public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; + + public static boolean isHangul(int c) { + return HANGUL_BASE<=c && c<HANGUL_LIMIT; + } + public static boolean isHangulLV(int c) { + c-=HANGUL_BASE; + return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; + } + + /** + * Decomposes c, which must be a Hangul syllable, into buffer + * and returns the length of the decomposition (2 or 3). + */ + public static int decompose(int c, Appendable buffer) { + try { + c-=HANGUL_BASE; + int c2=c%JAMO_T_COUNT; + c/=JAMO_T_COUNT; + buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); + buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); + if(c2==0) { + return 2; + } else { + buffer.append((char)(JAMO_T_BASE+c2)); + return 3; + } + } catch(IOException e) { + throw new InternalError(e); + } + } + } + + /** + * Writable buffer that takes care of canonical ordering. 
+ * Its Appendable methods behave like the C++ implementation's + * appendZeroCC() methods. + * <p> + * If dest is a StringBuilder, then the buffer writes directly to it. + * Otherwise, the buffer maintains a StringBuilder for intermediate text segments + * until no further changes are necessary and whole segments are appended. + * append() methods that take combining-class values always write to the StringBuilder. + * Other append() methods flush and append to the Appendable. + */ + public static final class ReorderingBuffer implements Appendable { + public ReorderingBuffer(NormalizerImpl ni, Appendable dest, int destCapacity) { + impl=ni; + app=dest; + if (app instanceof StringBuilder) { + appIsStringBuilder=true; + str=(StringBuilder)dest; + // In Java, the constructor subsumes public void init(int destCapacity) + str.ensureCapacity(destCapacity); + reorderStart=0; + if(str.length()==0) { + lastCC=0; + } else { + setIterator(); + lastCC=previousCC(); + // Set reorderStart after the last code point with cc<=1 if there is one. 
+ if(lastCC>1) { + while(previousCC()>1) {} + } + reorderStart=codePointLimit; + } + } else { + appIsStringBuilder=false; + str=new StringBuilder(); + reorderStart=0; + lastCC=0; + } + } + + public boolean isEmpty() { return str.length()==0; } + public int length() { return str.length(); } + public int getLastCC() { return lastCC; } + + public StringBuilder getStringBuilder() { return str; } + + public boolean equals(CharSequence s, int start, int limit) { + return UTF16Plus.equal(str, 0, str.length(), s, start, limit); + } + + public void append(int c, int cc) { + if(lastCC<=cc || cc==0) { + str.appendCodePoint(c); + lastCC=cc; + if(cc<=1) { + reorderStart=str.length(); + } + } else { + insert(c, cc); + } + } + public void append(CharSequence s, int start, int limit, boolean isNFD, + int leadCC, int trailCC) { + if(start==limit) { + return; + } + if(lastCC<=leadCC || leadCC==0) { + if(trailCC<=1) { + reorderStart=str.length()+(limit-start); + } else if(leadCC<=1) { + reorderStart=str.length()+1; // Ok if not a code point boundary. + } + str.append(s, start, limit); + lastCC=trailCC; + } else { + int c=Character.codePointAt(s, start); + start+=Character.charCount(c); + insert(c, leadCC); // insert first code point + while(start<limit) { + c=Character.codePointAt(s, start); + start+=Character.charCount(c); + if(start<limit) { + if (isNFD) { + leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); + } else { + leadCC = impl.getCC(impl.getNorm16(c)); + } + } else { + leadCC=trailCC; + } + append(c, leadCC); + } + } + } + // The following append() methods work like C++ appendZeroCC(). + // They assume that the cc or trailCC of their input is 0. + // Most of them implement Appendable interface methods. 
+ @Override + public ReorderingBuffer append(char c) { + str.append(c); + lastCC=0; + reorderStart=str.length(); + return this; + } + public void appendZeroCC(int c) { + str.appendCodePoint(c); + lastCC=0; + reorderStart=str.length(); + } + @Override + public ReorderingBuffer append(CharSequence s) { + if(s.length()!=0) { + str.append(s); + lastCC=0; + reorderStart=str.length(); + } + return this; + } + @Override + public ReorderingBuffer append(CharSequence s, int start, int limit) { + if(start!=limit) { + str.append(s, start, limit); + lastCC=0; + reorderStart=str.length(); + } + return this; + } + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Used after recomposition. + * Must be called at the end when writing to a non-StringBuilder Appendable. + */ + public void flush() { + if(appIsStringBuilder) { + reorderStart=str.length(); + } else { + try { + app.append(str); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". + } + } + lastCC=0; + } + /** + * Flushes from the intermediate StringBuilder to the Appendable, + * if they are different objects. + * Then appends the new text to the Appendable or StringBuilder. + * Normally used after quick check loops find a non-empty sequence. + */ + public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { + if(appIsStringBuilder) { + str.append(s, start, limit); + reorderStart=str.length(); + } else { + try { + app.append(str).append(s, start, limit); + str.setLength(0); + reorderStart=0; + } catch(IOException e) { + throw new InternalError(e); // Avoid declaring "throws IOException". 
+ } + } + lastCC=0; + return this; + } + public void remove() { + str.setLength(0); + lastCC=0; + reorderStart=0; + } + public void removeSuffix(int suffixLength) { + int oldLength=str.length(); + str.delete(oldLength-suffixLength, oldLength); + lastCC=0; + reorderStart=str.length(); + } + + // Inserts c somewhere before the last character. + // Requires 0<cc<lastCC which implies reorderStart<limit. + private void insert(int c, int cc) { + for(setIterator(), skipPrevious(); previousCC()>cc;) {} + // insert c at codePointLimit, after the character with prevCC<=cc + if(c<=0xffff) { + str.insert(codePointLimit, (char)c); + if(cc<=1) { + reorderStart=codePointLimit+1; + } + } else { + str.insert(codePointLimit, Character.toChars(c)); + if(cc<=1) { + reorderStart=codePointLimit+2; + } + } + } + + private final NormalizerImpl impl; + private final Appendable app; + private final StringBuilder str; + private final boolean appIsStringBuilder; + private int reorderStart; + private int lastCC; + + // private backward iterator + private void setIterator() { codePointStart=str.length(); } + private void skipPrevious() { // Requires 0<codePointStart. + codePointLimit=codePointStart; + codePointStart=str.offsetByCodePoints(codePointStart, -1); + } + private int previousCC() { // Returns 0 if there is no previous character. + codePointLimit=codePointStart; + if(reorderStart>=codePointStart) { + return 0; + } + int c=str.codePointBefore(codePointStart); + codePointStart-=Character.charCount(c); + return impl.getCCFromYesOrMaybeCP(c); + } + private int codePointStart, codePointLimit; + } + + // TODO: Propose as public API on the UTF16 class. + // TODO: Propose widening UTF16 methods that take char to take int. + // TODO: Propose widening UTF16 methods that take String to take CharSequence. + public static final class UTF16Plus { + /** + * Is this code point a lead surrogate (U+d800..U+dbff)? 
+ * @param c code unit or code point + * @return true or false + */ + public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } + /** + * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), + * is it a lead surrogate? + * @param c code unit or code point + * @return true or false + */ + public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } + + /** + * Compares two CharSequence subsequences for binary equality. + * @param s1 first sequence + * @param start1 start offset in first sequence + * @param limit1 limit offset in first sequence + * @param s2 second sequence + * @param start2 start offset in second sequence + * @param limit2 limit offset in second sequence + * @return true if s1.subSequence(start1, limit1) contains the same text + * as s2.subSequence(start2, limit2) + */ + public static boolean equal(CharSequence s1, int start1, int limit1, + CharSequence s2, int start2, int limit2) { + if((limit1-start1)!=(limit2-start2)) { + return false; + } + if(s1==s2 && start1==start2) { + return true; + } + while(start1<limit1) { + if(s1.charAt(start1++)!=s2.charAt(start2++)) { + return false; + } + } + return true; + } + } + + public NormalizerImpl() {} + + private static final class IsAcceptable implements ICUBinary.Authenticate { + public boolean isDataVersionAcceptable(byte version[]) { + return version[0]==4; + } + } + private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); + private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" + + public NormalizerImpl load(ByteBuffer bytes) { + try { + dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); + int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 + if(indexesLength<=IX_MIN_LCCC_CP) { + throw new InternalError("Normalizer2 data: not enough indexes"); + } + int[] inIndexes=new int[indexesLength]; + inIndexes[0]=indexesLength*4; + for(int i=1; i<indexesLength; ++i) { + inIndexes[i]=bytes.getInt(); + 
} + + minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; + minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; + minLcccCP=inIndexes[IX_MIN_LCCC_CP]; + + minYesNo=inIndexes[IX_MIN_YES_NO]; + minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; + minNoNo=inIndexes[IX_MIN_NO_NO]; + minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; + minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; + minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; + limitNoNo=inIndexes[IX_LIMIT_NO_NO]; + minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; + assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields + centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; + + // Read the normTrie. + int offset=inIndexes[IX_NORM_TRIE_OFFSET]; + int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; + int triePosition = bytes.position(); + normTrie = CodePointTrie.Fast16.fromBinary(bytes); + int trieLength = bytes.position() - triePosition; + if(trieLength>(nextOffset-offset)) { + throw new InternalError("Normalizer2 data: not enough bytes for normTrie"); + } + ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes + + // Read the composition and mapping data. + offset=nextOffset; + nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; + int numChars=(nextOffset-offset)/2; + if(numChars!=0) { + maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); + extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); + } + + // smallFCD: new in formatVersion 2 + offset=nextOffset; + smallFCD=new byte[0x100]; + bytes.get(smallFCD); + + return this; + } catch(IOException e) { + throw new InternalError(e); + } + } + public NormalizerImpl load(String name) { + return load(ICUBinary.getRequiredData(name)); + } + + // The trie stores values for lead surrogate code *units*. + // Surrogate code *points* are inert. + public int getNorm16(int c) { + return UTF16Plus.isLeadSurrogate(c) ? 
INERT : normTrie.get(c); + } + public int getRawNorm16(int c) { return normTrie.get(c); } + public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } + public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } + public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } + + public int getCC(int norm16) { + if(norm16>=MIN_NORMAL_MAYBE_YES) { + return getCCFromNormalYesOrMaybe(norm16); + } + if(norm16<minNoNo || limitNoNo<=norm16) { + return 0; + } + return getCCFromNoNo(norm16); + } + public static int getCCFromNormalYesOrMaybe(int norm16) { + return (norm16 >> OFFSET_SHIFT) & 0xff; + } + public static int getCCFromYesOrMaybe(int norm16) { + return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; + } + public int getCCFromYesOrMaybeCP(int c) { + if (c < minCompNoMaybeCP) { return 0; } + return getCCFromYesOrMaybe(getNorm16(c)); + } + + /** + * Returns the FCD data for code point c. + * @param c A Unicode code point. + * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. + */ + public int getFCD16(int c) { + if(c<minDecompNoCP) { + return 0; + } else if(c<=0xffff) { + if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } + } + return getFCD16FromNormData(c); + } + /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ + public boolean singleLeadMightHaveNonZeroFCD16(int lead) { + // 0<=lead<=0xffff + byte bits=smallFCD[lead>>8]; + if(bits==0) { return false; } + return ((bits>>((lead>>5)&7))&1)!=0; + } + + /** Gets the FCD value from the regular normalization data. 
*/ + public int getFCD16FromNormData(int c) { + int norm16=getNorm16(c); + if (norm16 >= limitNoNo) { + if(norm16>=MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16=getCCFromNormalYesOrMaybe(norm16); + return norm16|(norm16<<8); + } else if(norm16>=minMaybeYes) { + return 0; + } else { // isDecompNoAlgorithmic(norm16) + int deltaTrailCC = norm16 & DELTA_TCCC_MASK; + if (deltaTrailCC <= DELTA_TCCC_1) { + return deltaTrailCC >> OFFSET_SHIFT; + } + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + } + if(norm16<=minYesNo || isHangulLVT(norm16)) { + // no decomposition or Hangul syllable, all zeros + return 0; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int fcd16=firstUnit>>8; // tccc + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc + } + return fcd16; + } + + /** + * Gets the decomposition for one code point. + * @param c code point + * @return c's decomposition, if it has one; returns null if it does not have a decomposition + */ + public String getDecomposition(int c) { + int norm16; + if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { + // c does not decompose + return null; + } + int decomp = -1; + if(isDecompNoAlgorithmic(norm16)) { + // Maps to an isCompYesAndZeroCC. + decomp=c=mapAlgorithmic(c, norm16); + // The mapping might decompose further. 
+ norm16 = getRawNorm16(c); + } + if (norm16 < minYesNo) { + if(decomp<0) { + return null; + } else { + return UTF16.valueOf(decomp); + } + } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { + // Hangul syllable: decompose algorithmically + StringBuilder buffer=new StringBuilder(); + Hangul.decompose(c, buffer); + return buffer.toString(); + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; + return extraData.substring(mapping, mapping+length); + } + + // Fixed norm16 values. + public static final int MIN_YES_YES_WITH_CC=0xfe02; + public static final int JAMO_VT=0xfe00; + public static final int MIN_NORMAL_MAYBE_YES=0xfc00; + public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE + public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE + + // norm16 bit 0 is comp-boundary-after. + public static final int HAS_COMP_BOUNDARY_AFTER=1; + public static final int OFFSET_SHIFT=1; + + // For algorithmic one-way mappings, norm16 bits 2..1 indicate the + // tccc (0, 1, >1) for quick FCC boundary-after tests. + public static final int DELTA_TCCC_0=0; + public static final int DELTA_TCCC_1=2; + public static final int DELTA_TCCC_GT_1=4; + public static final int DELTA_TCCC_MASK=6; + public static final int DELTA_SHIFT=3; + + public static final int MAX_DELTA=0x40; + + // Byte offsets from the start of the data, after the generic header. + public static final int IX_NORM_TRIE_OFFSET=0; + public static final int IX_EXTRA_DATA_OFFSET=1; + public static final int IX_SMALL_FCD_OFFSET=2; + public static final int IX_RESERVED3_OFFSET=3; + public static final int IX_TOTAL_SIZE=7; + public static final int MIN_CCC_LCCC_CP=0x300; + // Code point thresholds for quick check codes. 
+ public static final int IX_MIN_DECOMP_NO_CP=8; + public static final int IX_MIN_COMP_NO_MAYBE_CP=9; + + // Norm16 value thresholds for quick check combinations and types of extra data. + + /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ + public static final int IX_MIN_YES_NO=10; + /** Mappings are comp-normalized. */ + public static final int IX_MIN_NO_NO=11; + public static final int IX_LIMIT_NO_NO=12; + public static final int IX_MIN_MAYBE_YES=13; + + /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ + public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; + /** Mappings are not comp-normalized but have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; + /** Mappings do not have a comp boundary before. */ + public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; + /** Mappings to the empty string. */ + public static final int IX_MIN_NO_NO_EMPTY=17; + + public static final int IX_MIN_LCCC_CP=18; + public static final int IX_COUNT=20; + + public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; + public static final int MAPPING_HAS_RAW_MAPPING=0x40; + // unused bit 0x20; + public static final int MAPPING_LENGTH_MASK=0x1f; + + public static final int COMP_1_LAST_TUPLE=0x8000; + public static final int COMP_1_TRIPLE=1; + public static final int COMP_1_TRAIL_LIMIT=0x3400; + public static final int COMP_1_TRAIL_MASK=0x7ffe; + public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit + public static final int COMP_2_TRAIL_SHIFT=6; + public static final int COMP_2_TRAIL_MASK=0xffc0; + + // higher-level functionality ------------------------------------------ *** + + /** + * Decomposes s[src, limit[ and writes the result to dest. + * limit can be NULL if src is NUL-terminated. + * destLengthEstimate is the initial dest buffer capacity and can be -1. 
+ */ + public void decompose(CharSequence s, int src, int limit, StringBuilder dest, + int destLengthEstimate) { + if(destLengthEstimate<0) { + destLengthEstimate=limit-src; + } + dest.setLength(0); + ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); + decompose(s, src, limit, buffer); + } + + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int decompose(CharSequence s, int src, int limit, + ReorderingBuffer buffer) { + int minNoCP=minDecompNoCP; + + int prevSrc; + int c=0; + int norm16=0; + + // only for quick check + int prevBoundary=src; + int prevCC=0; + + for(;;) { + // count code units below the minimum or with irrelevant data for the quick check + for(prevSrc=src; src!=limit;) { + if( (c=s.charAt(src))<minNoCP || + isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) + ) { + ++src; + } else if(!UTF16Plus.isLeadSurrogate(c)) { + break; + } else { + char c2; + if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { + c = Character.toCodePoint((char)c, c2); + norm16 = normTrie.suppGet(c); + if (isMostDecompYesAndZeroCC(norm16)) { + src += 2; + } else { + break; + } + } else { + ++src; // unpaired lead surrogate: inert + } + } + } + // copy these code units all at once + if(src!=prevSrc) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); + } else { + prevCC=0; + prevBoundary=src; + } + } + if(src==limit) { + break; + } + + // Check one above-minimum, relevant code point. 
+ src+=Character.charCount(c); + if(buffer!=null) { + decompose(c, norm16, buffer); + } else { + if(isDecompYes(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if(prevCC<=cc || cc==0) { + prevCC=cc; + if(cc<=1) { + prevBoundary=src; + } + continue; + } + } + return prevBoundary; // "no" or cc out of order + } + } + return src; + } + public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { + int limit=s.length(); + if(limit==0) { + return; + } + if(doDecompose) { + decompose(s, 0, limit, buffer); + return; + } + // Just merge the strings at the boundary. + int c=Character.codePointAt(s, 0); + int src=0; + int firstCC, prevCC, cc; + firstCC=prevCC=cc=getCC(getNorm16(c)); + while(cc!=0) { + prevCC=cc; + src+=Character.charCount(c); + if(src>=limit) { + break; + } + c=Character.codePointAt(s, src); + cc=getCC(getNorm16(c)); + }; + buffer.append(s, 0, src, false, firstCC, prevCC); + buffer.append(s, src, limit); + } + + // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. + // doCompose: normalize + // !doCompose: isNormalized (buffer must be empty and initialized) + public boolean compose(CharSequence s, int src, int limit, + boolean onlyContiguous, + boolean doCompose, + ReorderingBuffer buffer) { + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for (;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. 
+ int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if (src == limit) { + if (prevBoundary != limit && doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + if( (c=s.charAt(src))<minNoMaybeCP || + isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) + ) { + ++src; + } else { + prevSrc = src++; + if (!UTF16Plus.isLeadSurrogate(c)) { + break; + } else { + char c2; + if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { + ++src; + c = Character.toCodePoint((char)c, c2); + norm16 = normTrie.suppGet(c); + if (!isCompYesAndZeroCC(norm16)) { + break; + } + } + } + } + } + // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. + + // Medium-fast path: Handle cases that do not require full decomposition and recomposition. + if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (!doCompose) { + return false; + } + // Fast path for mapping a character that is immediately surrounded by boundaries. + // In this case, we need not decompose around the current character. + if (isDecompNoAlgorithmic(norm16)) { + // Maps to a single isCompYesAndZeroCC character + // which also implies hasCompBoundaryBefore. + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append(mapAlgorithmic(c, norm16), 0); + prevBoundary = src; + continue; + } + } else if (norm16 < minNoNoCompBoundaryBefore) { + // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 
+ if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || + hasCompBoundaryBefore(s, src, limit)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int mapping = norm16 >> OFFSET_SHIFT; + int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; + buffer.append(extraData, mapping, mapping + length); + prevBoundary = src; + continue; + } + } else if (norm16 >= minNoNoEmpty) { + // The current character maps to nothing. + // Simply omit it from the output if there is a boundary before _or_ after it. + // The character itself implies no boundaries. + if (hasCompBoundaryBefore(s, src, limit) || + hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + prevBoundary = src; + continue; + } + } + // Other "noNo" type, or need to examine more text around this character: + // Fall through to the slow path. + } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { + char prev=s.charAt(prevSrc-1); + if(c<Hangul.JAMO_T_BASE) { + // The current character is a Jamo Vowel, + // compose with previous Jamo L and following Jamo T. + char l = (char)(prev-Hangul.JAMO_L_BASE); + if(l<Hangul.JAMO_L_COUNT) { + if (!doCompose) { + return false; + } + int t; + if (src != limit && + 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && + t < Hangul.JAMO_T_COUNT) { + // The next character is a Jamo T. + ++src; + } else if (hasCompBoundaryBefore(s, src, limit)) { + // No Jamo T follows, not even via decomposition. + t = 0; + } else { + t = -1; + } + if (t >= 0) { + int syllable = Hangul.HANGUL_BASE + + (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * + Hangul.JAMO_T_COUNT + t; + --prevSrc; // Replace the Jamo L as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // If we see L+V+x where x!=T then we drop to the slow path, + // decompose and recompose. 
+ // This is to deal with NFKC finding normal L and V but a + // compatibility variant of a T. + // We need to either fully compose that combination here + // (which would complicate the code and may not work with strange custom data) + // or use the slow path. + } + } else if (Hangul.isHangulLV(prev)) { + // The current character is a Jamo Trailing consonant, + // compose with previous Hangul LV that does not contain a Jamo T. + if (!doCompose) { + return false; + } + int syllable = prev + c - Hangul.JAMO_T_BASE; + --prevSrc; // Replace the Hangul LV as well. + if (prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + buffer.append((char)syllable); + prevBoundary = src; + continue; + } + // No matching context, or may need to decompose surrounding text first: + // Fall through to the slow path. + } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC + // One or more combining marks that do not combine-back: + // Check for canonical order, copy unchanged if ok and + // if followed by a character with a boundary-before. + int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 + if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { + // Fails FCD test, need to decompose and contiguously recompose. + if (!doCompose) { + return false; + } + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. + int n16; + for (;;) { + if (src == limit) { + if (doCompose) { + buffer.append(s, prevBoundary, limit); + } + return true; + } + int prevCC = cc; + c = Character.codePointAt(s, src); + n16 = normTrie.get(c); + if (n16 >= MIN_YES_YES_WITH_CC) { + cc = getCCFromNormalYesOrMaybe(n16); + if (prevCC > cc) { + if (!doCompose) { + return false; + } + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // p is after the last in-order combining mark. 
+ // If there is a boundary here, then we continue with no change. + if (norm16HasCompBoundaryBefore(n16)) { + if (isCompYesAndZeroCC(n16)) { + src += Character.charCount(c); + } + continue; + } + // Use the slow path. There is no boundary in [prevSrc, src[. + } + } + + // Slow path: Find the nearest boundaries around the current character, + // decompose and recompose. + if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + norm16 = normTrie.get(c); + if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + prevSrc -= Character.charCount(c); + } + } + if (doCompose && prevBoundary != prevSrc) { + buffer.append(s, prevBoundary, prevSrc); + } + int recomposeStartIndex=buffer.length(); + // We know there is not a boundary here. + decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, + buffer); + // Decompose until the next boundary. + src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, + buffer); + recompose(buffer, recomposeStartIndex, onlyContiguous); + if(!doCompose) { + if(!buffer.equals(s, prevSrc, src)) { + return false; + } + buffer.remove(); + } + prevBoundary=src; + } + } + + /** + * Very similar to compose(): Make the same changes in both places if relevant. + * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) + * !doSpan: quickCheck + * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and + * bit 0: set if "maybe"; otherwise, if the span length<s.length() + * then the quick check result is "no" + */ + public int composeQuickCheck(CharSequence s, int src, int limit, + boolean onlyContiguous, boolean doSpan) { + int qcResult=0; + int prevBoundary=src; + int minNoMaybeCP=minCompNoMaybeCP; + + for(;;) { + // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, + // or with (compYes && ccc==0) properties. 
+ int prevSrc; + int c = 0; + int norm16 = 0; + for (;;) { + if(src==limit) { + return (src<<1)|qcResult; // "yes" or "maybe" + } + if( (c=s.charAt(src))<minNoMaybeCP || + isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) + ) { + ++src; + } else { + prevSrc = src++; + if (!UTF16Plus.isLeadSurrogate(c)) { + break; + } else { + char c2; + if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { + ++src; + c = Character.toCodePoint((char)c, c2); + norm16 = normTrie.suppGet(c); + if (!isCompYesAndZeroCC(norm16)) { + break; + } + } + } + } + } + // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. + // The current character is either a "noNo" (has a mapping) + // or a "maybeYes" (combines backward) + // or a "yesYes" with ccc!=0. + // It is not a Hangul syllable or Jamo L because those have "yes" properties. + + int prevNorm16 = INERT; + if (prevBoundary != prevSrc) { + prevBoundary = prevSrc; + if (!norm16HasCompBoundaryBefore(norm16)) { + c = Character.codePointBefore(s, prevSrc); + int n16 = getNorm16(c); + if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { + prevBoundary -= Character.charCount(c); + prevNorm16 = n16; + } + } + } + + if(isMaybeOrNonZeroCC(norm16)) { + int cc=getCCFromYesOrMaybe(norm16); + if (onlyContiguous /* FCC */ && cc != 0 && + getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { + // The [prevBoundary..prevSrc[ character + // passed the quick check "yes && ccc==0" test + // but is out of canonical order with the current combining mark. + } else { + // If !onlyContiguous (not FCC), then we ignore the tccc of + // the previous character which passed the quick check "yes && ccc==0" test. 
+ for (;;) { + if (norm16 < MIN_YES_YES_WITH_CC) { + if (!doSpan) { + qcResult = 1; + } else { + return prevBoundary << 1; // spanYes does not care to know it's "maybe" + } + } + if (src == limit) { + return (src<<1) | qcResult; // "yes" or "maybe" + } + int prevCC = cc; + c = Character.codePointAt(s, src); + norm16 = getNorm16(c); + if (isMaybeOrNonZeroCC(norm16)) { + cc = getCCFromYesOrMaybe(norm16); + if (!(prevCC <= cc || cc == 0)) { + break; + } + } else { + break; + } + src += Character.charCount(c); + } + // src is after the last in-order combining mark. + if (isCompYesAndZeroCC(norm16)) { + prevBoundary = src; + src += Character.charCount(c); + continue; + } + } + } + return prevBoundary<<1; // "no" + } + } + public void composeAndAppend(CharSequence s, + boolean doCompose, + boolean onlyContiguous, + ReorderingBuffer buffer) { + int src=0, limit=s.length(); + if(!buffer.isEmpty()) { + int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); + if(0!=firstStarterInSrc) { + int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), + buffer.length(), onlyContiguous); + StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ + firstStarterInSrc+16); + middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); + buffer.removeSuffix(buffer.length()-lastStarterInDest); + middle.append(s, 0, firstStarterInSrc); + compose(middle, 0, middle.length(), onlyContiguous, true, buffer); + src=firstStarterInSrc; + } + } + if(doCompose) { + compose(s, src, limit, onlyContiguous, true, buffer); + } else { + buffer.append(s, src, limit); + } + } + // Dual functionality: + // buffer!=NULL: normalize + // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes + public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { + // Note: In this function we use buffer->appendZeroCC() because we track + // the lead and trail combining classes here, rather than leaving it to + // the 
ReorderingBuffer. + // The exception is the call to decomposeShort() which uses the buffer + // in the normal way. + + // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. + // Similar to the prevBoundary in the compose() implementation. + int prevBoundary=src; + int prevSrc; + int c=0; + int prevFCD16=0; + int fcd16=0; + + for(;;) { + // count code units with lccc==0 + for(prevSrc=src; src!=limit;) { + if((c=s.charAt(src))<minLcccCP) { + prevFCD16=~c; + ++src; + } else if(!singleLeadMightHaveNonZeroFCD16(c)) { + prevFCD16=0; + ++src; + } else { + if (UTF16Plus.isLeadSurrogate(c)) { + char c2; + if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { + c = Character.toCodePoint((char)c, c2); + } + } + if((fcd16=getFCD16FromNormData(c))<=0xff) { + prevFCD16=fcd16; + src+=Character.charCount(c); + } else { + break; + } + } + } + // copy these code units all at once + if(src!=prevSrc) { + if(src==limit) { + if(buffer!=null) { + buffer.flushAndAppendZeroCC(s, prevSrc, src); + } + break; + } + prevBoundary=src; + // We know that the previous character's lccc==0. + if(prevFCD16<0) { + // Fetching the fcd16 value was deferred for this below-minLcccCP code point. + int prev=~prevFCD16; + if(prev<minDecompNoCP) { + prevFCD16=0; + } else { + prevFCD16=getFCD16FromNormData(prev); + if(prevFCD16>1) { + --prevBoundary; + } + } + } else { + int p=src-1; + if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && + Character.isHighSurrogate(s.charAt(p-1)) + ) { + --p; + // Need to fetch the previous character's FCD value because + // prevFCD16 was just for the trail surrogate code point. + prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); + // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 
+ } + if(prevFCD16>1) { + prevBoundary=p; + } + } + if(buffer!=null) { + // The last lccc==0 character is excluded from the + // flush-and-append call in case it needs to be modified. + buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); + buffer.append(s, prevBoundary, src); + } + // The start of the current character (c). + prevSrc=src; + } else if(src==limit) { + break; + } + + src+=Character.charCount(c); + // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. + // Check for proper order, and decompose locally if necessary. + if((prevFCD16&0xff)<=(fcd16>>8)) { + // proper order: prev tccc <= current lccc + if((fcd16&0xff)<=1) { + prevBoundary=src; + } + if(buffer!=null) { + buffer.appendZeroCC(c); + } + prevFCD16=fcd16; + continue; + } else if(buffer==null) { + return prevBoundary; // quick check "no" + } else { + /* + * Back out the part of the source that we copied or appended + * already but is now going to be decomposed. + * prevSrc is set to after what was copied/appended. + */ + buffer.removeSuffix(prevSrc-prevBoundary); + /* + * Find the part of the source that needs to be decomposed, + * up to the next safe boundary. + */ + src=findNextFCDBoundary(s, src, limit); + /* + * The source text does not fulfill the conditions for FCD. + * Decompose and reorder a limited piece of the text. 
+ */ + decomposeShort(s, prevBoundary, src, false, false, buffer); + prevBoundary=src; + prevFCD16=0; + } + } + return src; + } + + public boolean hasDecompBoundaryBefore(int c) { + return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || + norm16HasDecompBoundaryBefore(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryBefore(int norm16) { + if (norm16 < minNoNoCompNoMaybeCC) { + return true; + } + if (norm16 >= limitNoNo) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean hasDecompBoundaryAfter(int c) { + if (c < minDecompNoCP) { + return true; + } + if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { + return true; + } + return norm16HasDecompBoundaryAfter(getNorm16(c)); + } + public boolean norm16HasDecompBoundaryAfter(int norm16) { + if(norm16 <= minYesNo || isHangulLVT(norm16)) { + return true; + } + if (norm16 >= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } + // Maps to an isCompYesAndZeroCC. 
+ return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; + } + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + // decomp after-boundary: same as hasFCDBoundaryAfter(), + // fcd16<=1 || trailCC==0 + if(firstUnit>0x1ff) { + return false; // trailCC>1 + } + if(firstUnit<=0xff) { + return true; // trailCC==0 + } + // if(trailCC==1) test leadCC==0, same as checking for before-boundary + // true if leadCC==0 (hasFCDBoundaryBefore()) + return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; + } + public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } + + public boolean hasCompBoundaryBefore(int c) { + return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); + } + public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { + return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); + } + + private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } + private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } + private static boolean isInert(int norm16) { return norm16==INERT; } + private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } + private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } + private boolean isHangulLV(int norm16) { return norm16==minYesNo; } + private boolean isHangulLVT(int norm16) { + return norm16==hangulLVT(); + } + private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } + // UBool isCompYes(uint16_t norm16) const { + // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; + // } + // UBool isCompYesOrMaybe(uint16_t norm16) const { + // return norm16<minNoNo || minMaybeYes<=norm16; + // } + // private boolean hasZeroCCFromDecompYes(int norm16) { + // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; + // } + private boolean isDecompYesAndZeroCC(int 
norm16) { + return norm16<minYesNo || + norm16==JAMO_VT || + (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); + } + /** + * A little faster and simpler than isDecompYesAndZeroCC() but does not include + * the MaybeYes which combine-forward and have ccc=0. + * (Standard Unicode 10 normalization does not have such characters.) + */ + private boolean isMostDecompYesAndZeroCC(int norm16) { + return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; + } + private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } + + // For use with isCompYes(). + // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. + // static uint8_t getCCFromYes(uint16_t norm16) { + // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; + // } + private int getCCFromNoNo(int norm16) { + int mapping=norm16>>OFFSET_SHIFT; + if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + return extraData.charAt(mapping-1)&0xff; + } else { + return 0; + } + } + int getTrailCCFromCompYesAndZeroCC(int norm16) { + if(norm16<=minYesNo) { + return 0; // yesYes and Hangul LV have ccc=tccc=0 + } else { + // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. + return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo + } + } + + // Requires algorithmic-NoNo. + private int mapAlgorithmic(int c, int norm16) { + return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; + } + + // Requires minYesNo<norm16<limitNoNo. 
+ // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } + + /** + * @return index into maybeYesCompositions, or -1 + */ + private int getCompositionsListForDecompYes(int norm16) { + if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { + return -1; + } else { + if((norm16-=minMaybeYes)<0) { + // norm16<minMaybeYes: index into extraData which is a substring at + // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] + // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 + norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list + } + return norm16>>OFFSET_SHIFT; + } + } + /** + * @return index into maybeYesCompositions + */ + private int getCompositionsListForComposite(int norm16) { + // A composite has both mapping & compositions list. + int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; + int firstUnit=maybeYesCompositions.charAt(list); + return list+ // mapping in maybeYesCompositions + 1+ // +1 to skip the first unit with the mapping length + (firstUnit&MAPPING_LENGTH_MASK); // + mapping length + } + + // Decompose a short piece of text which is likely to contain characters that + // fail the quick check loop and/or where the quick check loop's overhead + // is unlikely to be amortized. + // Called by the compose() and makeFCD() implementations. + // Public in Java for collation implementation code. 
+ private int decomposeShort( + CharSequence s, int src, int limit, + boolean stopAtCompBoundary, boolean onlyContiguous, + ReorderingBuffer buffer) { + while(src<limit) { + int c=Character.codePointAt(s, src); + if (stopAtCompBoundary && c < minCompNoMaybeCP) { + return src; + } + int norm16 = getNorm16(c); + if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { + return src; + } + src+=Character.charCount(c); + decompose(c, norm16, buffer); + if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + return src; + } + } + return src; + } + private void decompose(int c, int norm16, ReorderingBuffer buffer) { + // get the decomposition and the lead and trail cc's + if (norm16 >= limitNoNo) { + if (isMaybeOrNonZeroCC(norm16)) { + buffer.append(c, getCCFromYesOrMaybe(norm16)); + return; + } + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); + } + if (norm16 < minYesNo) { + // c does not decompose + buffer.append(c, 0); + } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { + // Hangul syllable: decompose algorithmically + Hangul.decompose(c, buffer); + } else { + // c decomposes, get everything from the variable-length extra data + int mapping=norm16>>OFFSET_SHIFT; + int firstUnit=extraData.charAt(mapping); + int length=firstUnit&MAPPING_LENGTH_MASK; + int leadCC, trailCC; + trailCC=firstUnit>>8; + if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { + leadCC=extraData.charAt(mapping-1)>>8; + } else { + leadCC=0; + } + ++mapping; // skip over the firstUnit + buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); + } + } + + /** + * Finds the recomposition result for + * a forward-combining "lead" character, + * specified with a pointer to its compositions list, + * and a backward-combining "trail" character. 
+ * + * <p>If the lead and trail characters combine, then this function returns + * the following "compositeAndFwd" value: + * <pre> + * Bits 21..1 composite character + * Bit 0 set if the composite is a forward-combining starter + * </pre> + * otherwise it returns -1. + * + * <p>The compositions list has (trail, compositeAndFwd) pair entries, + * encoded as either pairs or triples of 16-bit units. + * The last entry has the high bit of its first unit set. + * + * <p>The list is sorted by ascending trail characters (there are no duplicates). + * A linear search is used. + * + * <p>See normalizer2impl.h for a more detailed description + * of the compositions list format. + */ + private static int combine(String compositions, int list, int trail) { + int key1, firstUnit; + if(trail<COMP_1_TRAIL_LIMIT) { + // trail character is 0..33FF + // result entry may have 2 or 3 units + key1=(trail<<1); + while(key1>(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } + if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if((firstUnit&COMP_1_TRIPLE)!=0) { + return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); + } else { + return compositions.charAt(list+1); + } + } + } else { + // trail character is 3400..10FFFF + // result entry has 3 units + key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); + int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; + int secondUnit; + for(;;) { + if(key1>(firstUnit=compositions.charAt(list))) { + list+=2+(firstUnit&COMP_1_TRIPLE); + } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { + if(key2>(secondUnit=compositions.charAt(list+1))) { + if((firstUnit&COMP_1_LAST_TUPLE)!=0) { + break; + } else { + list+=3; + } + } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { + return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); + } else { + break; + } + } else { + break; + } + } + } + return -1; + } + + /* + * Recomposes the buffer text starting at recomposeStartIndex + * (which is in NFD 
- decomposed and canonically ordered), + * and truncates the buffer contents. + * + * Note that recomposition never lengthens the text: + * Any character consists of either one or two code units; + * a composition may contain at most one more code unit than the original starter, + * while the combining mark that is removed has at least one code unit. + */ + private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, + boolean onlyContiguous) { + StringBuilder sb=buffer.getStringBuilder(); + int p=recomposeStartIndex; + if(p==sb.length()) { + return; + } + + int starter, pRemove; + int compositionsList; + int c, compositeAndFwd; + int norm16; + int cc, prevCC; + boolean starterIsSupplementary; + + // Some of the following variables are not used until we have a forward-combining starter + // and are only initialized now to avoid compiler warnings. + compositionsList=-1; // used as indicator for whether we have a forward-combining starter + starter=-1; + starterIsSupplementary=false; + prevCC=0; + + for(;;) { + c=sb.codePointAt(p); + p+=Character.charCount(c); + norm16=getNorm16(c); + cc=getCCFromYesOrMaybe(norm16); + if( // this character combines backward and + isMaybe(norm16) && + // we have seen a starter that combines forward and + compositionsList>=0 && + // the backward-combining character is not blocked + (prevCC<cc || prevCC==0) + ) { + if(isJamoVT(norm16)) { + // c is a Jamo V/T, see if we can compose it with the previous character. + if(c<Hangul.JAMO_T_BASE) { + // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. + char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE); + if(prev<Hangul.JAMO_L_COUNT) { + pRemove=p-1; + char syllable=(char) + (Hangul.HANGUL_BASE+ + (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))* + Hangul.JAMO_T_COUNT); + char t; + if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) { + ++p; + syllable+=t; // The next character was a Jamo T. 
+ } + sb.setCharAt(starter, syllable); + // remove the Jamo V/T + sb.delete(pRemove, p); + p=pRemove; + } + } + /* + * No "else" for Jamo T: + * Since the input is in NFD, there are no Hangul LV syllables that + * a Jamo T could combine with. + * All Jamo Ts are combined above when handling Jamo Vs. + */ + if(p==sb.length()) { + break; + } + compositionsList=-1; + continue; + } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) { + // The starter and the combining mark (c) do combine. + int composite=compositeAndFwd>>1; + + // Remove the combining mark. + pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark + sb.delete(pRemove, p); + p=pRemove; + // Replace the starter with the composite. + if(starterIsSupplementary) { + if(composite>0xffff) { + // both are supplementary + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite)); + } else { + sb.setCharAt(starter, (char)c); + sb.deleteCharAt(starter+1); + // The composite is shorter than the starter, + // move the intermediate characters forward one. + starterIsSupplementary=false; + --p; + } + } else if(composite>0xffff) { + // The composite is longer than the starter, + // move the intermediate characters back one. + starterIsSupplementary=true; + sb.setCharAt(starter, UTF16.getLeadSurrogate(composite)); + sb.insert(starter+1, UTF16.getTrailSurrogate(composite)); + ++p; + } else { + // both are on the BMP + sb.setCharAt(starter, (char)composite); + } + + // Keep prevCC because we removed the combining mark. + + if(p==sb.length()) { + break; + } + // Is the composite a starter that combines forward? + if((compositeAndFwd&1)!=0) { + compositionsList= + getCompositionsListForComposite(getRawNorm16(composite)); + } else { + compositionsList=-1; + } + + // We combined; continue with looking for compositions. 
+ continue; + } + } + + // no combination this time + prevCC=cc; + if(p==sb.length()) { + break; + } + + // If c did not combine, then check if it is a starter. + if(cc==0) { + // Found a new starter. + if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { + // It may combine with something, prepare for it. + if(c<=0xffff) { + starterIsSupplementary=false; + starter=p-1; + } else { + starterIsSupplementary=true; + starter=p-2; + } + } + } else if(onlyContiguous) { + // FCC: no discontiguous compositions; any intervening character blocks. + compositionsList=-1; + } + } + buffer.flush(); + } + + /** + * Does c have a composition boundary before it? + * True if its decomposition begins with a character that has + * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). + * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes + * (isCompYesAndZeroCC()) so we need not decompose. + */ + private boolean hasCompBoundaryBefore(int c, int norm16) { + return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); + } + private boolean norm16HasCompBoundaryBefore(int norm16) { + return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); + } + private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { + return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); + } + private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { + return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && + (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); + } + private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { + return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); + } + /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */ + private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { + return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 
+ (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); + } + + private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { + while(p>0) { + int c=Character.codePointBefore(s, p); + int norm16 = getNorm16(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } + p-=Character.charCount(c); + if(hasCompBoundaryBefore(c, norm16)) { + break; + } + } + return p; + } + private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { + while(p<limit) { + int c=Character.codePointAt(s, p); + int norm16=normTrie.get(c); + if(hasCompBoundaryBefore(c, norm16)) { + break; + } + p+=Character.charCount(c); + if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { + break; + } + } + return p; + } + + + private int findNextFCDBoundary(CharSequence s, int p, int limit) { + while(p<limit) { + int c=Character.codePointAt(s, p); + int norm16; + if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { + break; + } + p+=Character.charCount(c); + if (norm16HasDecompBoundaryAfter(norm16)) { + break; + } + } + return p; + } + + /** + * Get the canonical decomposition + * sherman for ComposedCharIter + */ + public static int getDecompose(int chars[], String decomps[]) { + Normalizer2 impl = Normalizer2.getNFDInstance(); + + int length=0; + int norm16 = 0; + int ch = -1; + int i = 0; + + while (++ch < 0x2fa1e) { //no cannoical above 0x3ffff + //TBD !!!! 
the hack code heres save us about 50ms for startup + //need a better solution/lookup + if (ch == 0x30ff) + ch = 0xf900; + else if (ch == 0x115bc) + ch = 0x1d15e; + else if (ch == 0x1d1c1) + ch = 0x2f800; + + String s = impl.getDecomposition(ch); + + if(s != null && i < chars.length) { + chars[i] = ch; + decomps[i++] = s; + } + } + return i; + } + + //------------------------------------------------------ + // special method for Collation (RBTableBuilder.build()) + //------------------------------------------------------ + private static boolean needSingleQuotation(char c) { + return (c >= 0x0009 && c <= 0x000D) || + (c >= 0x0020 && c <= 0x002F) || + (c >= 0x003A && c <= 0x0040) || + (c >= 0x005B && c <= 0x0060) || + (c >= 0x007B && c <= 0x007E); + } + + public static String canonicalDecomposeWithSingleQuotation(String string) { + Normalizer2 impl = Normalizer2.getNFDInstance(); + char[] src = string.toCharArray(); + int srcIndex = 0; + int srcLimit = src.length; + char[] dest = new char[src.length * 3]; //MAX_BUF_SIZE_DECOMPOSE = 3 + int destIndex = 0; + int destLimit = dest.length; + + int prevSrc; + String norm; + int reorderStartIndex, length; + char c1, c2; + int cp; + int minNoMaybe = 0x00c0; + int cc, prevCC, trailCC; + char[] p; + int pStart; + + // initialize + reorderStartIndex = 0; + prevCC = 0; + norm = null; + cp = 0; + pStart = 0; + + cc = trailCC = -1; // initialize to bogus value + c1 = 0; + for (;;) { + prevSrc=srcIndex; + //quick check (1)less than minNoMaybe (2)no decomp (3)hangual + while (srcIndex != srcLimit && + ((c1 = src[srcIndex]) < minNoMaybe || + (norm = impl.getDecomposition(cp = string.codePointAt(srcIndex))) == null || + (c1 >= '\uac00' && c1 <= '\ud7a3'))) { // Hangul Syllables + prevCC = 0; + srcIndex += (cp < 0x10000) ? 
1 : 2; + } + + // copy these code units all at once + if (srcIndex != prevSrc) { + length = srcIndex - prevSrc; + if ((destIndex + length) <= destLimit) { + System.arraycopy(src,prevSrc,dest,destIndex,length); + } + + destIndex += length; + reorderStartIndex = destIndex; + } + + // end of source reached? + if (srcIndex == srcLimit) { + break; + } + + // cp already contains *src and norm32 is set for it, increment src + srcIndex += (cp < 0x10000) ? 1 : 2; + + if (cp < Character.MIN_SUPPLEMENTARY_CODE_POINT) { + c2 = 0; + length = 1; + + if (Character.isHighSurrogate(c1) + || Character.isLowSurrogate(c1)) { + norm = null; + } + } else { + length = 2; + c2 = src[srcIndex-1]; + } + + // get the decomposition and the lead and trail cc's + if (norm == null) { + // cp does not decompose + cc = trailCC = UCharacter.getCombiningClass(cp); + p = null; + pStart = -1; + } else { + + pStart = 0; + p = norm.toCharArray(); + length = p.length; + int cpNum = norm.codePointCount(0, length); + cc= UCharacter.getCombiningClass(norm.codePointAt(0)); + trailCC= UCharacter.getCombiningClass(norm.codePointAt(cpNum-1)); + if (length == 1) { + // fastpath a single code unit from decomposition + c1 = p[pStart]; + c2 = 0; + p = null; + pStart = -1; + } + } + + if((destIndex + length * 3) >= destLimit) { // 2 SingleQuotations + // buffer overflow + char[] tmpBuf = new char[destLimit * 2]; + System.arraycopy(dest, 0, tmpBuf, 0, destIndex); + dest = tmpBuf; + destLimit = dest.length; + } + + // append the decomposition to the destination buffer, assume length>0 + { + int reorderSplit = destIndex; + if (p == null) { + // fastpath: single code point + if (needSingleQuotation(c1)) { + //if we need single quotation, no need to consider "prevCC" + //and it must NOT be a supplementary pair + dest[destIndex++] = '\''; + dest[destIndex++] = c1; + dest[destIndex++] = '\''; + trailCC = 0; + } else if(cc != 0 && cc < prevCC) { + // (c1, c2) is out of order with respect to the preceding + // text + 
destIndex += length; + trailCC = insertOrdered(dest, reorderStartIndex, + reorderSplit, destIndex, c1, c2, cc); + } else { + // just append (c1, c2) + dest[destIndex++] = c1; + if(c2 != 0) { + dest[destIndex++] = c2; + } + } + } else { + // general: multiple code points (ordered by themselves) + // from decomposition + if (needSingleQuotation(p[pStart])) { + dest[destIndex++] = '\''; + dest[destIndex++] = p[pStart++]; + dest[destIndex++] = '\''; + length--; + do { + dest[destIndex++] = p[pStart++]; + } while(--length > 0); + } else if (cc != 0 && cc < prevCC) { + destIndex += length; + trailCC = mergeOrdered(dest, reorderStartIndex, + reorderSplit, p, pStart, + pStart+length); + } else { + // just append the decomposition + do { + dest[destIndex++] = p[pStart++]; + } while (--length > 0); + } + } + } + prevCC = trailCC; + if(prevCC == 0) { + reorderStartIndex = destIndex; + } + } + + return new String(dest, 0, destIndex); + } + + /** + * simpler, single-character version of mergeOrdered() - + * bubble-insert one single code point into the preceding string + * which is already canonically ordered + * (c, c2) may or may not yet have been inserted at src[current]..src[p] + * + * it must be p=current+lengthof(c, c2) i.e. p=current+(c2==0 ? 
1 : 2) + * + * before: src[start]..src[current] is already ordered, and + * src[current]..src[p] may or may not hold (c, c2) but + * must be exactly the same length as (c, c2) + * after: src[start]..src[p] is ordered + * + * @return the trailing combining class + */ + private static int/*unsigned byte*/ insertOrdered(char[] source, + int start, + int current, int p, + char c1, char c2, + int/*unsigned byte*/ cc) { + int back, preBack; + int r; + int prevCC, trailCC=cc; + + if (start<current && cc!=0) { + // search for the insertion point where cc>=prevCC + preBack=back=current; + + PrevArgs prevArgs = new PrevArgs(); + prevArgs.current = current; + prevArgs.start = start; + prevArgs.src = source; + prevArgs.c1 = c1; + prevArgs.c2 = c2; + + // get the prevCC + prevCC=getPrevCC(prevArgs); + preBack = prevArgs.current; + + if(cc<prevCC) { + // this will be the last code point, so keep its cc + trailCC=prevCC; + back=preBack; + while(start<preBack) { + prevCC=getPrevCC(prevArgs); + preBack=prevArgs.current; + if(cc>=prevCC) { + break; + } + back=preBack; + } + + // this is where we are right now with all these indicies: + // [start]..[pPreBack] 0..? 
code points that we can ignore + // [pPreBack]..[pBack] 0..1 code points with prevCC<=cc + // [pBack]..[current] 0..n code points with >cc, move up to insert (c, c2) + // [current]..[p] 1 code point (c, c2) with cc + + // move the code units in between up + r=p; + do { + source[--r]=source[--current]; + } while (back!=current); + } + } + + // insert (c1, c2) + source[current] = c1; + if (c2!=0) { + source[(current+1)] = c2; + } + + // we know the cc of the last code point + return trailCC; + } + /** + * merge two UTF-16 string parts together + * to canonically order (order by combining classes) their concatenation + * + * the two strings may already be adjacent, so that the merging is done + * in-place if the two strings are not adjacent, then the buffer holding the + * first one must be large enough + * the second string may or may not be ordered in itself + * + * before: [start]..[current] is already ordered, and + * [next]..[limit] may be ordered in itself, but + * is not in relation to [start..current[ + * after: [start..current+(limit-next)[ is ordered + * + * the algorithm is a simple bubble-sort that takes the characters from + * src[next++] and inserts them in correct combining class order into the + * preceding part of the string + * + * since this function is called much less often than the single-code point + * insertOrdered(), it just uses that for easier maintenance + * + * @return the trailing combining class + */ + private static int /*unsigned byte*/ mergeOrdered(char[] source, + int start, + int current, + char[] data, + int next, + int limit) { + int r; + int /*unsigned byte*/ cc, trailCC=0; + boolean adjacent; + + adjacent= current==next; + NextCCArgs ncArgs = new NextCCArgs(); + ncArgs.source = data; + ncArgs.next = next; + ncArgs.limit = limit; + + if(start!=current) { + + while(ncArgs.next<ncArgs.limit) { + cc=getNextCC(ncArgs); + if(cc==0) { + // does not bubble back + trailCC=0; + if(adjacent) { + current=ncArgs.next; + } else { + 
data[current++]=ncArgs.c1; + if(ncArgs.c2!=0) { + data[current++]=ncArgs.c2; + } + } + break; + } else { + r=current+(ncArgs.c2==0 ? 1 : 2); + trailCC=insertOrdered(source,start, current, r, + ncArgs.c1, ncArgs.c2, cc); + current=r; + } + } + } + + if(ncArgs.next==ncArgs.limit) { + // we know the cc of the last code point + return trailCC; + } else { + if(!adjacent) { + // copy the second string part + do { + source[current++]=data[ncArgs.next++]; + } while(ncArgs.next!=ncArgs.limit); + ncArgs.limit=current; + } + PrevArgs prevArgs = new PrevArgs(); + prevArgs.src = data; + prevArgs.start = start; + prevArgs.current = ncArgs.limit; + return getPrevCC(prevArgs); + } + + } + private static final class PrevArgs{ + char[] src; + int start; + int current; + char c1; + char c2; + } + + private static final class NextCCArgs{ + char[] source; + int next; + int limit; + char c1; + char c2; + } + private static int /*unsigned byte*/ getNextCC(NextCCArgs args) { + args.c1=args.source[args.next++]; + args.c2=0; + + if (UTF16.isTrailSurrogate(args.c1)) { + /* unpaired second surrogate */ + return 0; + } else if (!UTF16.isLeadSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.next!=args.limit && + UTF16.isTrailSurrogate(args.c2=args.source[args.next])){ + ++args.next; + return UCharacter.getCombiningClass(Character.toCodePoint(args.c1, args.c2)); + } else { + /* unpaired first surrogate */ + args.c2=0; + return 0; + } + } + private static int /*unsigned*/ getPrevCC(PrevArgs args) { + args.c1=args.src[--args.current]; + args.c2=0; + + if (args.c1 < MIN_CCC_LCCC_CP) { + return 0; + } else if (UTF16.isLeadSurrogate(args.c1)) { + /* unpaired first surrogate */ + return 0; + } else if (!UTF16.isTrailSurrogate(args.c1)) { + return UCharacter.getCombiningClass(args.c1); + } else if (args.current!=args.start && + UTF16.isLeadSurrogate(args.c2=args.src[args.current-1])) { + --args.current; + return 
UCharacter.getCombiningClass(Character.toCodePoint(args.c2, args.c1)); + } else { + /* unpaired second surrogate */ + args.c2=0; + return 0; + } + } + + private int getPreviousTrailCC(CharSequence s, int start, int p) { + if (start == p) { + return 0; + } + return getFCD16(Character.codePointBefore(s, p)); + } + + private VersionInfo dataVersion; + + // BMP code point thresholds for quick check loops looking at single UTF-16 code units. + private int minDecompNoCP; + private int minCompNoMaybeCP; + private int minLcccCP; + + // Norm16 value thresholds for quick check combinations and types of extra data. + private int minYesNo; + private int minYesNoMappingsOnly; + private int minNoNo; + private int minNoNoCompBoundaryBefore; + private int minNoNoCompNoMaybeCC; + private int minNoNoEmpty; + private int limitNoNo; + private int centerNoNoDelta; + private int minMaybeYes; + + private CodePointTrie.Fast16 normTrie; + private String maybeYesCompositions; + private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters + private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/Punycode.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * Copyright (C) 2003-2004, International Business Machines Corporation and * + * others. All Rights Reserved. 
*
 *******************************************************************************
 */
//
// CHANGELOG
//      2005-05-19 Edward Wang
//          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
//          - move from package com.ibm.icu.text to package sun.net.idn
//          - use ParseException instead of StringPrepParseException
//      2007-08-14 Martin Buchholz
//          - remove redundant casts
//
package jdk.internal.icu.impl;

import java.text.ParseException;
import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.text.UTF16;

/**
 * Ported code from ICU punycode.c
 *
 * <p>Both directions work on fixed buffers of MAX_CP_COUNT (256) entries;
 * longer input or output ends in an IndexOutOfBoundsException.
 *
 * @author ram
 */
public final class Punycode {

    /* Punycode parameters for Bootstring */
    private static final int BASE           = 36;
    private static final int TMIN           = 1;
    private static final int TMAX           = 26;
    private static final int SKEW           = 38;
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;

    /* "Basic" Unicode/ASCII code points */
    private static final int HYPHEN         = 0x2d;
    private static final int DELIMITER      = HYPHEN;

    private static final int ZERO           = 0x30;
    private static final int NINE           = 0x39;

    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;

    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

    //  TODO: eliminate the 256 limitation
    private static final int MAX_CP_COUNT   = 256;

    private static final int UINT_MAGIC     = 0x80000000;
    private static final long ULONG_MAGIC   = 0x8000000000000000L;

    /** Bias adaptation function of the Bootstring algorithm. */
    private static int adaptBias(int delta, int length, boolean firstTime){
        if(firstTime){
            delta /=DAMP;
        }else{
            delta /=  2;
        }
        delta += delta/length;

        int count=0;
        for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
            delta/=(BASE-TMIN);
        }

        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    }

    /**
     * basicToDigit[] contains the numeric value of a basic code
     * point (for use in representing integers) in the range 0 to
     * BASE-1, or -1 if b does not represent a value.
     * The table has 256 entries; only indexes 0..0x7f can hold digits.
     */
    static final int[]    basicToDigit= new int[]{
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,

        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    /** Maps an ASCII letter to upper or lower case; other values pass through. */
    private static char asciiCaseMap(char b, boolean uppercase) {
        if(uppercase) {
            if(SMALL_A<=b && b<=SMALL_Z) {
                b-=(SMALL_A-CAPITAL_A);
            }
        } else {
            if(CAPITAL_A<=b && b<=CAPITAL_Z) {
                b+=(SMALL_A-CAPITAL_A);
            }
        }
        return b;
    }

    /**
     * digitToBasic() returns the basic code point whose value
     * (when used for representing integers) is d, which must be in the
     * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
     * set, in which case the uppercase form is used.
     */
    private static char digitToBasic(int digit, boolean uppercase) {
        /*  0..25 map to ASCII a..z or A..Z */
        /* 26..35 map to ASCII 0..9         */
        if(digit<26) {
            if(uppercase) {
                return (char)(CAPITAL_A+digit);
            } else {
                return (char)(SMALL_A+digit);
            }
        } else {
            return (char)((ZERO-26)+digit);
        }
    }
    /**
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output will be represented as an array of ASCII code points.
     *
     * @param src       the Unicode text to encode
     * @param caseFlags one flag per UTF-16 code unit of src, or null; a set
     *                  flag requests the uppercase form for that character
     * @return the Punycode form of src
     * @throws ParseException if src contains an unpaired surrogate
     * @throws IndexOutOfBoundsException if src has more than MAX_CP_COUNT code
     *         points or the output does not fit into MAX_CP_COUNT chars
     */
    public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{

        int[] cpBuffer = new int[MAX_CP_COUNT];
        int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
        char c, c2;
        int srcLength = src.length();
        int destCapacity = MAX_CP_COUNT;
        char[] dest = new char[destCapacity];
        StringBuffer result = new StringBuffer();
        /*
         * Handle the basic code points and
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        srcCPCount=destLength=0;

        for(j=0; j<srcLength; ++j) {
            if(srcCPCount==MAX_CP_COUNT) {
                /* too many input code points */
                throw new IndexOutOfBoundsException();
            }
            c=src.charAt(j);
            if(isBasic(c)) {
                if(destLength<destCapacity) {
                    cpBuffer[srcCPCount++]=0;
                    dest[destLength]=
                        caseFlags!=null ?
                            asciiCaseMap(c, caseFlags[j]) :
                            c;
                }
                ++destLength;
            } else {
                n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31;
                if(!UTF16.isSurrogate(c)) {
                    n|=c;
                } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                    ++j;

                    n|=UCharacter.getCodePoint(c, c2);
                } else {
                    /* error: unmatched surrogate */
                    throw new ParseException("Illegal char found", -1);
                }
                cpBuffer[srcCPCount++]=n;
            }
        }

        /* Finish the basic string - if it is not empty - with a delimiter. */
        basicLength=destLength;
        if(basicLength>0) {
            if(destLength<destCapacity) {
                dest[destLength]=DELIMITER;
            }
            ++destLength;
        }

        /*
         * handledCPCount is the number of code points that have been handled
         * basicLength is the number of basic code points
         * destLength is the number of chars that have been output
         */

        /* Initialize the state: */
        n=INITIAL_N;
        delta=0;
        bias=INITIAL_BIAS;

        /* Main encoding loop: */
        for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
            /*
             * All non-basic code points < n have been handled already.
             * Find the next larger one:
             */
            for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(n<=q && q<m) {
                    m=q;
                }
            }

            /*
             * Increase delta enough to advance the decoder's
             * <n,i> state to <m,0>, but guard against overflow:
             */
            if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
                throw new RuntimeException("Internal program error");
            }
            delta+=(m-n)*(handledCPCount+1);
            n=m;

            /* Encode a sequence of same code points n */
            for(j=0; j<srcCPCount; ++j) {
                q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
                if(q<n) {
                    ++delta;
                } else if(q==n) {
                    /* Represent delta as a generalized variable-length integer: */
                    for(q=delta, k=BASE; /* no condition */; k+=BASE) {

                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
                        } else if(t>TMAX) {
                            t=TMAX;
                        }
                        */

                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
                        } else if(k>=(bias+TMAX)) {
                            t=TMAX;
                        }

                        if(q<t) {
                            break;
                        }

                        if(destLength<destCapacity) {
                            dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
                        }
                        q=(q-t)/(BASE-t);
                    }

                    if(destLength<destCapacity) {
                        dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
                    }
                    bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
                    delta=0;
                    ++handledCPCount;
                }
            }

            ++delta;
            ++n;
        }

        return result.append(dest, 0, destLength);
    }

    /** True for ASCII, i.e. code points below INITIAL_N (0x80). */
    private static boolean isBasic(int ch){
        return (ch < INITIAL_N);
    }

    private static boolean isBasicUpperCase(int ch){
        return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
    }

    private static boolean isSurrogate(int ch){
        return (((ch)&0xfffff800)==0xd800);
    }
    /**
     * Converts Punycode to Unicode.
     * The Unicode string will be at most as long as the Punycode string.
     *
     * @param src       the Punycode text to decode
     * @param caseFlags if not null, receives one flag per UTF-16 code unit of
     *                  the output; set when the corresponding input character
     *                  was an uppercase ASCII letter
     * @return the decoded Unicode text
     * @throws ParseException if src contains a non-ASCII character, is not
     *         well-formed Punycode, or decodes to an invalid code point
     * @throws IndexOutOfBoundsException if the output does not fit into
     *         MAX_CP_COUNT chars
     */
    public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
                               throws ParseException{
        int srcLength = src.length();
        StringBuffer result = new StringBuffer();
        int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
                destCPCount, firstSupplementaryIndex, cpLength;
        char b;
        int destCapacity = MAX_CP_COUNT;
        char[] dest = new char[destCapacity];

        /*
         * Handle the basic code points:
         * Let basicLength be the number of input code points
         * before the last delimiter, or 0 if there is none,
         * then copy the first basicLength code points to the output.
         *
         * The two following loops iterate backward.
         */
        for(j=srcLength; j>0;) {
            if(src.charAt(--j)==DELIMITER) {
                break;
            }
        }
        destLength=basicLength=destCPCount=j;

        while(j>0) {
            b=src.charAt(--j);
            if(!isBasic(b)) {
                throw new ParseException("Illegal char found", -1);
            }

            if(j<destCapacity) {
                dest[j]= b;

                if(caseFlags!=null) {
                    caseFlags[j]=isBasicUpperCase(b);
                }
            }
        }

        /* Initialize the state: */
        n=INITIAL_N;
        i=0;
        bias=INITIAL_BIAS;
        firstSupplementaryIndex=1000000000;

        /*
         * Main decoding loop:
         * Start just after the last delimiter if any
         * basic code points were copied; start at the beginning otherwise.
         */
        for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
            /*
             * in is the index of the next character to be consumed, and
             * destCPCount is the number of code points in the output array.
             *
             * Decode a generalized variable-length integer into delta,
             * which gets added to i.  The overflow checking is easier
             * if we increase i as we go, then subtract off its starting
             * value at the end to obtain delta.
             */
            for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
                if(in>=srcLength) {
                    throw new ParseException("Illegal char found", -1);
                }

                /*
                 * Only basic (ASCII) code points can be digits. Reject anything
                 * else up front: narrowing the char to a byte would yield a
                 * negative table index for 0x80..0xff and would alias higher
                 * code points onto ASCII digits.
                 */
                b=src.charAt(in++);
                if(!isBasic(b)) {
                    throw new ParseException("Invalid char found", -1);
                }
                digit=basicToDigit[b];
                if(digit<0) {
                    throw new ParseException("Invalid char found", -1);
                }
                if(digit>(0x7fffffff-i)/w) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }

                i+=digit*w;
                t=k-bias;
                if(t<TMIN) {
                    t=TMIN;
                } else if(k>=(bias+TMAX)) {
                    t=TMAX;
                }
                if(digit<t) {
                    break;
                }

                if(w>0x7fffffff/(BASE-t)) {
                    /* integer overflow */
                    throw new ParseException("Illegal char found", -1);
                }
                w*=BASE-t;
            }

            /*
             * Modification from sample code:
             * Increments destCPCount here,
             * where needed instead of in for() loop tail.
             */
            ++destCPCount;
            bias=adaptBias(i-oldi, destCPCount, (oldi==0));

            /*
             * i was supposed to wrap around from (incremented) destCPCount to 0,
             * incrementing n each time, so we'll fix that now:
             */
            if(i/destCPCount>(0x7fffffff-n)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }

            n+=i/destCPCount;
            i%=destCPCount;
            /* not needed for Punycode: */
            /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */

            if(n>0x10ffff || isSurrogate(n)) {
                /* Unicode code point overflow */
                throw new ParseException("Illegal char found", -1);
            }

            /* Insert n at position i of the output: */
            cpLength=UTF16.getCharCount(n);
            if((destLength+cpLength)<destCapacity) {
                int codeUnitIndex;

                /*
                 * Handle indexes when supplementary code points are present.
                 *
                 * In almost all cases, there will be only BMP code points before i
                 * and even in the entire string.
                 * This is handled with the same efficiency as with UTF-32.
                 *
                 * Only the rare cases with supplementary code points are handled
                 * more slowly - but not too bad since this is an insertion anyway.
                 */
                if(i<=firstSupplementaryIndex) {
                    codeUnitIndex=i;
                    if(cpLength>1) {
                        firstSupplementaryIndex=codeUnitIndex;
                    } else {
                        ++firstSupplementaryIndex;
                    }
                } else {
                    codeUnitIndex=firstSupplementaryIndex;
                    codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
                }

                /* use the UChar index codeUnitIndex instead of the code point index i */
                if(codeUnitIndex<destLength) {
                    System.arraycopy(dest, codeUnitIndex,
                                     dest, codeUnitIndex+cpLength,
                                    (destLength-codeUnitIndex));
                    if(caseFlags!=null) {
                        System.arraycopy(caseFlags, codeUnitIndex,
                                         caseFlags, codeUnitIndex+cpLength,
                                         destLength-codeUnitIndex);
                    }
                }
                if(cpLength==1) {
                    /* BMP, insert one code unit */
                    dest[codeUnitIndex]=(char)n;
                } else {
                    /* supplementary character, insert two code units */
                    dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
                    dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
                }
                if(caseFlags!=null) {
                    /* Case of last character determines uppercase flag: */
                    caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
                    if(cpLength==2) {
                        caseFlags[codeUnitIndex+1]=false;
                    }
                }
            }
            destLength+=cpLength;
            ++i;
        }
        result.append(dest, 0, destLength);
        return result;
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/ReplaceableUCharacterIterator.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved * + * * + * The original version of this source code and documentation is copyrighted * + * and owned by IBM, These materials are provided under terms of a License * + * Agreement between IBM and Sun. This technology is protected by multiple * + * US and International patents. This notice and attribution to IBM may not * + * to removed. 
* + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.text.Replaceable; +import jdk.internal.icu.text.ReplaceableString; +import jdk.internal.icu.text.UCharacterIterator; + +/** + * DLF docs must define behavior when Replaceable is mutated underneath + * the iterator. + * + * This and ICUCharacterIterator share some code, maybe they should share + * an implementation, or the common state and implementation should be + * moved up into UCharacterIterator. + * + * What are first, last, and getBeginIndex doing here?!?!?! + */ +public class ReplaceableUCharacterIterator extends UCharacterIterator { + + // public constructor ------------------------------------------------------ + + /** + * Public constructor + * @param str text which the iterator will be based on + */ + public ReplaceableUCharacterIterator(String str){ + if(str==null){ + throw new IllegalArgumentException(); + } + this.replaceable = new ReplaceableString(str); + this.currentIndex = 0; + } + + /** + * Public constructor + * @param buf buffer of text on which the iterator will be based + */ + public ReplaceableUCharacterIterator(StringBuffer buf){ + if(buf==null){ + throw new IllegalArgumentException(); + } + this.replaceable = new ReplaceableString(buf); + this.currentIndex = 0; + } + + // public methods ---------------------------------------------------------- + + /** + * Creates a copy of this iterator, does not clone the underlying + * <code>Replaceable</code>object + * @return copy of this iterator + */ + public Object clone(){ + try { + return super.clone(); + } catch (CloneNotSupportedException e) { + return null; // never invoked + } + } + + /** + * Returns the current UTF16 character. 
+ * @return current UTF16 character + */ + public int current(){ + if (currentIndex < replaceable.length()) { + return replaceable.charAt(currentIndex); + } + return DONE; + } + + /** + * Returns the length of the text + * @return length of the text + */ + public int getLength(){ + return replaceable.length(); + } + + /** + * Gets the current currentIndex in text. + * @return current currentIndex in text. + */ + public int getIndex(){ + return currentIndex; + } + + /** + * Returns next UTF16 character and increments the iterator's currentIndex by 1. + * If the resulting currentIndex is greater or equal to the text length, the + * currentIndex is reset to the text length and a value of DONECODEPOINT is + * returned. + * @return next UTF16 character in text or DONE if the new currentIndex is off the + * end of the text range. + */ + public int next(){ + if (currentIndex < replaceable.length()) { + return replaceable.charAt(currentIndex++); + } + return DONE; + } + + + /** + * Returns previous UTF16 character and decrements the iterator's currentIndex by + * 1. + * If the resulting currentIndex is less than 0, the currentIndex is reset to 0 and a + * value of DONECODEPOINT is returned. + * @return next UTF16 character in text or DONE if the new currentIndex is off the + * start of the text range. + */ + public int previous(){ + if (currentIndex > 0) { + return replaceable.charAt(--currentIndex); + } + return DONE; + } + + /** + * Sets the currentIndex to the specified currentIndex in the text and returns that + * single UTF16 character at currentIndex. + * This assumes the text is stored as 16-bit code units. + * @param currentIndex the currentIndex within the text. + * @exception IllegalArgumentException is thrown if an invalid currentIndex is + * supplied. i.e. currentIndex is out of bounds. 
+ */ + public void setIndex(int currentIndex) { + if (currentIndex < 0 || currentIndex > replaceable.length()) { + throw new IllegalArgumentException(); + } + this.currentIndex = currentIndex; + } + + public int getText(char[] fillIn, int offset){ + int length = replaceable.length(); + if(offset < 0 || offset + length > fillIn.length){ + throw new IndexOutOfBoundsException(Integer.toString(length)); + } + replaceable.getChars(0,length,fillIn,offset); + return length; + } + + // private data members ---------------------------------------------------- + + /** + * Replaceable object + */ + private Replaceable replaceable; + /** + * Current currentIndex + */ + private int currentIndex; + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/StringPrepDataReader.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* +/* + ****************************************************************************** + * Copyright (C) 2003, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ****************************************************************************** + * + * Created on May 2, 2003 + * + * To change the template for this generated file go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/impl/StringPrepDataReader.java +// - move from package com.ibm.icu.impl to package sun.net.idn +// +package jdk.internal.icu.impl; + +import java.io.DataInputStream; +import java.io.IOException; +import java.io.InputStream; + +import jdk.internal.icu.impl.ICUBinary; + + +/** + * @author ram + * + * To change the template for this generated type comment go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +public final class StringPrepDataReader implements ICUBinary.Authenticate { + + /** + * <p>private constructor.</p> + * @param inputStream ICU uprop.dat file input stream + * @exception IOException throw if data file fails authentication + * @draft 2.1 + */ + public StringPrepDataReader(InputStream inputStream) + throws IOException{ + + unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this); + + + dataInputStream = new DataInputStream(inputStream); + + } + + public void read(byte[] idnaBytes, + char[] mappingTable) + throws IOException{ + + //Read the bytes that make up the idnaTrie + dataInputStream.read(idnaBytes); + + //Read the extra data + for(int i=0;i<mappingTable.length;i++){ + mappingTable[i]=dataInputStream.readChar(); + } + } + + public byte[] getDataFormatVersion(){ + return DATA_FORMAT_VERSION; + } + + public boolean isDataVersionAcceptable(byte version[]){ + return version[0] == DATA_FORMAT_VERSION[0] + && version[2] == DATA_FORMAT_VERSION[2] + && version[3] == DATA_FORMAT_VERSION[3]; + } + public int[] readIndexes(int length)throws IOException{ + int[] indexes = new int[length]; + //Read the indexes + for (int i = 0; i <length ; i++) { + indexes[i] = dataInputStream.readInt(); + } 
+ return indexes; + } + + public byte[] getUnicodeVersion(){ + return unicodeVersion; + } + // private data members ------------------------------------------------- + + + /** + * ICU data file input stream + */ + private DataInputStream dataInputStream; + private byte[] unicodeVersion; + /** + * File format version that this class understands. + * No guarantees are made if a older version is used + * see store.c of gennorm for more information and values + */ + ///* dataFormat="SPRP" 0x53, 0x50, 0x52, 0x50 */ + private static final byte DATA_FORMAT_ID[] = {(byte)0x53, (byte)0x50, + (byte)0x52, (byte)0x50}; + private static final byte DATA_FORMAT_VERSION[] = {(byte)0x3, (byte)0x2, + (byte)0x5, (byte)0x2}; + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/Trie.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import jdk.internal.icu.lang.UCharacter; +import jdk.internal.icu.text.UTF16; + +import java.io.DataInputStream; +import java.io.InputStream; +import java.io.IOException; + +/** + * <p>A trie is a kind of compressed, serializable table of values + * associated with Unicode code points (0..0x10ffff).</p> + * <p>This class defines the basic structure of a trie and provides methods + * to <b>retrieve the offsets to the actual data</b>.</p> + * <p>Data will be the form of an array of basic types, char or int.</p> + * <p>The actual data format will have to be specified by the user in the + * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> + * <p>This trie implementation is optimized for getting offset while walking + * forward through a UTF-16 string. + * Therefore, the simplest and fastest access macros are the + * fromLead() and fromOffsetTrail() methods. + * The fromBMP() method are a little more complicated; they get offsets even + * for lead surrogate codepoints, while the fromLead() method get special + * "folded" offsets for lead surrogate code units if there is relevant data + * associated with them. + * From such a folded offsets, an offset needs to be extracted to supply + * to the fromOffsetTrail() methods. 
+ * To handle such supplementary codepoints, some offset information are kept + * in the data.</p> + * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve + * that offset from the folded value for the lead surrogate unit.</p> + * <p>For examples of use, see com.ibm.icu.impl.CharTrie or + * com.ibm.icu.impl.IntTrie.</p> + * @author synwee + * @see com.ibm.icu.impl.CharTrie + * @see com.ibm.icu.impl.IntTrie + * @since release 2.1, Jan 01 2002 + */ +public abstract class Trie +{ + // public class declaration ---------------------------------------- + + /** + * Character data in com.ibm.impl.Trie have different user-specified format + * for different purposes. + * This interface specifies methods to be implemented in order for + * com.ibm.impl.Trie, to surrogate offset information encapsulated within + * the data. + */ + public static interface DataManipulate + { + /** + * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's + * data + * the index array offset of the indexes for that lead surrogate. + * @param value data value for a surrogate from the trie, including the + * folding offset + * @return data offset or 0 if there is no data for the lead surrogate + */ + public int getFoldingOffset(int value); + } + + // default implementation + private static class DefaultGetFoldingOffset implements DataManipulate { + public int getFoldingOffset(int value) { + return value; + } + } + + // protected constructor ------------------------------------------- + + /** + * Trie constructor for CharTrie use. + * @param inputStream ICU data file input stream which contains the + * trie + * @param dataManipulate object containing the information to parse the + * trie data + * @throws IOException thrown when input stream does not have the + * right header. 
+ */ + protected Trie(InputStream inputStream, + DataManipulate dataManipulate) throws IOException + { + DataInputStream input = new DataInputStream(inputStream); + // Magic number to authenticate the data. + int signature = input.readInt(); + m_options_ = input.readInt(); + + if (!checkHeader(signature)) { + throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); + } + + if(dataManipulate != null) { + m_dataManipulate_ = dataManipulate; + } else { + m_dataManipulate_ = new DefaultGetFoldingOffset(); + } + m_isLatin1Linear_ = (m_options_ & + HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; + m_dataOffset_ = input.readInt(); + m_dataLength_ = input.readInt(); + unserialize(inputStream); + } + + // protected data members ------------------------------------------ + + /** + * Lead surrogate code points' index displacement in the index array. + * <pre>{@code + * 0x10000-0xd800=0x2800 + * 0x2800 >> INDEX_STAGE_1_SHIFT_ + * }</pre> + */ + protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; + /** + * Shift size for shifting right the input index. 1..9 + */ + protected static final int INDEX_STAGE_1_SHIFT_ = 5; + /** + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires blocks of stage 2 data to be aligned by + * DATA_GRANULARITY. + * 0..INDEX_STAGE_1_SHIFT + */ + protected static final int INDEX_STAGE_2_SHIFT_ = 2; + /** + * Number of data values in a stage 2 (data array) block. + */ + protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; + /** + * Mask for getting the lower bits from the input index. + * DATA_BLOCK_LENGTH - 1. 
+ */ + protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; + /** + * Surrogate mask to use when shifting offset to retrieve supplementary + * values + */ + protected static final int SURROGATE_MASK_ = 0x3FF; + /** + * Index or UTF16 characters + */ + protected char m_index_[]; + /** + * Internal TrieValue which handles the parsing of the data value. + * This class is to be implemented by the user + */ + protected DataManipulate m_dataManipulate_; + /** + * Start index of the data portion of the trie. CharTrie combines + * index and data into a char array, so this is used to indicate the + * initial offset to the data portion. + * Note this index always points to the initial value. + */ + protected int m_dataOffset_; + /** + * Length of the data array + */ + protected int m_dataLength_; + + // protected methods ----------------------------------------------- + + /** + * Gets the offset to the data which the surrogate pair points to. + * @param lead lead surrogate + * @param trail trailing surrogate + * @return offset to data + */ + protected abstract int getSurrogateOffset(char lead, char trail); + + /** + * Gets the offset to the data which the index ch after variable offset + * points to. + * Note for locating a non-supplementary character data offset, calling + * <p> + * getRawOffset(0, ch); + * </p> + * will do. Otherwise if it is a supplementary character formed by + * surrogates lead and trail. Then we would have to call getRawOffset() + * with getFoldingIndexOffset(). See getSurrogateOffset(). + * @param offset index offset which ch is to start from + * @param ch index to be used after offset + * @return offset to the data + */ + protected final int getRawOffset(int offset, char ch) + { + return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] + << INDEX_STAGE_2_SHIFT_) + + (ch & INDEX_STAGE_3_MASK_); + } + + /** + * Gets the offset to data which the BMP character points to + * Treats a lead surrogate as a normal code point. 
+ * @param ch BMP character + * @return offset to data + */ + protected final int getBMPOffset(char ch) + { + return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE + && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) + ? getRawOffset(LEAD_INDEX_OFFSET_, ch) + : getRawOffset(0, ch); + // using a getRawOffset(ch) makes no diff + } + + /** + * Gets the offset to the data which this lead surrogate character points + * to. + * Data at the returned offset may contain folding offset information for + * the next trailing surrogate character. + * @param ch lead surrogate character + * @return offset to data + */ + protected final int getLeadOffset(char ch) + { + return getRawOffset(0, ch); + } + + /** + * Internal trie getter from a code point. + * Could be faster(?) but longer with + * {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }} + * Gets the offset to data which the codepoint points to + * @param ch codepoint + * @return offset to data + */ + protected final int getCodePointOffset(int ch) + { + // if ((ch >> 16) == 0) slower + if (ch < 0) { + return -1; + } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { + // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works + return getRawOffset(0, (char)ch); + } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { + // BMP codepoint + return getBMPOffset((char)ch); + } else if (ch <= UCharacter.MAX_VALUE) { + // look at the construction of supplementary characters + // trail forms the ends of it. + return getSurrogateOffset(UTF16.getLeadSurrogate(ch), + (char)(ch & SURROGATE_MASK_)); + } else { + // return -1 if there is an error, in this case we return + return -1; + } + } + + /** + * <p>Parses the inputstream and creates the trie index with it.</p> + * <p>This is overwritten by the child classes. + * @param inputStream input stream containing the trie information + * @exception IOException thrown when data reading fails. 
+ */ + protected void unserialize(InputStream inputStream) throws IOException + { + //indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_ + m_index_ = new char[m_dataOffset_]; + DataInputStream input = new DataInputStream(inputStream); + for (int i = 0; i < m_dataOffset_; i ++) { + m_index_[i] = input.readChar(); + } + } + + /** + * Determines if this is a 16 bit trie + * @return true if this is a 16 bit trie + */ + protected final boolean isCharTrie() + { + return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; + } + + // private data members -------------------------------------------- + + /** + * Latin 1 option mask + */ + protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; + /** + * Constant number to authenticate the byte block + */ + protected static final int HEADER_SIGNATURE_ = 0x54726965; + /** + * Header option formatting + */ + private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; + protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; + protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; + + /** + * Flag indicator for Latin quick access data block + */ + private boolean m_isLatin1Linear_; + + /** + * <p>Trie options field.</p> + * <p>options bit field:<br> + * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> + * 8 0 = 16-bit data, 1=32-bit data<br> + * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> + * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> + */ + private int m_options_; + + // private methods --------------------------------------------------- + + /** + * Authenticates raw data header. + * Checking the header information, signature and options. + * @param signature This contains the options and type of a Trie + * @return true if the header is authenticated valid + */ + private final boolean checkHeader(int signature) + { + // check the signature + // Trie in big-endian US-ASCII (0x54726965). + // Magic number to authenticate the data. 
+ if (signature != HEADER_SIGNATURE_) { + return false; + } + + if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != + INDEX_STAGE_1_SHIFT_ || + ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & + HEADER_OPTIONS_SHIFT_MASK_) + != INDEX_STAGE_2_SHIFT_) { + return false; + } + return true; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/Trie2.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,655 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ******************************************************************************* + * Copyright (C) 2009-2014, International Business Machines Corporation and + * others. All Rights Reserved. + ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Iterator; +import java.util.NoSuchElementException; + + +/** + * This is the interface and common implementation of a Unicode Trie2. 
+ * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) + * to 16- or 32-bit integer values. It works best when there are ranges of + * characters with the same value, which is generally the case with Unicode + * character properties. + * + * This is the second common version of a Unicode trie (hence the name Trie2). + * + */ +abstract class Trie2 implements Iterable<Trie2.Range> { + + /** + * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). + * + * Reads from the current position and leaves the buffer after the end of the trie. + * + * The serialized format is identical between ICU4C and ICU4J, so this function + * will work with serialized Trie2s from either. + * + * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending + * on the width of the data. + * + * To obtain the width of the Trie2, check the actual class type of the returned Trie2. + * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will + * return only Tries of their specific type/size. + * + * The serialized Trie2 on the stream may be in either little or big endian byte order. + * This allows using serialized Tries from ICU4C without needing to consider the + * byte order of the system that created them. + * + * @param bytes a byte buffer to the serialized form of a UTrie2. + * @return An unserialized Trie2, ready for use. + * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. + * @throws IOException if a read error occurs in the buffer. + * + */ + public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { + // From ICU4C utrie2_impl.h + // * Trie2 data structure in serialized form: + // * + // * UTrie2Header header; + // * uint16_t index[header.index2Length]; + // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 
+ // * @internal + // */ + // typedef struct UTrie2Header { + // /** "Tri2" in big-endian US-ASCII (0x54726932) */ + // uint32_t signature; + + // /** + // * options bit field: + // * 15.. 4 reserved (0) + // * 3.. 0 UTrie2ValueBits valueBits + // */ + // uint16_t options; + // + // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ + // uint16_t indexLength; + // + // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ + // uint16_t shiftedDataLength; + // + // /** Null index and data blocks, not shifted. */ + // uint16_t index2NullOffset, dataNullOffset; + // + // /** + // * First code point of the single-value range ending with U+10ffff, + // * rounded up and then shifted right by UTRIE2_SHIFT_1. + // */ + // uint16_t shiftedHighStart; + // } UTrie2Header; + + ByteOrder outerByteOrder = bytes.order(); + try { + UTrie2Header header = new UTrie2Header(); + + /* check the signature */ + header.signature = bytes.getInt(); + switch (header.signature) { + case 0x54726932: + // The buffer is already set to the trie data byte order. + break; + case 0x32697254: + // Temporarily reverse the byte order. + boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; + bytes.order(isBigEndian ? 
ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); + header.signature = 0x54726932; + break; + default: + throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); + } + + header.options = bytes.getChar(); + header.indexLength = bytes.getChar(); + header.shiftedDataLength = bytes.getChar(); + header.index2NullOffset = bytes.getChar(); + header.dataNullOffset = bytes.getChar(); + header.shiftedHighStart = bytes.getChar(); + + if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) { + throw new IllegalArgumentException("UTrie2 serialized format error."); + } + + Trie2 This; + This = new Trie2_16(); + This.header = header; + + /* get the length values and offsets */ + This.indexLength = header.indexLength; + This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; + This.index2NullOffset = header.index2NullOffset; + This.dataNullOffset = header.dataNullOffset; + This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; + This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; + This.highValueIndex += This.indexLength; + + // Allocate the Trie2 index array. If the data width is 16 bits, the array also + // includes the space for the data. + + int indexArraySize = This.indexLength; + indexArraySize += This.dataLength; + This.index = new char[indexArraySize]; + + /* Read in the index */ + int i; + for (i=0; i<This.indexLength; i++) { + This.index[i] = bytes.getChar(); + } + + /* Read in the data. 16 bit data goes in the same array as the index. + * 32 bit data goes in its own separate data array. + */ + This.data16 = This.indexLength; + for (i=0; i<This.dataLength; i++) { + This.index[This.data16 + i] = bytes.getChar(); + } + + This.data32 = null; + This.initialValue = This.index[This.dataNullOffset]; + This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET]; + + return This; + } finally { + bytes.order(outerByteOrder); + } + } + + /** + * Get the value for a code point as stored in the Trie2. 
+ * + * @param codePoint the code point + * @return the value + */ + public abstract int get(int codePoint); + + /** + * Get the trie value for a UTF-16 code unit. + * + * A Trie2 stores two distinct values for input in the lead surrogate + * range, one for lead surrogates, which is the value that will be + * returned by this function, and a second value that is returned + * by Trie2.get(). + * + * For code units outside of the lead surrogate range, this function + * returns the same result as Trie2.get(). + * + * This function, together with the alternate value for lead surrogates, + * makes possible very efficient processing of UTF-16 strings without + * first converting surrogate pairs to their corresponding 32 bit code point + * values. + * + * At build-time, enumerate the contents of the Trie2 to see if there + * is non-trivial (non-initialValue) data for any of the supplementary + * code points associated with a lead surrogate. + * If so, then set a special (application-specific) value for the + * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit(). + * + * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial + * data and the code unit is a lead surrogate, then check if a trail surrogate + * follows. If so, assemble the supplementary code point and look up its value + * with Trie2.get(); otherwise reset the lead + * surrogate's value or do a code point lookup for it. + * + * If there is only trivial data for lead and trail surrogates, then processing + * can often skip them. For example, in normalization or case mapping + * all characters that do not have any mappings are simply copied as is. + * + * @param c the code point or lead surrogate value. + * @return the value + */ + public abstract int getFromU16SingleLead(char c); + + /** + * When iterating over the contents of a Trie2, Elements of this type are produced. + * The iterator will return one item for each contiguous range of codepoints having the same value. 
+ * + * When iterating, the same Trie2EnumRange object will be reused and returned for each range. + * If you need to retain complete iteration results, clone each returned Trie2EnumRange, + * or save the range in some other way, before advancing to the next iteration step. + */ + public static class Range { + public int startCodePoint; + public int endCodePoint; // Inclusive. + public int value; + public boolean leadSurrogate; + + public boolean equals(Object other) { + if (other == null || !(other.getClass().equals(getClass()))) { + return false; + } + Range tother = (Range)other; + return this.startCodePoint == tother.startCodePoint && + this.endCodePoint == tother.endCodePoint && + this.value == tother.value && + this.leadSurrogate == tother.leadSurrogate; + } + + public int hashCode() { + int h = initHash(); + h = hashUChar32(h, startCodePoint); + h = hashUChar32(h, endCodePoint); + h = hashInt(h, value); + h = hashByte(h, leadSurrogate? 1: 0); + return h; + } + } + + /** + * Create an iterator over the value ranges in this Trie2. + * Values from the Trie2 are not remapped or filtered, but are returned as they + * are stored in the Trie2. + * + * @return an Iterator + */ + public Iterator<Range> iterator() { + return iterator(defaultValueMapper); + } + + private static ValueMapper defaultValueMapper = new ValueMapper() { + public int map(int in) { + return in; + } + }; + + /** + * Create an iterator over the value ranges from this Trie2. + * Values from the Trie2 are passed through a caller-supplied remapping function, + * and it is the remapped values that determine the ranges that + * will be produced by the iterator. + * + * + * @param mapper provides a function to remap values obtained from the Trie2. 
+ * @return an Iterator + */ + public Iterator<Range> iterator(ValueMapper mapper) { + return new Trie2Iterator(mapper); + } + + /** + * When iterating over the contents of a Trie2, an instance of TrieValueMapper may + * be used to remap the values from the Trie2. The remapped values will be used + * both in determining the ranges of codepoints and as the value to be returned + * for each range. + * + * Example of use, with an anonymous subclass of TrieValueMapper: + * + * + * ValueMapper m = new ValueMapper() { + * int map(int in) {return in & 0x1f;}; + * } + * for (Iterator<Trie2EnumRange> iter = trie.iterator(m); i.hasNext(); ) { + * Trie2EnumRange r = i.next(); + * ... // Do something with the range r. + * } + * + */ + public interface ValueMapper { + public int map(int originalVal); + } + + //-------------------------------------------------------------------------------- + // + // Below this point are internal implementation items. No further public API. + // + //-------------------------------------------------------------------------------- + + /** + * Trie2 data structure in serialized form: + * + * UTrie2Header header; + * uint16_t index[header.index2Length]; + * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] + * + * For Java, this is read from the stream into an instance of UTrie2Header. + * (The C version just places a struct over the raw serialized data.) + * + * @internal + */ + static class UTrie2Header { + /** "Tri2" in big-endian US-ASCII (0x54726932) */ + int signature; + + /** + * options bit field (uint16_t): + * 15.. 4 reserved (0) + * 3.. 0 UTrie2ValueBits valueBits + */ + int options; + + /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ + int indexLength; + + /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ + int shiftedDataLength; + + /** Null index and data blocks, not shifted. 
(uint16_t) */ + int index2NullOffset, dataNullOffset; + + /** + * First code point of the single-value range ending with U+10ffff, + * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) + */ + int shiftedHighStart; + } + + // + // Data members of UTrie2. + // + UTrie2Header header; + char index[]; // Index array. Includes data for 16 bit Tries. + int data16; // Offset to data portion of the index array, if 16 bit data. + // zero if 32 bit data. + int data32[]; // NULL if 16b data is used via index + + int indexLength; + int dataLength; + int index2NullOffset; // 0xffff if there is no dedicated index-2 null block + int initialValue; + + /** Value returned for out-of-range code points and illegal UTF-8. */ + int errorValue; + + /* Start of the last range which ends at U+10ffff, and its value. */ + int highStart; + int highValueIndex; + + int dataNullOffset; + + /** + * Trie2 constants, defining shift widths, index array lengths, etc. + * + * These are needed for the runtime macros but users can treat these as + * implementation details and skip to the actual public API further below. + */ + + static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; + + + /** Shift size for getting the index-1 table offset. */ + static final int UTRIE2_SHIFT_1=6+5; + + /** Shift size for getting the index-2 table offset. */ + static final int UTRIE2_SHIFT_2=5; + + /** + * Difference between the two shift sizes, + * for getting an index-1 offset from an index-2 offset. 6=11-5 + */ + static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; + + /** + * Number of index-1 entries for the BMP. 32=0x20 + * This part of the index-1 table is omitted from the serialized form. + */ + static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; + + /** Number of entries in an index-2 block. 64=0x40 */ + static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2; + + /** Mask for getting the lower bits for the in-index-2-block offset. 
*/ + static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1; + + /** Number of entries in a data block. 32=0x20 */ + static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2; + + /** Mask for getting the lower bits for the in-data-block offset. */ + static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1; + + /** + * Shift size for shifting left the index array values. + * Increases possible data size with 16-bit index values at the cost + * of compactability. + * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. + */ + static final int UTRIE2_INDEX_SHIFT=2; + + /** The alignment size of a data block. Also the granularity for compaction. */ + static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT; + + /** + * The part of the index-2 table for U+D800..U+DBFF stores values for + * lead surrogate code _units_ not code _points_. + * Values for lead surrogate code _points_ are indexed with this portion of the table. + * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) + */ + static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; + static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; + + /** Count the lengths of both BMP pieces. 2080=0x820 */ + static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; + + /** + * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. + * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. + */ + static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; + static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ + + /** + * The index-1 table, only used for supplementary code points, at offset 2112=0x840. + * Variable length, for code points up to highStart, where the last single-value range starts. + * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. 
+ * (For 0x100000 supplementary code points U+10000..U+10ffff.) + * + * The part of the index-2 table for supplementary code points starts + * after this index-1 table. + * + * Both the index-1 table and the following part of the index-2 table + * are omitted completely if there is only BMP data. + */ + static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; + + /** + * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. + * Used with linear access for single bytes 0..0xbf for simple error handling. + * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. + */ + static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; + + /** + * Implementation class for an iterator over a Trie2. + * + * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, + * then returns the special alternate values for the lead surrogates + * + * @internal + */ + class Trie2Iterator implements Iterator<Range> { + + // The normal constructor that configures the iterator to cover the complete + // contents of the Trie2 + Trie2Iterator(ValueMapper vm) { + mapper = vm; + nextStart = 0; + limitCP = 0x110000; + doLeadSurrogates = true; + } + + /** + * The main next() function for Trie2 iterators + * + */ + public Range next() { + if (!hasNext()) { + throw new NoSuchElementException(); + } + if (nextStart >= limitCP) { + // Switch over from iterating normal code point values to + // doing the alternate lead-surrogate values. + doingCodePoints = false; + nextStart = 0xd800; + } + int endOfRange = 0; + int val = 0; + int mappedVal = 0; + + if (doingCodePoints) { + // Iteration over code point values. + val = get(nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEnd(nextStart, limitCP, val); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. 
+ for (;;) { + if (endOfRange >= limitCP-1) { + break; + } + val = get(endOfRange+1); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEnd(endOfRange+1, limitCP, val); + } + } else { + // Iteration over the alternate lead surrogate values. + val = getFromU16SingleLead((char)nextStart); + mappedVal = mapper.map(val); + endOfRange = rangeEndLS((char)nextStart); + // Loop once for each range in the Trie2 with the same raw (unmapped) value. + // Loop continues so long as the mapped values are the same. + for (;;) { + if (endOfRange >= 0xdbff) { + break; + } + val = getFromU16SingleLead((char)(endOfRange+1)); + if (mapper.map(val) != mappedVal) { + break; + } + endOfRange = rangeEndLS((char)(endOfRange+1)); + } + } + returnValue.startCodePoint = nextStart; + returnValue.endCodePoint = endOfRange; + returnValue.value = mappedVal; + returnValue.leadSurrogate = !doingCodePoints; + nextStart = endOfRange+1; + return returnValue; + } + + /** + * + */ + public boolean hasNext() { + return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; + } + + private int rangeEndLS(char startingLS) { + if (startingLS >= 0xdbff) { + return 0xdbff; + } + + int c; + int val = getFromU16SingleLead(startingLS); + for (c = startingLS+1; c <= 0x0dbff; c++) { + if (getFromU16SingleLead((char)c) != val) { + break; + } + } + return c-1; + } + + // + // Iteration State Variables + // + private ValueMapper mapper; + private Range returnValue = new Range(); + // The starting code point for the next range to be returned. + private int nextStart; + // The upper limit for the last normal range to be returned. Normally 0x110000, but + // may be lower when iterating over the code points for a single lead surrogate. + private int limitCP; + + // True while iterating over the Trie2 values for code points. + // False while iterating over the alternate values for lead surrogates. 
+ private boolean doingCodePoints = true; + + // True if the iterator should iterate the special values for lead surrogates in + // addition to the normal values for code points. + private boolean doLeadSurrogates = true; + } + + /** + * Find the last character in a contiguous range of characters with the + * same Trie2 value as the input character. + * + * @param c The character to begin with. + * @return The last contiguous character with the same value. + */ + int rangeEnd(int start, int limitp, int val) { + int c; + int limit = Math.min(highStart, limitp); + + for (c = start+1; c < limit; c++) { + if (get(c) != val) { + break; + } + } + if (c >= highStart) { + c = limitp; + } + return c - 1; + } + + + // + // Hashing implementation functions. FNV hash. Respected public domain algorithm. + // + private static int initHash() { + return 0x811c9DC5; // unsigned 2166136261 + } + + private static int hashByte(int h, int b) { + h = h * 16777619; + h = h ^ b; + return h; + } + + private static int hashUChar32(int h, int c) { + h = Trie2.hashByte(h, c & 255); + h = Trie2.hashByte(h, (c>>8) & 255); + h = Trie2.hashByte(h, c>>16); + return h; + } + + private static int hashInt(int h, int i) { + h = Trie2.hashByte(h, i & 255); + h = Trie2.hashByte(h, (i>>8) & 255); + h = Trie2.hashByte(h, (i>>16) & 255); + h = Trie2.hashByte(h, (i>>24) & 255); + return h; + } + +}
// Source file: src/java.base/share/classes/jdk/internal/icu/impl/Trie2_16.java
/*
 * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/*
 *******************************************************************************
 * Copyright (C) 2009-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package jdk.internal.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;


/**
 * Read-only {@link Trie2} variant whose stored values are 16 bits wide.
 *
 * <p>Both the index tables and the 16 bit data live in the single inherited
 * {@code index} array; every lookup therefore resolves to one final read from
 * that array. See {@link Trie2} for the description of the lookup API.
 *
 * <p>{@link #get(int)} is declared {@code final} (and the class itself is
 * final) so that callers holding a {@code Trie2_16} reference can avoid
 * dispatching through the abstract base class.
 *
 * @author aheninger
 */
public final class Trie2_16 extends Trie2 {

    /**
     * Package-private constructor; instances are populated by the
     * deserialization code in {@link Trie2}.
     */
    Trie2_16() {
    }


    /**
     * Deserializes a 16 bit trie from {@code bytes}.
     *
     * <p>Delegates to {@link Trie2#createFromSerialized(ByteBuffer)} and
     * narrows the result to {@code Trie2_16}.
     *
     * @param bytes buffer positioned at the serialized form of a UTrie2
     * @return the deserialized trie, ready for lookups
     * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2
     * @throws IOException if a read error occurs in the buffer
     * @throws ClassCastException if the deserialized trie is not a {@code Trie2_16}
     */
    public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException {
        Trie2 deserialized = Trie2.createFromSerialized(bytes);
        return (Trie2_16) deserialized;
    }

    /**
     * Returns the value stored for a code point.
     *
     * <p>For inputs in the lead surrogate range (U+D800..U+DBFF) this returns
     * the code <em>point</em> value, which is held in a dedicated index-2
     * section, and not the code <em>unit</em> value returned by
     * {@link #getFromU16SingleLead(char)}.
     *
     * @param codePoint the code point
     * @return the stored value, or {@code errorValue} when {@code codePoint}
     *         is negative or beyond both {@code highStart} and U+10FFFF
     */
    @Override
    public final int get(int codePoint) {
        if (codePoint < 0) {
            return errorValue;
        }

        if (codePoint <= 0xffff) {
            // BMP: single-level lookup. Lead surrogate code points are routed
            // to their own index-2 section because the main BMP section holds
            // the code-unit values for that range.
            boolean isLeadSurrogate = codePoint >= 0xd800 && codePoint <= 0xdbff;
            int index2Position = isLeadSurrogate
                    ? UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)
                    : codePoint >> UTRIE2_SHIFT_2;
            return valueAt(index2Position, codePoint);
        }

        if (codePoint < highStart) {
            // Supplementary code point: index-1 entry selects the index-2
            // block, then proceed as for the BMP. The BMP part of index-1 is
            // not stored, hence the subtraction.
            int index1Position = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)
                    + (codePoint >> UTRIE2_SHIFT_1);
            int index2Position = index[index1Position]
                    + ((codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK);
            return valueAt(index2Position, codePoint);
        }

        // At or above highStart: one shared value up to U+10FFFF, anything
        // beyond that is outside the legal code point range.
        return codePoint <= 0x10ffff ? index[highValueIndex] : errorValue;
    }


    /**
     * Returns the value stored for a UTF-16 code unit.
     *
     * <p>Always uses the main BMP index section, so for a lead surrogate it
     * yields the alternate (code unit) value, whereas {@link #get(int)} yields
     * the code point value. For every other code unit both methods agree.
     *
     * @param codeUnit a 16 bit code unit or lead surrogate value
     * @return the value
     */
    @Override
    public int getFromU16SingleLead(char codeUnit) {
        // A char is always within the BMP, so no range checks are required.
        return valueAt(codeUnit >> UTRIE2_SHIFT_2, codeUnit);
    }

    /**
     * Returns the size of this trie in its serialized form.
     *
     * @return the number of bytes of the serialized trie: 16 bytes of header
     *         (4-byte signature plus six 16 bit fields, see
     *         {@code UTrie2Header}) followed by two bytes for each index and
     *         data entry
     */
    public int getSerializedLength() {
        int entryCount = header.indexLength + dataLength;
        return 16 + entryCount * 2;
    }

    /**
     * Resolves an index-2 entry to the data value for {@code c}.
     *
     * @param index2Position position in {@code index} of the index-2 entry
     * @param c code point or code unit; only its low in-block bits are used
     * @return the 16 bit value read from the data part of {@code index}
     */
    private int valueAt(int index2Position, int c) {
        int dataPosition = (index[index2Position] << UTRIE2_INDEX_SHIFT) + (c & UTRIE2_DATA_MASK);
        return index[dataPosition];
    }
}
// Source file: src/java.base/share/classes/jdk/internal/icu/impl/UBiDiProps.java
/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 *
 * Copyright (C) 2004-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 *******************************************************************************
 * file name: UBiDiProps.java
 * encoding: US-ASCII
 * tab size: 8 (not used)
 * indentation:4
 *
 * created on: 2005jan16
 * created by: Markus W. Scherer
 *
 * Low-level Unicode bidi/shaping properties access.
 * Java port of ubidi_props.h/.c.
 */

package jdk.internal.icu.impl;

import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.util.VersionInfo;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.MissingResourceException;

/**
 * Low-level access to Unicode bidi and shaping properties.
 *
 * <p>The data is read once, from the resource named by {@code DATA_FILE_NAME},
 * when the class is initialized; use the shared {@link #INSTANCE}. Per code
 * point properties are packed into a 16 bit word stored in a
 * {@link Trie2_16}; mirroring exceptions and joining groups are kept in side
 * tables.
 */
public final class UBiDiProps {
    // constructors etc. --------------------------------------------------- ***

    /**
     * Loads the bidi properties data. Port of ubidi_openProps().
     *
     * @throws IOException if the data cannot be read or is malformed
     */
    private UBiDiProps() throws IOException {
        ByteBuffer bytes = ICUBinary.getRequiredData(DATA_FILE_NAME);
        readData(bytes);
    }

    /**
     * Parses the data file: header, indexes[], trie, mirrors[] and the two
     * joining-group arrays, in that order.
     *
     * @param bytes buffer positioned at the start of the data file
     * @throws IOException if indexes[] is too short or the trie is longer
     *         than the size recorded in indexes[]
     */
    private void readData(ByteBuffer bytes) throws IOException {
        // read the header
        ICUBinary.readHeader(bytes, FMT, new IsAcceptable());

        // read indexes[]; its first entry is its own length
        int i, count;
        count = bytes.getInt();
        if (count < IX_TOP) {
            throw new IOException("indexes[0] too small in " + DATA_FILE_NAME);
        }
        indexes = new int[count];

        indexes[0] = count;
        for (i = 1; i < count; ++i) {
            indexes[i] = bytes.getInt();
        }

        // read the trie
        trie = Trie2_16.createFromSerialized(bytes);
        int expectedTrieLength = indexes[IX_TRIE_SIZE];
        int trieLength = trie.getSerializedLength();
        if (trieLength > expectedTrieLength) {
            throw new IOException(DATA_FILE_NAME + ": not enough bytes for the trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);

        // read mirrors[]; left null when the table is empty, which is safe
        // because getMirror() bounds its search by the same length
        count = indexes[IX_MIRROR_LENGTH];
        if (count > 0) {
            mirrors = new int[count];
            for (i = 0; i < count; ++i) {
                mirrors[i] = bytes.getInt();
            }
        }

        // read jgArray[]
        count = indexes[IX_JG_LIMIT] - indexes[IX_JG_START];
        jgArray = new byte[count];
        for (i = 0; i < count; ++i) {
            jgArray[i] = bytes.get();
        }

        // read jgArray2[]
        count = indexes[IX_JG_LIMIT2] - indexes[IX_JG_START2];
        jgArray2 = new byte[count];
        for (i = 0; i < count; ++i) {
            jgArray2[i] = bytes.get();
        }
    }

    /** Accepts data files whose major format version is 2. */
    // implement ICUBinary.Authenticate
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 2;
        }
    }

    // property access functions ------------------------------------------- ***

    /**
     * Returns the bidi class of a code point.
     *
     * @param c the code point
     * @return the bidi class, taken from the low 5 bits of the properties word
     */
    public final int getClass(int c) {
        return getClassFromProps(trie.get(c));
    }

    /**
     * Returns the mirror of {@code c} given its already looked-up properties.
     *
     * @param c the code point
     * @param props the properties word for {@code c}
     * @return the mirrored code point, or {@code c} itself if the mirrors[]
     *         table has no entry for it
     */
    private final int getMirror(int c, int props) {
        int delta = getMirrorDeltaFromProps(props);
        if (delta != ESC_MIRROR_DELTA) {
            // small deltas are stored directly in the properties word
            return c + delta;
        } else {
            /* look for mirror code point in the mirrors[] table */
            int m;
            int i, length;
            int c2;

            length = indexes[IX_MIRROR_LENGTH];

            /* linear search */
            for (i = 0; i < length; ++i) {
                m = mirrors[i];
                c2 = getMirrorCodePoint(m);
                if (c == c2) {
                    /* found c, return its mirror code point using the index in m */
                    return getMirrorCodePoint(mirrors[getMirrorIndex(m)]);
                } else if (c < c2) {
                    // search stops at the first entry greater than c
                    break;
                }
            }

            /* c not found, return it itself */
            return c;
        }
    }

    /**
     * Returns the bidi mirror of a code point.
     *
     * @param c the code point
     * @return the mirrored code point, or {@code c} if none is recorded
     */
    public final int getMirror(int c) {
        int props = trie.get(c);
        return getMirror(c, props);
    }

    /**
     * Returns the joining type of a code point.
     *
     * @param c the code point
     * @return the joining type (bits 7..5 of the properties word)
     */
    public final int getJoiningType(int c) {
        return (trie.get(c) & JT_MASK) >> JT_SHIFT;
    }

    /**
     * Returns the joining group of a code point.
     *
     * @param c the code point
     * @return the joining group from whichever of the two stored ranges
     *         contains {@code c}, otherwise
     *         {@code UCharacter.JoiningGroup.NO_JOINING_GROUP}
     */
    public final int getJoiningGroup(int c) {
        int start, limit;

        start = indexes[IX_JG_START];
        limit = indexes[IX_JG_LIMIT];
        if (start <= c && c < limit) {
            return (int)jgArray[c - start] & 0xff;
        }
        start = indexes[IX_JG_START2];
        limit = indexes[IX_JG_LIMIT2];
        if (start <= c && c < limit) {
            return (int)jgArray2[c - start] & 0xff;
        }
        return UCharacter.JoiningGroup.NO_JOINING_GROUP;
    }

    /**
     * Returns the Bidi_Paired_Bracket_Type of a code point.
     *
     * @param c the code point
     * @return the paired bracket type (bits 9..8 of the properties word)
     */
    public final int getPairedBracketType(int c) {
        return (trie.get(c) & BPT_MASK) >> BPT_SHIFT;
    }

    /**
     * Returns the paired bracket of a code point.
     *
     * @param c the code point
     * @return {@code c} itself when its paired bracket type bits are zero,
     *         otherwise its bidi mirror
     */
    public final int getPairedBracket(int c) {
        int props = trie.get(c);
        if ((props & BPT_MASK) == 0) {
            return c;
        } else {
            return getMirror(c, props);
        }
    }

    // data members -------------------------------------------------------- ***
    private int indexes[];
    private int mirrors[];
    private byte jgArray[];
    private byte jgArray2[];

    private Trie2_16 trie;

    // data format constants ----------------------------------------------- ***
    @SuppressWarnings("deprecation")
    private static final String DATA_FILE_NAME =
            "/jdk/internal/icu/impl/data/icudt" +
            VersionInfo.ICU_DATA_VERSION_PATH +
            "/ubidi.icu";

    /* format "BiDi" */
    private static final int FMT=0x42694469;

    /* indexes into indexes[] */
    private static final int IX_TRIE_SIZE=2;
    private static final int IX_MIRROR_LENGTH=3;

    private static final int IX_JG_START=4;
    private static final int IX_JG_LIMIT=5;
    private static final int IX_JG_START2=6;  /* new in format version 2.2, ICU 54 */
    private static final int IX_JG_LIMIT2=7;

    private static final int IX_TOP=16;

    // definitions for 16-bit bidi/shaping properties word ----------------- ***

                          /* CLASS_SHIFT=0, */     /* bidi class: 5 bits (4..0) */
    private static final int JT_SHIFT=5;           /* joining type: 3 bits (7..5) */

    private static final int BPT_SHIFT=8;          /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */

    private static final int MIRROR_DELTA_SHIFT=13;        /* bidi mirroring delta: 3 bits (15..13) */

    private static final int CLASS_MASK=    0x0000001f;
    private static final int JT_MASK=       0x000000e0;
    private static final int BPT_MASK=      0x00000300;

    private static final int getClassFromProps(int props) {
        return props & CLASS_MASK;
    }
    private static final boolean getFlagFromProps(int props, int shift) {
        return ((props >> shift) & 1) != 0;
    }
    private static final int getMirrorDeltaFromProps(int props) {
        // narrow to 16 bits first so the shift sign-extends the 3-bit delta
        return (short)props >> MIRROR_DELTA_SHIFT;
    }

    /* delta value meaning "look the mirror up in mirrors[]" */
    private static final int ESC_MIRROR_DELTA=-4;

    // definitions for 32-bit mirror table entry --------------------------- ***

    /* the source Unicode code point takes 21 bits (20..0) */
    private static final int MIRROR_INDEX_SHIFT=21;

    private static final int getMirrorCodePoint(int m) {
        return m & 0x1fffff;
    }
    private static final int getMirrorIndex(int m) {
        return m >>> MIRROR_INDEX_SHIFT;
    }


    /*
     * public singleton instance
     */
    public static final UBiDiProps INSTANCE;

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UBiDiProps();
        } catch (IOException e) {
            // MissingResourceException has no public constructor taking a
            // cause, so attach the IOException explicitly; otherwise its
            // stack trace would be lost.
            MissingResourceException mre =
                    new MissingResourceException(e.getMessage(), DATA_FILE_NAME, "");
            mre.initCause(e);
            throw mre;
        }
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/UCharacterProperty.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* + ******************************************************************************* + * Copyright (C) 1996-2014, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ******************************************************************************* + */ + +package jdk.internal.icu.impl; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Iterator; +import java.util.MissingResourceException; + +import jdk.internal.icu.lang.UCharacter.HangulSyllableType; +import jdk.internal.icu.lang.UCharacter.NumericType; +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.text.UnicodeSet; +import jdk.internal.icu.util.VersionInfo; + +/** +* <p>Internal class used for Unicode character property database.</p> +* <p>This classes store binary data read from uprops.icu. +* It does not have the capability to parse the data into more high-level +* information. It only returns bytes of information when required.</p> +* <p>Due to the form most commonly used for retrieval, array of char is used +* to store the binary data.</p> +* <p>UCharacterPropertyDB also contains information on accessing indexes to +* significant points in the binary data.</p> +* <p>Responsibility for molding the binary data into more meaning form lies on +* <a href=UCharacter.html>UCharacter</a>.</p> +* @author Syn Wee Quek +* @since release 2.1, february 1st 2002 +*/ + +public final class UCharacterProperty +{ + // public data members ----------------------------------------------- + + /* + * public singleton instance + */ + public static final UCharacterProperty INSTANCE; + + /** + * Trie data + */ + public Trie2_16 m_trie_; + + /** + * Unicode version + */ + public VersionInfo m_unicodeVersion_; + + /** + * Character type mask + */ + public static final int TYPE_MASK = 0x1F; + + // uprops.h enum UPropertySource --------------------------------------- *** + + /** From uchar.c/uprops.icu main trie */ + public static final int SRC_CHAR=1; + /** From uchar.c/uprops.icu properties vectors trie */ + public static final int SRC_PROPSVEC=2; + /** From ubidi_props.c/ubidi.icu */ + public static final int SRC_BIDI=5; + /** From 
normalizer2impl.cpp/nfc.nrm */ + public static final int SRC_NFC=8; + /** From normalizer2impl.cpp/nfkc.nrm */ + public static final int SRC_NFKC=9; + + // public methods ---------------------------------------------------- + + /** + * Gets the main property value for code point ch. + * @param ch code point whose property value is to be retrieved + * @return property value of code point + */ + public final int getProperty(int ch) + { + return m_trie_.get(ch); + } + + /** + * Gets the unicode additional properties. + * Java version of C u_getUnicodeProperties(). + * @param codepoint codepoint whose additional properties is to be + * retrieved + * @param column The column index. + * @return unicode properties + */ + public int getAdditional(int codepoint, int column) { + assert column >= 0; + if (column >= m_additionalColumnsCount_) { + return 0; + } + return m_additionalVectors_[m_additionalTrie_.get(codepoint) + column]; + } + + /** + * <p>Get the "age" of the code point.</p> + * <p>The "age" is the Unicode version when the code point was first + * designated (as a non-character or for Private Use) or assigned a + * character.</p> + * <p>This can be useful to avoid emitting code points to receiving + * processes that do not accept newer characters.</p> + * <p>The data is from the UCD file DerivedAge.txt.</p> + * <p>This API does not check the validity of the codepoint.</p> + * @param codepoint The code point. + * @return the Unicode version number + */ + public VersionInfo getAge(int codepoint) + { + int version = getAdditional(codepoint, 0) >> AGE_SHIFT_; + return VersionInfo.getInstance( + (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_, + version & LAST_NIBBLE_MASK_, 0, 0); + } + + // int-value and enumerated properties --------------------------------- *** + + public int getType(int c) { + return getProperty(c)&TYPE_MASK; + } + + /* + * Map some of the Grapheme Cluster Break values to Hangul Syllable Types. 
+ * Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break. + */ + private static final int /* UHangulSyllableType */ gcbToHst[]={ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */ + HangulSyllableType.LEADING_JAMO, /* U_GCB_L */ + HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */ + HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */ + HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */ + HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */ + HangulSyllableType.VOWEL_JAMO /* U_GCB_V */ + /* + * Omit GCB values beyond what we need for hst. + * The code below checks for the array length. + */ + }; + + private class IntProperty { + int column; // SRC_PROPSVEC column, or "source" if mask==0 + int mask; + int shift; + + IntProperty(int column, int mask, int shift) { + this.column=column; + this.mask=mask; + this.shift=shift; + } + + IntProperty(int source) { + this.column=source; + this.mask=0; + } + + int getValue(int c) { + // systematic, directly stored properties + return (getAdditional(c, column)&mask)>>>shift; + } + } + + private class BiDiIntProperty extends IntProperty { + BiDiIntProperty() { + super(SRC_BIDI); + } + } + + private class CombiningClassIntProperty extends IntProperty { + CombiningClassIntProperty(int source) { + super(source); + } + } + + private class NormQuickCheckIntProperty extends IntProperty { // UCHAR_NF*_QUICK_CHECK properties + int which; + int max; + + NormQuickCheckIntProperty(int source, int which, int max) { + super(source); + this.which=which; + this.max=max; + } + } + + private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE + int getValue(int c) { + return UBiDiProps.INSTANCE.getPairedBracketType(c); + } + }; + + public int getIntPropertyValue(int c, int which) { + if (which == BIDI_PAIRED_BRACKET_TYPE) { + return 
intProp.getValue(c); + } + return 0; // undefined + } + + /** + * Forms a supplementary code point from the argument character<br> + * Note this is for internal use hence no checks for the validity of the + * surrogate characters are done + * @param lead lead surrogate character + * @param trail trailing surrogate character + * @return code point of the supplementary character + */ + public static int getRawSupplementary(char lead, char trail) + { + return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; + } + + /** + * Gets the type mask + * @param type character type + * @return mask + */ + public static final int getMask(int type) + { + return 1 << type; + } + + /** + * Returns the digit values of characters like 'A' - 'Z', normal, + * half-width and full-width. This method assumes that the other digit + * characters are checked by the calling method. + * @param ch character to test + * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise + * its corresponding digit will be returned. + */ + public static int getEuropeanDigit(int ch) { + if ((ch > 0x7a && ch < 0xff21) + || ch < 0x41 || (ch > 0x5a && ch < 0x61) + || ch > 0xff5a || (ch > 0xff3a && ch < 0xff41)) { + return -1; + } + if (ch <= 0x7a) { + // ch >= 0x41 or ch < 0x61 + return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); + } + // ch >= 0xff21 + if (ch <= 0xff3a) { + return ch + 10 - 0xff21; + } + // ch >= 0xff41 && ch <= 0xff5a + return ch + 10 - 0xff41; + } + + public int digit(int c) { + int value = getNumericTypeValue(getProperty(c)) - NTV_DECIMAL_START_; + if(value<=9) { + return value; + } else { + return -1; + } + } + + // protected variables ----------------------------------------------- + + /** + * Extra property trie + */ + Trie2_16 m_additionalTrie_; + /** + * Extra property vectors, 1st column for age and second for binary + * properties. 
+ */ + int m_additionalVectors_[]; + /** + * Number of additional columns + */ + int m_additionalColumnsCount_; + /** + * Maximum values for block, bits used as in vector word + * 0 + */ + int m_maxBlockScriptValue_; + /** + * Maximum values for script, bits used as in vector word + * 0 + */ + int m_maxJTGValue_; + /** + * Script_Extensions data + */ + public char[] m_scriptExtensions_; + + // private variables ------------------------------------------------- + + /** + * Default name of the datafile + */ + @SuppressWarnings("deprecation") + private static final String DATA_FILE_NAME_ = + "/jdk/internal/icu/impl/data/icudt" + + VersionInfo.ICU_DATA_VERSION_PATH + + "/uprops.icu"; + + /** + * Shift value for lead surrogate to form a supplementary character. + */ + private static final int LEAD_SURROGATE_SHIFT_ = 10; + /** + * Offset to add to combined surrogate pair to avoid masking. + */ + private static final int SURROGATE_OFFSET_ = + UTF16.SUPPLEMENTARY_MIN_VALUE - + (UTF16.SURROGATE_MIN_VALUE << + LEAD_SURROGATE_SHIFT_) - + UTF16.TRAIL_SURROGATE_MIN_VALUE; + + + // property data constants ------------------------------------------------- + + /** + * Numeric types and values in the main properties words. + */ + private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6; + private static final int getNumericTypeValue(int props) { + return props >> NUMERIC_TYPE_VALUE_SHIFT_; + } + + /* constants for the storage form of numeric types and values */ + /** No numeric value. */ + private static final int NTV_NONE_ = 0; + /** Decimal digits: nv=0..9 */ + private static final int NTV_DECIMAL_START_ = 1; + /** Other digits: nv=0..9 */ + private static final int NTV_DIGIT_START_ = 11; + /** Small integers: nv=0..154 */ + private static final int NTV_NUMERIC_START_ = 21; + + private static final int ntvGetType(int ntv) { + return + (ntv==NTV_NONE_) ? NumericType.NONE : + (ntv<NTV_DIGIT_START_) ? NumericType.DECIMAL : + (ntv<NTV_NUMERIC_START_) ? 
NumericType.DIGIT : + NumericType.NUMERIC; + } + + /* + * Properties in vector word 0 + * Bits + * 31..24 DerivedAge version major/minor one nibble each + * 23..22 3..1: Bits 7..0 = Script_Extensions index + * 3: Script value from Script_Extensions + * 2: Script=Inherited + * 1: Script=Common + * 0: Script=bits 7..0 + * 21..20 reserved + * 19..17 East Asian Width + * 16.. 8 UBlockCode + * 7.. 0 UScriptCode + */ + /** + * Script_Extensions: mask includes Script + */ + public static final int SCRIPT_X_MASK = 0x00c000ff; + //private static final int SCRIPT_X_SHIFT = 22; + /** + * Integer properties mask and shift values for East Asian cell width. + * Equivalent to icu4c UPROPS_EA_MASK + */ + private static final int EAST_ASIAN_MASK_ = 0x000e0000; + /** + * Integer properties mask and shift values for East Asian cell width. + * Equivalent to icu4c UPROPS_EA_SHIFT + */ + private static final int EAST_ASIAN_SHIFT_ = 17; + /** + * Integer properties mask and shift values for blocks. + * Equivalent to icu4c UPROPS_BLOCK_MASK + */ + private static final int BLOCK_MASK_ = 0x0001ff00; + /** + * Integer properties mask and shift values for blocks. + * Equivalent to icu4c UPROPS_BLOCK_SHIFT + */ + private static final int BLOCK_SHIFT_ = 8; + /** + * Integer properties mask and shift values for scripts. + * Equivalent to icu4c UPROPS_SHIFT_MASK + */ + public static final int SCRIPT_MASK_ = 0x000000ff; + + /** + * Additional properties used in internal trie data + */ + /* + * Properties in vector word 1 + * Each bit encodes one binary property. + * The following constants represent the bit number, use 1<<UPROPS_XYZ. + * UPROPS_BINARY_1_TOP<=32! + * + * Keep this list of property enums in sync with + * propListNames[] in icu/source/tools/genprops/props2.c! + * + * ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_". 
+ */ + private static final int WHITE_SPACE_PROPERTY_ = 0; + private static final int DASH_PROPERTY_ = 1; + private static final int HYPHEN_PROPERTY_ = 2; + private static final int QUOTATION_MARK_PROPERTY_ = 3; + private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4; + private static final int MATH_PROPERTY_ = 5; + private static final int HEX_DIGIT_PROPERTY_ = 6; + private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7; + private static final int ALPHABETIC_PROPERTY_ = 8; + private static final int IDEOGRAPHIC_PROPERTY_ = 9; + private static final int DIACRITIC_PROPERTY_ = 10; + private static final int EXTENDER_PROPERTY_ = 11; + private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12; + private static final int GRAPHEME_EXTEND_PROPERTY_ = 13; + private static final int GRAPHEME_LINK_PROPERTY_ = 14; + private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15; + private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16; + private static final int RADICAL_PROPERTY_ = 17; + private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18; + private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19; + private static final int DEPRECATED_PROPERTY_ = 20; + private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21; + private static final int XID_START_PROPERTY_ = 22; + private static final int XID_CONTINUE_PROPERTY_ = 23; + private static final int ID_START_PROPERTY_ = 24; + private static final int ID_CONTINUE_PROPERTY_ = 25; + private static final int GRAPHEME_BASE_PROPERTY_ = 26; + private static final int S_TERM_PROPERTY_ = 27; + private static final int VARIATION_SELECTOR_PROPERTY_ = 28; + private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */ + private static final int PATTERN_WHITE_SPACE = 30; + + /* + * Properties in vector word 2 + * Bits + * 31..26 reserved + * 25..20 Line Break + * 19..15 Sentence Break + * 14..10 Word Break + * 9.. 5 Grapheme Cluster Break + * 4.. 
0 Decomposition Type + */ + private static final int LB_MASK = 0x03f00000; + private static final int LB_SHIFT = 20; + + private static final int SB_MASK = 0x000f8000; + private static final int SB_SHIFT = 15; + + private static final int WB_MASK = 0x00007c00; + private static final int WB_SHIFT = 10; + + private static final int GCB_MASK = 0x000003e0; + private static final int GCB_SHIFT = 5; + + /** + * Integer properties mask for decomposition type. + * Equivalent to icu4c UPROPS_DT_MASK. + */ + private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f; + + /** + * First nibble shift + */ + private static final int FIRST_NIBBLE_SHIFT_ = 0x4; + /** + * Second nibble mask + */ + private static final int LAST_NIBBLE_MASK_ = 0xF; + /** + * Age value shift + */ + private static final int AGE_SHIFT_ = 24; + + // private constructors -------------------------------------------------- + + /** + * Constructor + * @exception IOException thrown when data reading fails or data corrupted + */ + private UCharacterProperty() throws IOException + { + // jar access + ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_); + m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable()); + // Read or skip the 16 indexes. 
+ int propertyOffset = bytes.getInt(); + /* exceptionOffset = */ bytes.getInt(); + /* caseOffset = */ bytes.getInt(); + int additionalOffset = bytes.getInt(); + int additionalVectorsOffset = bytes.getInt(); + m_additionalColumnsCount_ = bytes.getInt(); + int scriptExtensionsOffset = bytes.getInt(); + int reservedOffset7 = bytes.getInt(); + /* reservedOffset8 = */ bytes.getInt(); + /* dataTopOffset = */ bytes.getInt(); + m_maxBlockScriptValue_ = bytes.getInt(); + m_maxJTGValue_ = bytes.getInt(); + ICUBinary.skipBytes(bytes, (16 - 12) << 2); + + // read the main properties trie + m_trie_ = Trie2_16.createFromSerialized(bytes); + int expectedTrieLength = (propertyOffset - 16) * 4; + int trieLength = m_trie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for main trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // skip unused intervening data structures + ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4); + + if(m_additionalColumnsCount_ > 0) { + // reads the additional property block + m_additionalTrie_ = Trie2_16.createFromSerialized(bytes); + expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4; + trieLength = m_additionalTrie_.getSerializedLength(); + if(trieLength > expectedTrieLength) { + throw new IOException("uprops.icu: not enough bytes for additional-properties trie"); + } + // skip padding after trie bytes + ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength); + + // additional properties + int size = scriptExtensionsOffset - additionalVectorsOffset; + m_additionalVectors_ = new int[size]; + for (int i = 0; i < size; i ++) { + m_additionalVectors_[i] = bytes.getInt(); + } + } + + // Script_Extensions + int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2; + if(numChars > 0) { + m_scriptExtensions_ = new char[numChars]; + for(int i = 0; i < numChars; ++i) { + 
m_scriptExtensions_[i] = bytes.getChar(); + } + } + } + + private static final class IsAcceptable implements ICUBinary.Authenticate { + // @Override when we switch to Java 6 + public boolean isDataVersionAcceptable(byte version[]) { + return version[0] == 7; + } + } + + private static final int DATA_FORMAT = 0x5550726F; // "UPro" + + public void upropsvec_addPropertyStarts(UnicodeSet set) { + /* add the start code point of each same-value range of the properties vectors trie */ + if(m_additionalColumnsCount_>0) { + /* if m_additionalColumnsCount_==0 then the properties vectors trie may not be there at all */ + Iterator<Trie2.Range> trieIterator = m_additionalTrie_.iterator(); + Trie2.Range range; + while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) { + set.add(range.startCodePoint); + } + } + } + + // This static initializer block must be placed after + // other static member initialization + static { + try { + INSTANCE = new UCharacterProperty(); + } + catch (IOException e) { + throw new MissingResourceException(e.getMessage(),DATA_FILE_NAME_,""); + } + } + + + // Moved from UProperty.java + /** + * Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3). + * Used in UAX #9: Unicode Bidirectional Algorithm + * (http://www.unicode.org/reports/tr9/) + * Returns UCharacter.BidiPairedBracketType values. + * @stable ICU 52 + */ + public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015; + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/impl/UnicodeSetStringSpan.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,1168 @@ +/* + * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + ****************************************************************************** + * + * Copyright (C) 2009-2014, International Business Machines + * Corporation and others. All Rights Reserved. + * + ****************************************************************************** + */ + +package jdk.internal.icu.impl; + +import java.util.ArrayList; + +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.text.UnicodeSet; +import jdk.internal.icu.text.UnicodeSet.SpanCondition; +import jdk.internal.icu.util.OutputInt; + +/* + * Implement span() etc. 
for a set with strings. + * Avoid recursion because of its exponential complexity. + * Instead, try multiple paths at once and track them with an IndexList. + */ +public class UnicodeSetStringSpan { + + /* + * Which span() variant will be used? The object is either built for one variant and used once, + * or built for all and may be used many times. + */ + public static final int WITH_COUNT = 0x40; // spanAndCount() may be called + public static final int FWD = 0x20; + public static final int BACK = 0x10; + // public static final int UTF16 = 8; + public static final int CONTAINED = 2; + public static final int NOT_CONTAINED = 1; + + public static final int ALL = 0x7f; + + public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; + public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; + public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; + public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; + + /** + * Special spanLength short values. (since Java has not unsigned byte type) + * All code points in the string are contained in the parent set. + */ + static final short ALL_CP_CONTAINED = 0xff; + + /** The spanLength is >=0xfe. */ + static final short LONG_SPAN = ALL_CP_CONTAINED - 1; + + /** Set for span(). Same as parent but without strings. */ + private UnicodeSet spanSet; + + /** + * Set for span(not contained). + * Same as spanSet, plus characters that start or end strings. + */ + private UnicodeSet spanNotSet; + + /** The strings of the parent set. */ + private ArrayList<String> strings; + + /** The lengths of span(), spanBack() etc. for each string. */ + private short[] spanLengths; + + /** Maximum lengths of relevant strings. */ + private int maxLength16; + + /** Are there strings that are not fully contained in the code point set? */ + private boolean someRelevant; + + /** Set up for all variants of span()? 
*/ + private boolean all; + + /** Span helper */ + private OffsetList offsets; + + /** + * Constructs for all variants of span(), or only for any one variant. + * Initializes as little as possible, for single use. + */ + public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { + spanSet = new UnicodeSet(0, 0x10ffff); + // TODO: With Java 6, just take the parent set's strings as is, + // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. + // Then iterate via the first() and higher() methods. + // (We do not want to create multiple Iterator objects in each span().) + // See ICU ticket #7454. + strings = setStrings; + all = (which == ALL); + spanSet.retainAll(set); + if (0 != (which & NOT_CONTAINED)) { + // Default to the same sets. + // addToSpanNotSet() will create a separate set if necessary. + spanNotSet = spanSet; + } + offsets = new OffsetList(); + + // Determine if the strings even need to be taken into account at all for span() etc. + // If any string is relevant, then all strings need to be used for + // span(longest match) but only the relevant ones for span(while contained). + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. + int stringsLength = strings.size(); + + int i, spanLength; + someRelevant = false; + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. 
+ someRelevant = true; + } + if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { + maxLength16 = length16; + } + } + if (!someRelevant && (which & WITH_COUNT) == 0) { + return; + } + + // Freeze after checking for the need to use strings at all because freezing + // a set takes some time and memory which are wasted if there are no relevant strings. + if (all) { + spanSet.freeze(); + } + + int spanBackLengthsOffset; + + // Allocate a block of meta data. + int allocSize; + if (all) { + // 2 sets of span lengths + allocSize = stringsLength * (2); + } else { + allocSize = stringsLength; // One set of span lengths. + } + spanLengths = new short[allocSize]; + + if (all) { + // Store span lengths for all span() variants. + spanBackLengthsOffset = stringsLength; + } else { + // Store span lengths for only one span() variant. + spanBackLengthsOffset = 0; + } + + // Set the meta data and spanNotSet and write the UTF-8 strings. + + for (i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + spanLength = spanSet.span(string, SpanCondition.CONTAINED); + if (spanLength < length16) { // Relevant string. + if (true /* 0 != (which & UTF16) */) { + if (0 != (which & CONTAINED)) { + if (0 != (which & FWD)) { + spanLengths[i] = makeSpanLengthByte(spanLength); + } + if (0 != (which & BACK)) { + spanLength = length16 + - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); + spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); + } + } else /* not CONTAINED, not all, but NOT_CONTAINED */{ + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant + // flag. + } + } + if (0 != (which & NOT_CONTAINED)) { + // Add string start and end code points to the spanNotSet so that + // a span(while not contained) stops before any string. 
+ int c; + if (0 != (which & FWD)) { + c = string.codePointAt(0); + addToSpanNotSet(c); + } + if (0 != (which & BACK)) { + c = string.codePointBefore(length16); + addToSpanNotSet(c); + } + } + } else { // Irrelevant string. + if (all) { + spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; + } else { + // All spanXYZLengths pointers contain the same address. + spanLengths[i] = ALL_CP_CONTAINED; + } + } + } + + // Finish. + if (all) { + spanNotSet.freeze(); + } + } + + /** + * Do the strings need to be checked in span() etc.? + * + * @return true if strings need to be checked (call span() here), + * false if not (use a BMPSet for best performance). + */ + public boolean needsStringSpanUTF16() { + return someRelevant; + } + + /** For fast UnicodeSet::contains(c). */ + public boolean contains(int c) { + return spanSet.contains(c); + } + + /** + * Adds a starting or ending string character to the spanNotSet + * so that a character span ends before any string. + */ + private void addToSpanNotSet(int c) { + if (spanNotSet == null || spanNotSet == spanSet) { + if (spanSet.contains(c)) { + return; // Nothing to do. + } + spanNotSet = spanSet.cloneAsThawed(); + } + spanNotSet.add(c); + } + + /* + * Note: In span() when spanLength==0 + * (after a string match, or at the beginning after an empty code point span) + * and in spanNot() and spanNotUTF8(), + * string matching could use a binary search because all string matches are done + * from the same start index. + * + * For UTF-8, this would require a comparison function that returns UTF-16 order. + * + * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets + * with strings have very few very short strings. For cases with many strings, it might be better to use a different + * API and implementation with a DFA (state machine). 
+ */ + + /* + * Algorithm for span(SpanCondition.CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Either recursively span for each code point or string match, or recursively span + * for all but the shortest one and iteratively continue the span with the shortest local match. + * + Remember the longest recursive span (the farthest end point). + * + If there is no match at the current position, + * neither for the code point there nor for any set string, + * then stop and return the longest recursive span length. + * + * Optimized implementation: + * + * (We assume that most sets will have very few very short strings. + * A span using a string-less set is extremely fast.) + * + * Create and cache a spanSet which contains all of the single code points of the original set + * but none of its strings. + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the recursive algorithm would have tried to match them at every position. + * ~ Set strings that entirely consist of set-contained code points + * are irrelevant for span(SpanCondition.CONTAINED) + * because the recursive algorithm would continue after them anyway and + * find the longest recursive match from their end. + * ~ Rather than recursing, note each end point of a set string match. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched after spanSet.span(), + * then pop the shortest string match end point and continue the loop, + * trying to match all set strings from there. 
+ * + If at least one more set string matched after a previous string match, then test if the + * code point after the previous string match is also contained in the set. + * Continue the loop with the shortest end point of + * either this code point or a matching set string. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + * + * By noting each end point of a set string match, the function visits each string position at most once and + * finishes in linear time. + * + * The recursive algorithm may visit the same string position many times + * if multiple paths lead to it and finishes in exponential time. + */ + + /* + * Algorithm for span(SIMPLE) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then remember to continue after it. + * + If a set string matches at the current position, then remember to continue after it. + * + Continue from the farthest match position and ignore all others. + * + If there is no match at the current position, then stop and return the current position. + * + * Optimized implementation: + * + * (Same assumption and spanSet as above.) + * + * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). + * - Loop: + * + Try to match each set string at the end of the spanLength. + * ~ Set strings that start with set-contained code points + * must be matched with a partial overlap + * because the standard algorithm would have tried to match them earlier. + * ~ Set strings that entirely consist of set-contained code points + * must be matched with a full overlap because the longest-match algorithm + * would hide set string matches that end earlier. 
+ * Such set strings need not be matched earlier inside the code point span + * because the standard algorithm would then have + * continued after the set string match anyway. + * ~ Remember the longest set string match (farthest end point) + * from the earliest starting point. + * + If no set string matched after spanSet.span(), + * then return with where the spanSet.span() ended. + * + If at least one set string matched, + * then continue the loop after the longest match from the earliest position. + * + If no more set string matched after a previous string match, + * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). + * Stop if spanLength==0, otherwise continue the loop. + */ + /** + * Spans a string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @return the limit (exclusive end) of the span + */ + public int span(CharSequence s, int start, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, null); + } + int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); + if (spanLimit == s.length()) { + return spanLimit; + } + return spanWithStrings(s, start, spanLimit, spanCondition); + } + + /** + * Synchronized method for complicated spans using the offsets. + * Avoids synchronization for simple cases. + * + * @param spanLimit = spanSet.span(s, start, CONTAINED) + */ + private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, + SpanCondition spanCondition) { + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. 
+ initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int length = s.length(); + int pos = spanLimit, rest = length - spanLimit; + int spanLength = spanLimit - start; + int i, stringsLength = strings.size(); + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. + overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code + // point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. + for (;;) { + if (inc > rest) { + break; + } + // Try to match if the increment is not listed already. + if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { + if (inc == rest) { + return length; // Reached the end of the string. + } + offsets.addOffset(inc); + } + if (overlap == 0) { + break; + } + --overlap; + ++inc; + } + } + } else /* SIMPLE */{ + int maxInc = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the earliest start. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-overlap..pos. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the earliest start. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int inc = length16 - overlap; // Keep overlap+inc==length16. 
+ for (;;) { + if (inc > rest || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or starts earlier. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) + && matches16CPB(s, pos - overlap, length, string, length16)) { + maxInc = inc; // Longest match from earliest start. + maxOverlap = overlap; + break; + } + --overlap; + ++inc; + } + } + + if (maxInc != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue after it. + pos += maxInc; + rest -= maxInc; + if (rest == 0) { + return length; // Reached the end of the string. + } + spanLength = 0; // Match strings from after a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == 0) { + // The position is after an unlimited code point span (spanLength!=0), + // not after a string match. + // The only position where spanLength==0 after a span is pos==0. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched after a span. + } + // Match strings from after the next string match. + } else { + // The position is after a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched after a previous string match. + // Try another code point span from after the last string match. + spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); + spanLength = spanLimit - pos; + if (spanLength == rest || // Reached the end of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return spanLimit; + } + pos += spanLength; + rest -= spanLength; + continue; // spanLength>0: Match strings from after a span. 
+ } else { + // Try to match only one code point from after a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOne(spanSet, s, pos, rest); + if (spanLength > 0) { + if (spanLength == rest) { + return length; // Reached the end of the string. + } + // Match strings after this code point. + // There cannot be any increments below it because UnicodeSet strings + // contain multiple code points. + pos += spanLength; + rest -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from after a single code point. + } + // Match strings from after the next string match. + } + } + int minOffset = offsets.popMinimum(null); + pos += minOffset; + rest -= minOffset; + spanLength = 0; // Match strings from after a string match. + } + } + + /** + * Spans a string and counts the smallest number of set elements on any path across the span. + * + * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. + * + * <p>If the set does not have any fully-contained strings, then we could optimize this + * like span(), but such sets are likely rare, and this is at least still linear. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param spanCondition The span condition + * @param outCount The count + * @return the limit (exclusive end) of the span + */ + public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, + OutputInt outCount) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNot(s, start, outCount); + } + // Consider strings; they may overlap with the span, + // and they may result in a smaller count that with just code points. 
+ if (spanCondition == SpanCondition.CONTAINED) { + return spanContainedAndCount(s, start, outCount); + } + // SIMPLE (not synchronized, does not use offsets) + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + int maxInc = (cpLength > 0) ? cpLength : 0; + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + if (maxInc < length16 && length16 <= rest && + matches16CPB(s, pos, length, string, length16)) { + maxInc = length16; + } + } + // We are done if there is no match beyond pos. + if (maxInc == 0) { + outCount.value = count; + return pos; + } + // Continue from the longest match. + ++count; + pos += maxInc; + rest -= maxInc; + } + outCount.value = count; + return pos; + } + + private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { + // Use offset list to try all possibilities. + offsets.setMaxLength(maxLength16); + int stringsLength = strings.size(); + int length = s.length(); + int pos = start; + int rest = length - start; + int count = 0; + while (rest != 0) { + // Try to match the next code point. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + offsets.addOffsetAndCount(cpLength, count + 1); + } + // Try to match all of the strings. + for (int i = 0; i < stringsLength; ++i) { + String string = strings.get(i); + int length16 = string.length(); + // Note: If the strings were sorted by length, then we could also + // avoid trying to match if there is already a match of the same length. + if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && + matches16CPB(s, pos, length, string, length16)) { + offsets.addOffsetAndCount(length16, count + 1); + } + } + // We are done if there is no match beyond pos. 
+ if (offsets.isEmpty()) { + outCount.value = count; + return pos; + } + // Continue from the nearest match. + int minOffset = offsets.popMinimum(outCount); + count = outCount.value; + pos += minOffset; + rest -= minOffset; + } + outCount.value = count; + return pos; + } + + /** + * Span a string backwards. + * + * @param s The string to be spanned + * @param spanCondition The span condition + * @return The string index which starts the span (i.e. inclusive). + */ + public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { + if (spanCondition == SpanCondition.NOT_CONTAINED) { + return spanNotBack(s, length); + } + int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); + if (pos == 0) { + return 0; + } + int spanLength = length - pos; + + // Consider strings; they may overlap with the span. + int initSize = 0; + if (spanCondition == SpanCondition.CONTAINED) { + // Use offset list to try all possibilities. + initSize = maxLength16; + } + offsets.setMaxLength(initSize); + int i, stringsLength = strings.size(); + int spanBackLengthsOffset = 0; + if (all) { + spanBackLengthsOffset = stringsLength; + } + for (;;) { + if (spanCondition == SpanCondition.CONTAINED) { + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + if (overlap == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // While contained: No point matching fully inside the code point span. + int len1 = 0; + len1 = string.offsetByCodePoints(0, 1); + overlap -= len1; // Length of the string minus the first code point. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. 
+ for (;;) { + if (dec > pos) { + break; + } + // Try to match if the decrement is not listed already. + if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { + if (dec == pos) { + return 0; // Reached the start of the string. + } + offsets.addOffset(dec); + } + if (overlap == 0) { + break; + } + --overlap; + ++dec; + } + } + } else /* SIMPLE */{ + int maxDec = 0, maxOverlap = 0; + for (i = 0; i < stringsLength; ++i) { + int overlap = spanLengths[spanBackLengthsOffset + i]; + // For longest match, we do need to try to match even an all-contained string + // to find the match from the latest end. + + String string = strings.get(i); + + int length16 = string.length(); + + // Try to match this string at pos-(length16-overlap)..pos-length16. + if (overlap >= LONG_SPAN) { + overlap = length16; + // Longest match: Need to match fully inside the code point span + // to find the match from the latest end. + } + if (overlap > spanLength) { + overlap = spanLength; + } + int dec = length16 - overlap; // Keep dec+overlap==length16. + for (;;) { + if (dec > pos || overlap < maxOverlap) { + break; + } + // Try to match if the string is longer or ends later. + if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) + && matches16CPB(s, pos - dec, length, string, length16)) { + maxDec = dec; // Longest match from latest end. + maxOverlap = overlap; + break; + } + --overlap; + ++dec; + } + } + + if (maxDec != 0 || maxOverlap != 0) { + // Longest-match algorithm, and there was a string match. + // Simply continue before it. + pos -= maxDec; + if (pos == 0) { + return 0; // Reached the start of the string. + } + spanLength = 0; // Match strings from before a string match. + continue; + } + } + // Finished trying to match all strings at pos. + + if (spanLength != 0 || pos == length) { + // The position is before an unlimited code point span (spanLength!=0), + // not before a string match. 
+ // The only position where spanLength==0 before a span is pos==length. + // Otherwise, an unlimited code point span is only tried again when no + // strings match, and if such a non-initial span fails we stop. + if (offsets.isEmpty()) { + return pos; // No strings matched before a span. + } + // Match strings from before the next string match. + } else { + // The position is before a string match (or a single code point). + if (offsets.isEmpty()) { + // No more strings matched before a previous string match. + // Try another code point span from before the last string match. + int oldPos = pos; + pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); + spanLength = oldPos - pos; + if (pos == 0 || // Reached the start of the string, or + spanLength == 0 // neither strings nor span progressed. + ) { + return pos; + } + continue; // spanLength>0: Match strings from before a span. + } else { + // Try to match only one code point from before a string match if some + // string matched beyond it, so that we try all possible positions + // and don't overshoot. + spanLength = spanOneBack(spanSet, s, pos); + if (spanLength > 0) { + if (spanLength == pos) { + return 0; // Reached the start of the string. + } + // Match strings before this code point. + // There cannot be any decrements below it because UnicodeSet strings + // contain multiple code points. + pos -= spanLength; + offsets.shift(spanLength); + spanLength = 0; + continue; // Match strings from before a single code point. + } + // Match strings from before the next string match. + } + } + pos -= offsets.popMinimum(null); + spanLength = 0; // Match strings from before a string match. + } + } + + /** + * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) + * + * Theoretical algorithm: + * - Iterate through the string, and at each code point boundary: + * + If the code point there is in the set, then return with the current position. 
+ * + If a set string matches at the current position, then return with the current position. + * + * Optimized implementation: + * + * (Same assumption as for span() above.) + * + * Create and cache a spanNotSet which contains + * all of the single code points of the original set but none of its strings. + * For each set string add its initial code point to the spanNotSet. + * (Also add its final code point for spanNotBack().) + * + * - Loop: + * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). + * + If the current code point is in the original set, then return the current position. + * + If any set string matches at the current position, then return the current position. + * + If there is no match at the current position, neither for the code point + * there nor for any set string, then skip this code point and continue the loop. + * This happens for set-string-initial code points that were added to spanNotSet + * when there is not actually a match for such a set string. + * + * @param s The string to be spanned + * @param start The start index that the span begins + * @param outCount If not null: Receives the number of code points across the span. + * @return the limit (exclusive end) of the span + */ + private int spanNot(CharSequence s, int start, OutputInt outCount) { + int length = s.length(); + int pos = start, rest = length - start; + int stringsLength = strings.size(); + int count = 0; + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + int spanLimit; + if (outCount == null) { + spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); + } else { + spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); + outCount.value = count = count + outCount.value; + } + if (spanLimit == length) { + return length; // Reached the end of the string. 
+ } + pos = spanLimit; + rest = length - spanLimit; + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOne(spanSet, s, pos, rest); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (int i = 0; i < stringsLength; ++i) { + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. + } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos -= cpLength; + rest += cpLength; + ++count; + } while (rest != 0); + if (outCount != null) { + outCount.value = count; + } + return length; // Reached the end of the string. + } + + private int spanNotBack(CharSequence s, int length) { + int pos = length; + int i, stringsLength = strings.size(); + do { + // Span until we find a code point from the set, + // or a code point that starts or ends some string. + pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); + if (pos == 0) { + return 0; // Reached the start of the string. + } + + // Check whether the current code point is in the original set, + // without the string starts and ends. + int cpLength = spanOneBack(spanSet, s, pos); + if (cpLength > 0) { + return pos; // There is a set element at pos. + } + + // Try to match the strings at pos. + for (i = 0; i < stringsLength; ++i) { + // Use spanLengths rather than a spanLengths pointer because + // it is easier and we only need to know whether the string is irrelevant + // which is the same in either array. + if (spanLengths[i] == ALL_CP_CONTAINED) { + continue; // Irrelevant string. 
+ } + String string = strings.get(i); + + int length16 = string.length(); + if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { + return pos; // There is a set element at pos. + } + } + + // The span(while not contained) ended on a string start/end which is + // not in the original set. Skip this code point and continue. + // cpLength<0 + pos += cpLength; + } while (pos != 0); + return 0; // Reached the start of the string. + } + + static short makeSpanLengthByte(int spanLength) { + // 0xfe==UnicodeSetStringSpan::LONG_SPAN + return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; + } + + // Compare strings without any argument checks. Requires length>0. + private static boolean matches16(CharSequence s, int start, final String t, int length) { + int end = start + length; + while (length-- > 0) { + if (s.charAt(--end) != t.charAt(length)) { + return false; + } + } + return true; + } + + /** + * Compare 16-bit Unicode strings (which may be malformed UTF-16) + * at code point boundaries. + * That is, each edge of a match must not be in the middle of a surrogate pair. + * @param s The string to match in. + * @param start The start index of s. + * @param limit The limit of the subsequence of s being spanned. + * @param t The substring to be matched in s. + * @param tlength The length of t. + */ + static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { + return matches16(s, start, t, tlength) + && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && + Character.isLowSurrogate(s.charAt(start))) + && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && + Character.isLowSurrogate(s.charAt(start + tlength))); + } + + /** + * Does the set contain the next code point? + * If so, return its length; otherwise return its negative length. 
+ */ + static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { + char c = s.charAt(start); + if (c >= 0xd800 && c <= 0xdbff && length >= 2) { + char c2 = s.charAt(start + 1); + if (UTF16.isTrailSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c, c2); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 1 : -1; + } + + static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { + char c = s.charAt(length - 1); + if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { + char c2 = s.charAt(length - 2); + if (UTF16.isLeadSurrogate(c2)) { + int supplementary = UCharacterProperty.getRawSupplementary(c2, c); + return set.contains(supplementary) ? 2 : -2; + } + } + return set.contains(c) ? 1 : -1; + } + + /** + * Helper class for UnicodeSetStringSpan. + * + * <p>List of offsets from the current position from where to try matching + * a code point or a string. + * Stores offsets rather than indexes to simplify the code and use the same list + * for both increments (in span()) and decrements (in spanBack()). + * + * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time + * are relatively dense, that is, + * there are normally no gaps of hundreds or thousands of offset values. + * + * <p>This class optionally also tracks the minimum non-negative count for each position, + * intended to count the smallest number of elements of any path leading to that position. + * + * <p>The implementation uses a circular buffer of count integers, + * each indicating whether the corresponding offset is in the list, + * and its path element count. + * This avoids inserting into a sorted list of offsets (or absolute indexes) + * and physically moving part of the list. + * + * <p>Note: In principle, the caller should setMaxLength() to + * the maximum of the max string length and U16_LENGTH/U8_LENGTH + * to account for "long" single code points. 
+ * + * <p>Note: An earlier version did not track counts and stored only byte flags. + * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, + * the list could be stored as bit flags in a single integer. + * Rather than handling a circular buffer with a start list index, + * the integer would simply be shifted when lower offsets are removed. + * UnicodeSet does not have a limit on the lengths of strings. + */ + private static final class OffsetList { + private int[] list; + private int length; + private int start; + + public OffsetList() { + list = new int[16]; // default size + } + + public void setMaxLength(int maxLength) { + if (maxLength > list.length) { + list = new int[maxLength]; + } + clear(); + } + + public void clear() { + for (int i = list.length; i-- > 0;) { + list[i] = 0; + } + start = length = 0; + } + + public boolean isEmpty() { + return (length == 0); + } + + /** + * Reduces all stored offsets by delta, used when the current position moves by delta. + * There must not be any offsets lower than delta. + * If there is an offset equal to delta, it is removed. + * + * @param delta [1..maxLength] + */ + public void shift(int delta) { + int i = start + delta; + if (i >= list.length) { + i -= list.length; + } + if (list[i] != 0) { + list[i] = 0; + --length; + } + start = i; + } + + /** + * Adds an offset. The list must not contain it yet. + * @param offset [1..maxLength] + */ + public void addOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + assert list[i] == 0; + list[i] = 1; + ++length; + } + + /** + * Adds an offset and updates its count. + * The list may already contain the offset. 
+ * @param offset [1..maxLength] + */ + public void addOffsetAndCount(int offset, int count) { + assert count > 0; + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + if (list[i] == 0) { + list[i] = count; + ++length; + } else if (count < list[i]) { + list[i] = count; + } + } + + /** + * @param offset [1..maxLength] + */ + public boolean containsOffset(int offset) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + return list[i] != 0; + } + + /** + * @param offset [1..maxLength] + */ + public boolean hasCountAtOffset(int offset, int count) { + int i = start + offset; + if (i >= list.length) { + i -= list.length; + } + int oldCount = list[i]; + return oldCount != 0 && oldCount <= count; + } + + /** + * Finds the lowest stored offset from a non-empty list, removes it, + * and reduces all other offsets by this minimum. + * @return min=[1..maxLength] + */ + public int popMinimum(OutputInt outCount) { + // Look for the next offset in list[start+1..list.length-1]. + int i = start, result; + while (++i < list.length) { + int count = list[i]; + if (count != 0) { + list[i] = 0; + --length; + result = i - start; + start = i; + if (outCount != null) { outCount.value = count; } + return result; + } + } + // i==list.length + + // Wrap around and look for the next offset in list[0..start]. + // Since the list is not empty, there will be one. + result = list.length - start; + i = 0; + int count; + while ((count = list[i]) == 0) { + ++i; + } + list[i] = 0; + --length; + start = i; + if (outCount != null) { outCount.value = count; } + return result + i; + } + } +}
/*
 * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * Copyright (C) 1996-2011, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package jdk.internal.icu.impl;

import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.text.UTF16;

import java.io.IOException;
import java.util.Locale;

/**
 * Static helpers for escaping and unescaping text and for producing
 * zero-padded hexadecimal strings.
 */
public final class Utility {

    /**
     * Convert characters outside the range U+0020 to U+007F to
     * Unicode escapes, and convert backslash to a double backslash.
     *
     * @param s the string to escape
     * @return the escaped string; code points up to U+FFFF are written with
     *         a lowercase-u escape and four hex digits, higher code points
     *         with an uppercase-U escape and eight hex digits
     */
    public static final String escape(String s) {
        StringBuilder buf = new StringBuilder();
        for (int i=0; i<s.length(); ) {
            int c = Character.codePointAt(s, i);
            i += UTF16.getCharCount(c);
            if (c >= ' ' && c <= 0x007F) {
                if (c == '\\') {
                    buf.append("\\\\"); // That is, "\\"
                } else {
                    buf.append((char)c);
                }
            } else {
                boolean four = c <= 0xFFFF;
                buf.append(four ? "\\u" : "\\U");
                buf.append(hex(c, four ? 4 : 8));
            }
        }
        return buf.toString();
    }

    /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
    private static final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };

    /**
     * Convert an escape to a 32-bit code point value.  We attempt
     * to parallel the icu4c unescapeAt() function.
     *
     * <p>Recognized forms are the 4-digit and 8-digit hexadecimal Unicode
     * escapes, {@code x} followed by one or two hex digits or by a braced
     * group of up to eight hex digits, octal escapes of one to three digits,
     * the C-style letters listed in {@code UNESCAPE_MAP}, and {@code cX}
     * (control-X).  Any other character is returned unchanged.
     *
     * @param s the string containing the escape sequence
     * @param offset16 an array containing offset to the character
     * <em>after</em> the backslash.  Upon return offset16[0] will
     * be updated to point after the escape sequence.  It is left
     * untouched when -1 is returned.
     * @return character value from 0 to 10FFFF, or -1 on error.
     */
    public static int unescapeAt(String s, int[] offset16) {
        int c;
        int result = 0;
        int n = 0;
        int minDig = 0;
        int maxDig = 0;
        int bitsPerDigit = 4;
        int dig;
        int i;
        boolean braces = false;

        /* Check that offset is in range */
        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
        offset += UTF16.getCharCount(c);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
        case 'u':
            minDig = maxDig = 4;
            break;
        case 'U':
            minDig = maxDig = 8;
            break;
        case 'x':
            minDig = 1;
            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
                ++offset;
                braces = true;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = UCharacter.digit(c, 8);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
        }
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
                c = UTF16.charAt(s, offset);
                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
                offset += UTF16.getCharCount(c);
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
                if (c != 0x7D /*}*/) {
                    return -1;
                }
                ++offset;
            }
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
            // If an escape sequence specifies a lead surrogate, see
            // if there is a trail surrogate after it, either as an
            // escape or as a literal.  If so, join them up into a
            // supplementary.
            // The value must be range-checked before the (char) cast:
            // a supplementary code point such as U+1D834 would otherwise be
            // truncated to 0xD834 and mistaken for a lead surrogate.
            if (offset < length && result <= 0xFFFF &&
                    UTF16.isLeadSurrogate((char) result)) {
                int ahead = offset+1;
                c = s.charAt(offset); // [sic] get 16-bit code unit
                if (c == '\\' && ahead < length) {
                    int o[] = new int[] { ahead };
                    c = unescapeAt(s, o);
                    ahead = o[0];
                }
                // Same truncation hazard: the nested call may return -1 or
                // a supplementary code point, neither of which is a trail
                // surrogate.
                if (c >= 0 && c <= 0xFFFF &&
                        UTF16.isTrailSurrogate((char) c)) {
                    offset = ahead;
                    result = UCharacterProperty.getRawSupplementary(
                                  (char) result, (char) c);
                }
            }
            offset16[0] = offset;
            return result;
        }

        /* Convert C-style escapes in table */
        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
                offset16[0] = offset;
                return UNESCAPE_MAP[i+1];
            } else if (c < UNESCAPE_MAP[i]) {
                break;
            }
        }

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
            offset16[0] = offset + UTF16.getCharCount(c);
            return 0x1F & c;
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
        offset16[0] = offset;
        return c;
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     *
     * @param i the value to format; negative values are rendered as a
     *          leading '-' followed by the magnitude
     * @param places the minimum number of hex digits; shorter results are
     *          left-padded with '0'.  Any width is accepted, including
     *          widths greater than 16.
     * @return the upper-case hexadecimal string
     */
    public static String hex(long i, int places) {
        if (i == Long.MIN_VALUE) return "-8000000000000000";
        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
        int padding = places - result.length();
        if (padding > 0) {
            // Build the padding explicitly instead of slicing a fixed
            // 16-character zero string, which failed for places > 16.
            StringBuilder padded = new StringBuilder(places);
            for (int k = 0; k < padding; ++k) {
                padded.append('0');
            }
            result = padded.append(result).toString();
        }
        if (negative) {
            return '-' + result;
        }
        return result;
    }

    static final char DIGITS[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /**
     * Return true if the character is NOT printable ASCII, that is,
     * outside the range U+0020 to U+007E.  Control characters such as
     * tab, line feed and carriage return are therefore unprintable.
     *
     * @param c the code point to test
     * @return true if c is outside U+0020..U+007E
     */
    public static boolean isUnprintable(int c) {
        //0x20 = 32 and 0x7E = 126
        return !(c >= 0x20 && c <= 0x7E);
    }

    /**
     * Escape unprintable characters using &lt;backslash&gt;uxxxx notation
     * for U+0000 to U+FFFF and &lt;backslash&gt;Uxxxxxxxx for U+10000 and
     * above.  If the character is printable ASCII, then do nothing
     * and return FALSE.  Otherwise, append the escaped notation and
     * return TRUE.
     *
     * @param result the destination to append the escape to
     * @param c the code point to escape
     * @return true if an escape was appended, false if c is printable ASCII
     * @throws IllegalArgumentException if appending to {@code result}
     *         throws an {@link IOException} (kept as the cause)
     */
    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
        try {
            if (isUnprintable(c)) {
                result.append('\\');
                if ((c & ~0xFFFF) != 0) {
                    result.append('U');
                    result.append(DIGITS[0xF&(c>>28)]);
                    result.append(DIGITS[0xF&(c>>24)]);
                    result.append(DIGITS[0xF&(c>>20)]);
                    result.append(DIGITS[0xF&(c>>16)]);
                } else {
                    result.append('u');
                }
                result.append(DIGITS[0xF&(c>>12)]);
                result.append(DIGITS[0xF&(c>>8)]);
                result.append(DIGITS[0xF&(c>>4)]);
                result.append(DIGITS[0xF&c]);
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/lang/UCharacter.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,545 @@ +/* + * Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** +******************************************************************************* +* Copyright (C) 1996-2014, International Business Machines Corporation and +* others. All Rights Reserved. 
+******************************************************************************* +*/ + +package jdk.internal.icu.lang; + +import jdk.internal.icu.impl.UBiDiProps; +import jdk.internal.icu.impl.UCharacterProperty; +import jdk.internal.icu.text.Normalizer2; +import jdk.internal.icu.text.UTF16; +import jdk.internal.icu.util.VersionInfo; + +/** + * <p>The UCharacter class provides extensions to the + * <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html"> + * java.lang.Character</a> class. These extensions provide support for + * more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a> + * class, provide support for supplementary characters (those with code + * points above U+FFFF). + * Each ICU release supports the latest version of Unicode available at that time. + * + * <p>Code points are represented in these API using ints. While it would be + * more convenient in Java to have a separate primitive datatype for them, + * ints suffice in the meantime. + * + * <p>To use this class please add the jar file name icu4j.jar to the + * class path, since it contains data files which supply the information used + * by this file.<br> + * E.g. In Windows <br> + * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br> + * Otherwise, another method would be to copy the files uprops.dat and + * unames.icu from the icu4j source subdirectory + * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory + * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. + * + * <p>Aside from the additions for UTF-16 support, and the updated Unicode + * properties, the main differences between UCharacter and Character are: + * <ul> + * <li> UCharacter is not designed to be a char wrapper and does not have + * APIs to which involves management of that single char.<br> + * These include: + * <ul> + * <li> char charValue(), + * <li> int compareTo(java.lang.Character, java.lang.Character), etc. 
+ * </ul> + * <li> UCharacter does not include Character APIs that are deprecated, nor + * does it include the Java-specific character information, such as + * boolean isJavaIdentifierPart(char ch). + * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric + * values '10' - '35'. UCharacter also does this in digit and + * getNumericValue, to adhere to the java semantics of these + * methods. New methods unicodeDigit, and + * getUnicodeNumericValue do not treat the above code points + * as having numeric values. This is a semantic change from ICU4J 1.3.1. + * </ul> + * <p> + * Further detail on differences can be determined using the program + * <a href= + * "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java"> + * com.ibm.icu.dev.test.lang.UCharacterCompare</a> + * </p> + * <p> + * In addition to Java compatibility functions, which calculate derived properties, + * this API provides low-level access to the Unicode Character Database. + * </p> + * <p> + * Unicode assigns each code point (not just assigned character) values for + * many properties. + * Most of them are simple boolean flags, or constants from a small enumerated list. + * For some properties, values are strings or other relatively more complex types. + * </p> + * <p> + * For more information see + * <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a> + * (http://www.unicode.org/ucd/) + * and the <a href="http://www.icu-project.org/userguide/properties.html">ICU + * User Guide chapter on Properties</a> + * (http://www.icu-project.org/userguide/properties.html). + * </p> + * <p> + * There are also functions that provide easy migration from C/POSIX functions + * like isblank(). Their use is generally discouraged because the C/POSIX + * standards do not define their semantics beyond the ASCII range, which means + * that different implementations exhibit very different behavior. 
+ * Instead, Unicode properties should be used directly. + * </p> + * <p> + * There are also only a few, broad C/POSIX character classes, and they tend + * to be used for conflicting purposes. For example, the "isalpha()" class + * is sometimes used to determine word boundaries, while a more sophisticated + * approach would at least distinguish initial letters from continuation + * characters (the latter including combining marks). + * (In ICU, BreakIterator is the most sophisticated API for word boundaries.) + * Another example: There is no "istitle()" class for titlecase characters. + * </p> + * <p> + * ICU 3.4 and later provides API access for all twelve C/POSIX character classes. + * ICU implements them according to the Standard Recommendations in + * Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions + * (http://www.unicode.org/reports/tr18/#Compatibility_Properties). + * </p> + * <p> + * API access for C/POSIX character classes is as follows: + * <pre>{@code + * - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC) + * - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE) + * - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE) + * - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)| + * (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)| + * (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0 + * - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER + * - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT) + * - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM) + * - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE) + * - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK) + * - cntrl: getType(c)==CONTROL + * - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH) + * - print: hasBinaryProperty(c, UProperty.POSIX_PRINT) + * }</pre> + * </p> + * <p> + * The C/POSIX character classes are also available in UnicodeSet patterns, 
+ * using patterns like [:graph:] or \p{graph}. + * </p> + * + * There are several ICU (and Java) whitespace functions. + * Comparison:<ul> + * <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property; + * most of general categories "Z" (separators) + most whitespace ISO controls + * (including no-break spaces, but excluding IS1..IS4 and ZWSP) + * <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces + * <li> isSpaceChar: just Z (including no-break spaces)</ul> + * </p> + * <p> + * This class is not subclassable. + * </p> + * @author Syn Wee Quek + * @stable ICU 2.1 + * @see com.ibm.icu.lang.UCharacterEnums + */ + +public final class UCharacter +{ + + /** + * Joining Group constants. + * @see UProperty#JOINING_GROUP + * @stable ICU 2.4 + */ + public static interface JoiningGroup + { + /** + * @stable ICU 2.4 + */ + public static final int NO_JOINING_GROUP = 0; + } + + /** + * Numeric Type constants. + * @see UProperty#NUMERIC_TYPE + * @stable ICU 2.4 + */ + public static interface NumericType + { + /** + * @stable ICU 2.4 + */ + public static final int NONE = 0; + /** + * @stable ICU 2.4 + */ + public static final int DECIMAL = 1; + /** + * @stable ICU 2.4 + */ + public static final int DIGIT = 2; + /** + * @stable ICU 2.4 + */ + public static final int NUMERIC = 3; + /** + * @stable ICU 2.4 + */ + public static final int COUNT = 4; + } + + /** + * Hangul Syllable Type constants. 
+ * + * @see UProperty#HANGUL_SYLLABLE_TYPE + * @stable ICU 2.6 + */ + public static interface HangulSyllableType + { + /** + * @stable ICU 2.6 + */ + public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/ + /** + * @stable ICU 2.6 + */ + public static final int LEADING_JAMO = 1; /*[L]*/ + /** + * @stable ICU 2.6 + */ + public static final int VOWEL_JAMO = 2; /*[V]*/ + /** + * @stable ICU 2.6 + */ + public static final int TRAILING_JAMO = 3; /*[T]*/ + /** + * @stable ICU 2.6 + */ + public static final int LV_SYLLABLE = 4; /*[LV]*/ + /** + * @stable ICU 2.6 + */ + public static final int LVT_SYLLABLE = 5; /*[LVT]*/ + /** + * @stable ICU 2.6 + */ + public static final int COUNT = 6; + } + + // public data members ----------------------------------------------- + + /** + * The lowest Unicode code point value. + * @stable ICU 2.1 + */ + public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE; + + /** + * The highest Unicode code point value (scalar value) according to the + * Unicode Standard. + * This is a 21-bit value (21 bits, rounded up).<br> + * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE + * @stable ICU 2.1 + */ + public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE; + + // public methods ---------------------------------------------------- + + /** + * Returns the numeric value of a decimal digit code point. + * <br>This method observes the semantics of + * <code>java.lang.Character.digit()</code>. Note that this + * will return positive values for code points for which isDigit + * returns false, just like java.lang.Character. + * <br><em>Semantic Change:</em> In release 1.3.1 and + * prior, this did not treat the European letters as having a + * digit value, and also treated numeric letters and other numbers as + * digits. + * This has been changed to conform to the java semantics. 
+ * <br>A code point is a valid digit if and only if: + * <ul> + * <li>ch is a decimal digit or one of the european letters, and + * <li>the value of ch is less than the specified radix. + * </ul> + * @param ch the code point to query + * @param radix the radix + * @return the numeric value represented by the code point in the + * specified radix, or -1 if the code point is not a decimal digit + * or if its value is too large for the radix + * @stable ICU 2.1 + */ + public static int digit(int ch, int radix) + { + if (2 <= radix && radix <= 36) { + int value = digit(ch); + if (value < 0) { + // ch is not a decimal digit, try latin letters + value = UCharacterProperty.getEuropeanDigit(ch); + } + return (value < radix) ? value : -1; + } else { + return -1; // invalid radix + } + } + + /** + * Returns the numeric value of a decimal digit code point. + * <br>This is a convenience overload of <code>digit(int, int)</code> + * that provides a decimal radix. + * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this + * treated numeric letters and other numbers as digits. This has + * been changed to conform to the java semantics. + * @param ch the code point to query + * @return the numeric value represented by the code point, + * or -1 if the code point is not a decimal digit or if its + * value is too large for a decimal radix + * @stable ICU 2.1 + */ + public static int digit(int ch) + { + return UCharacterProperty.INSTANCE.digit(ch); + } + + /** + * Returns a value indicating a code point's Unicode category. + * Up-to-date Unicode implementation of java.lang.Character.getType() + * except for the above mentioned code points that had their category + * changed.<br> + * Return results are constants from the interface + * <a href=UCharacterCategory.html>UCharacterCategory</a><br> + * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with + * those returned by java.lang.Character.getType. 
UCharacterCategory values + * match the ones used in ICU4C, while java.lang.Character type + * values, though similar, skip the value 17.</p> + * @param ch code point whose type is to be determined + * @return category which is a value of UCharacterCategory + * @stable ICU 2.1 + */ + public static int getType(int ch) + { + return UCharacterProperty.INSTANCE.getType(ch); + } + + /** + * Returns the Bidirection property of a code point. + * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional + * property.<br> + * Result returned belongs to the interface + * <a href=UCharacterDirection.html>UCharacterDirection</a> + * @param ch the code point to be determined its direction + * @return direction constant from UCharacterDirection. + * @stable ICU 2.1 + */ + public static int getDirection(int ch) + { + return UBiDiProps.INSTANCE.getClass(ch); + } + + /** + * Maps the specified code point to a "mirror-image" code point. + * For code points with the "mirrored" property, implementations sometimes + * need a "poor man's" mapping to another code point such that the default + * glyph may serve as the mirror-image of the default glyph of the + * specified code point.<br> + * This is useful for text conversion to and from codepages with visual + * order, and for displays without glyph selection capabilities. + * @param ch code point whose mirror is to be retrieved + * @return another code point that may serve as a mirror-image substitute, + * or ch itself if there is no such mapping or ch does not have the + * "mirrored" property + * @stable ICU 2.1 + */ + public static int getMirror(int ch) + { + return UBiDiProps.INSTANCE.getMirror(ch); + } + + /** + * Maps the specified character to its paired bracket character. + * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int). + * Otherwise c itself is returned. 
+ * See http://www.unicode.org/reports/tr9/ + * + * @param c the code point to be mapped + * @return the paired bracket code point, + * or c itself if there is no such mapping + * (Bidi_Paired_Bracket_Type=None) + * + * @see UProperty#BIDI_PAIRED_BRACKET + * @see UProperty#BIDI_PAIRED_BRACKET_TYPE + * @see #getMirror(int) + * @stable ICU 52 + */ + public static int getBidiPairedBracket(int c) { + return UBiDiProps.INSTANCE.getPairedBracket(c); + } + + /** + * Returns the combining class of the argument codepoint + * @param ch code point whose combining is to be retrieved + * @return the combining class of the codepoint + * @stable ICU 2.1 + */ + public static int getCombiningClass(int ch) + { + return Normalizer2.getNFDInstance().getCombiningClass(ch); + } + + /** + * Returns the version of Unicode data used. + * @return the unicode version number used + * @stable ICU 2.1 + */ + public static VersionInfo getUnicodeVersion() + { + return UCharacterProperty.INSTANCE.m_unicodeVersion_; + } + + /** + * Returns a code point corresponding to the two UTF16 characters. + * @param lead the lead char + * @param trail the trail char + * @return code point if surrogate characters are valid. + * @exception IllegalArgumentException thrown when argument characters do + * not form a valid codepoint + * @stable ICU 2.1 + */ + public static int getCodePoint(char lead, char trail) + { + if (UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail)) { + return UCharacterProperty.getRawSupplementary(lead, trail); + } + throw new IllegalArgumentException("Illegal surrogate characters"); + } + + /** + * Returns the "age" of the code point.</p> + * <p>The "age" is the Unicode version when the code point was first + * designated (as a non-character or for Private Use) or assigned a + * character. 
+ * <p>This can be useful to avoid emitting code points to receiving + * processes that do not accept newer characters.</p> + * <p>The data is from the UCD file DerivedAge.txt.</p> + * @param ch The code point. + * @return the Unicode version number + * @stable ICU 2.6 + */ + public static VersionInfo getAge(int ch) + { + if (ch < MIN_VALUE || ch > MAX_VALUE) { + throw new IllegalArgumentException("Codepoint out of bounds"); + } + return UCharacterProperty.INSTANCE.getAge(ch); + } + + /** + * Returns the property value for an Unicode property type of a code point. + * Also returns binary and mask property values.</p> + * <p>Unicode, especially in version 3.2, defines many more properties than + * the original set in UnicodeData.txt.</p> + * <p>The properties APIs are intended to reflect Unicode properties as + * defined in the Unicode Character Database (UCD) and Unicode Technical + * Reports (UTR). For details about the properties see + * http://www.unicode.org/.</p> + * <p>For names of Unicode properties see the UCD file PropertyAliases.txt. + * </p> + * <pre> + * Sample usage: + * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH); + * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC); + * boolean b = (ideo == 1) ? true : false; + * </pre> + * @param ch code point to test. + * @param type UProperty selector constant, identifies which binary + * property to check. Must be + * UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or + * UProperty.INT_START <= type < UProperty.INT_LIMIT or + * UProperty.MASK_START <= type < UProperty.MASK_LIMIT. + * @return numeric value that is directly the property value or, + * for enumerated properties, corresponds to the numeric value of + * the enumerated constant of the respective property value + * enumeration type (cast to enum type if necessary). + * Returns 0 or 1 (for false / true) for binary Unicode properties. + * Returns a bit-mask for mask properties. 
+ * Returns 0 if 'type' is out of bounds or if the Unicode version + * does not have data for the property at all, or not for this code + * point. + * @see UProperty + * @see #hasBinaryProperty + * @see #getIntPropertyMinValue + * @see #getIntPropertyMaxValue + * @see #getUnicodeVersion + * @stable ICU 2.4 + */ + // for BiDiBase.java + public static int getIntPropertyValue(int ch, int type) { + return UCharacterProperty.INSTANCE.getIntPropertyValue(ch, type); + } + + // private constructor ----------------------------------------------- + + /** + * Private constructor to prevent instantiation + */ + private UCharacter() { } + + /* + * Copied from UCharacterEnums.java + */ + + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final byte NON_SPACING_MARK = 6; + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final byte ENCLOSING_MARK = 7; + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final byte COMBINING_SPACING_MARK = 8; + /** + * Character type count + * @stable ICU 2.1 + */ + public static final byte CHAR_CATEGORY_COUNT = 30; + + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/lang/UCharacterDirection.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* +/** +******************************************************************************* +* Copyright (C) 1996-2004, International Business Machines Corporation and * +* others. All Rights Reserved. * +******************************************************************************* +*/ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterDirection.java +// - move from package com.ibm.icu.lang to package sun.net.idn +// + +package jdk.internal.icu.lang; + +/** + * Enumerated Unicode character linguistic direction constants. 
+ * Used as return results from <a href=UCharacter.html>UCharacter</a> + * <p> + * This class is not subclassable + * </p> + * @author Syn Wee Quek + * @stable ICU 2.1 + */ + +@SuppressWarnings("deprecation") +public final class UCharacterDirection implements UCharacterEnums.ECharacterDirection { + + // private constructor ========================================= + ///CLOVER:OFF + /** + * Private constructor to prevent instantiation + */ + private UCharacterDirection() + { + } + ///CLOVER:ON + + /** + * Gets the name of the argument direction + * @param dir direction type to retrieve name + * @return directional name, or "Unassigned" if dir matches none of the direction constants handled below + * @stable ICU 2.1 + */ + public static String toString(int dir) { + switch(dir) + { + case LEFT_TO_RIGHT : + return "Left-to-Right"; + case RIGHT_TO_LEFT : + return "Right-to-Left"; + case EUROPEAN_NUMBER : + return "European Number"; + case EUROPEAN_NUMBER_SEPARATOR : + return "European Number Separator"; + case EUROPEAN_NUMBER_TERMINATOR : + return "European Number Terminator"; + case ARABIC_NUMBER : + return "Arabic Number"; + case COMMON_NUMBER_SEPARATOR : + return "Common Number Separator"; + case BLOCK_SEPARATOR : + return "Paragraph Separator"; + case SEGMENT_SEPARATOR : + return "Segment Separator"; + case WHITE_SPACE_NEUTRAL : + return "Whitespace"; + case OTHER_NEUTRAL : + return "Other Neutrals"; + case LEFT_TO_RIGHT_EMBEDDING : + return "Left-to-Right Embedding"; + case LEFT_TO_RIGHT_OVERRIDE : + return "Left-to-Right Override"; + case RIGHT_TO_LEFT_ARABIC : + return "Right-to-Left Arabic"; + case RIGHT_TO_LEFT_EMBEDDING : + return "Right-to-Left Embedding"; + case RIGHT_TO_LEFT_OVERRIDE : + return "Right-to-Left Override"; + case POP_DIRECTIONAL_FORMAT : + return "Pop Directional Format"; + case DIR_NON_SPACING_MARK : + return "Non-Spacing Mark"; + case BOUNDARY_NEUTRAL : + return "Boundary Neutral"; + } + return "Unassigned"; + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/java.base/share/classes/jdk/internal/icu/lang/UCharacterEnums.java Mon Jan 13 08:05:59 2020 -0800 @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ +/* +/** + ******************************************************************************* + * Copyright (C) 2004, International Business Machines Corporation and * + * others. All Rights Reserved. 
* + ******************************************************************************* + */ +// CHANGELOG +// 2005-05-19 Edward Wang +// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterEnums.java +// - move from package com.ibm.icu.lang to package sun.net.idn +// +// 2011-09-06 Kurchi Subhra Hazra +// - Added @Deprecated tag to the following: +// - class UCharacterEnums +// - interfaces ECharacterCategory, ECharacterDirection +// - fields INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION, +// DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT, +// DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR +// DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER, +// DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_PARAGRAPH_SEPARATOR, +// DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE, +// DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING, +// DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC, +// DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE, +// DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DIRECTIONALITY_NON_SPACING_MARK, +// DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_UNDEFINED +// + +package jdk.internal.icu.lang; + +/** + * A container for the different 'enumerated types' used by UCharacter. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + +@Deprecated +class UCharacterEnums { + + /** This is just a namespace, it is not instantiatable. */ + private UCharacterEnums() {}; + + /** + * 'Enum' for the CharacterCategory constants. These constants are + * compatible in name <b>but not in value</b> with those defined in + * <code>java.lang.Character</code>. + * @see UCharacterCategory + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. 
+ */ + @Deprecated + public static interface ECharacterCategory { + /** + * Unassigned character type + * @stable ICU 2.1 + */ + public static final int UNASSIGNED = 0; + + /** + * Character type Cn + * Not Assigned (no characters in [UnicodeData.txt] have this property) + * @stable ICU 2.6 + */ + public static final int GENERAL_OTHER_TYPES = 0; + + /** + * Character type Lu + * @stable ICU 2.1 + */ + public static final int UPPERCASE_LETTER = 1; + + /** + * Character type Ll + * @stable ICU 2.1 + */ + public static final int LOWERCASE_LETTER = 2; + + /** + * Character type Lt + * @stable ICU 2.1 + */ + + public static final int TITLECASE_LETTER = 3; + + /** + * Character type Lm + * @stable ICU 2.1 + */ + public static final int MODIFIER_LETTER = 4; + + /** + * Character type Lo + * @stable ICU 2.1 + */ + public static final int OTHER_LETTER = 5; + + /** + * Character type Mn + * @stable ICU 2.1 + */ + public static final int NON_SPACING_MARK = 6; + + /** + * Character type Me + * @stable ICU 2.1 + */ + public static final int ENCLOSING_MARK = 7; + + /** + * Character type Mc + * @stable ICU 2.1 + */ + public static final int COMBINING_SPACING_MARK = 8; + + /** + * Character type Nd + * @stable ICU 2.1 + */ + public static final int DECIMAL_DIGIT_NUMBER = 9; + + /** + * Character type Nl + * @stable ICU 2.1 + */ + public static final int LETTER_NUMBER = 10; + + /** + * Character type No + * @stable ICU 2.1 + */ + public static final int OTHER_NUMBER = 11; + + /** + * Character type Zs + * @stable ICU 2.1 + */ + public static final int SPACE_SEPARATOR = 12; + + /** + * Character type Zl + * @stable ICU 2.1 + */ + public static final int LINE_SEPARATOR = 13; + + /** + * Character type Zp + * @stable ICU 2.1 + */ + public static final int PARAGRAPH_SEPARATOR = 14; + + /** + * Character type Cc + * @stable ICU 2.1 + */ + public static final int CONTROL = 15; + + /** + * Character type Cf + * @stable ICU 2.1 + */ + public static final int FORMAT = 16; + + /** + * 
Character type Co + * @stable ICU 2.1 + */ + public static final int PRIVATE_USE = 17; + + /** + * Character type Cs + * @stable ICU 2.1 + */ + public static final int SURROGATE = 18; + + /** + * Character type Pd + * @stable ICU 2.1 + */ + public static final int DASH_PUNCTUATION = 19; + + /** + * Character type Ps + * @stable ICU 2.1 + */ + public static final int START_PUNCTUATION = 20; + + /** + * Character type Pe + * @stable ICU 2.1 + */ + public static final int END_PUNCTUATION = 21; + + /** + * Character type Pc + * @stable ICU 2.1 + */ + public static final int CONNECTOR_PUNCTUATION = 22; + + /** + * Character type Po + * @stable ICU 2.1 + */ + public static final int OTHER_PUNCTUATION = 23; + + /** + * Character type Sm + * @stable ICU 2.1 + */ + public static final int MATH_SYMBOL = 24; + + /** + * Character type Sc + * @stable ICU 2.1 + */ + public static final int CURRENCY_SYMBOL = 25; + + /** + * Character type Sk + * @stable ICU 2.1 + */ + public static final int MODIFIER_SYMBOL = 26; + + /** + * Character type So + * @stable ICU 2.1 + */ + public static final int OTHER_SYMBOL = 27; + + /** + * Character type Pi + * @see #INITIAL_QUOTE_PUNCTUATION + * @stable ICU 2.1 + */ + public static final int INITIAL_PUNCTUATION = 28; + + /** + * Character type Pi + * This name is compatible with java.lang.Character's name for this type. + * @see #INITIAL_PUNCTUATION + * @draft ICU 2.8 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final int INITIAL_QUOTE_PUNCTUATION = 28; + + /** + * Character type Pf + * @see #FINAL_QUOTE_PUNCTUATION + * @stable ICU 2.1 + */ + public static final int FINAL_PUNCTUATION = 29; + + /** + * Character type Pf + * This name is compatible with java.lang.Character's name for this type. + * @see #FINAL_PUNCTUATION + * @draft ICU 2.8 + * @deprecated This is a draft API and might change in a future release of ICU. 
+ */ + @Deprecated + public static final int FINAL_QUOTE_PUNCTUATION = 29; + + /** + * Character type count + * @stable ICU 2.1 + */ + public static final int CHAR_CATEGORY_COUNT = 30; + } + + /** + * 'Enum' for the CharacterDirection constants. There are two sets + * of names, those used in ICU, and those used in the JDK. The + * JDK constants are compatible in name <b>but not in value</b> + * with those defined in <code>java.lang.Character</code>. + * @see UCharacterDirection + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + + @Deprecated + public static interface ECharacterDirection { + /** + * Directional type L + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT = 0; + + /** + * JDK-compatible synonum for LEFT_TO_RIGHT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = (byte)LEFT_TO_RIGHT; + + /** + * Directional type R + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT = 1; + + /** + * JDK-compatible synonum for RIGHT_TO_LEFT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = (byte)RIGHT_TO_LEFT; + + /** + * Directional type EN + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER = 2; + + /** + * JDK-compatible synonum for EUROPEAN_NUMBER. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = (byte)EUROPEAN_NUMBER; + + /** + * Directional type ES + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER_SEPARATOR = 3; + + /** + * JDK-compatible synonum for EUROPEAN_NUMBER_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. 
+ */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = (byte)EUROPEAN_NUMBER_SEPARATOR; + + /** + * Directional type ET + * @stable ICU 2.1 + */ + public static final int EUROPEAN_NUMBER_TERMINATOR = 4; + + /** + * JDK-compatible synonum for EUROPEAN_NUMBER_TERMINATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = (byte)EUROPEAN_NUMBER_TERMINATOR; + + /** + * Directional type AN + * @stable ICU 2.1 + */ + public static final int ARABIC_NUMBER = 5; + + /** + * JDK-compatible synonum for ARABIC_NUMBER. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_ARABIC_NUMBER = (byte)ARABIC_NUMBER; + + /** + * Directional type CS + * @stable ICU 2.1 + */ + public static final int COMMON_NUMBER_SEPARATOR = 6; + + /** + * JDK-compatible synonum for COMMON_NUMBER_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = (byte)COMMON_NUMBER_SEPARATOR; + + /** + * Directional type B + * @stable ICU 2.1 + */ + public static final int BLOCK_SEPARATOR = 7; + + /** + * JDK-compatible synonum for BLOCK_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = (byte)BLOCK_SEPARATOR; + + /** + * Directional type S + * @stable ICU 2.1 + */ + public static final int SEGMENT_SEPARATOR = 8; + + /** + * JDK-compatible synonum for SEGMENT_SEPARATOR. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. 
+ */ + @Deprecated + public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = (byte)SEGMENT_SEPARATOR; + + /** + * Directional type WS + * @stable ICU 2.1 + */ + public static final int WHITE_SPACE_NEUTRAL = 9; + + /** + * JDK-compatible synonum for WHITE_SPACE_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_WHITESPACE = (byte)WHITE_SPACE_NEUTRAL; + + /** + * Directional type ON + * @stable ICU 2.1 + */ + public static final int OTHER_NEUTRAL = 10; + + /** + * JDK-compatible synonum for OTHER_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_OTHER_NEUTRALS = (byte)OTHER_NEUTRAL; + + /** + * Directional type LRE + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT_EMBEDDING = 11; + + /** + * JDK-compatible synonum for LEFT_TO_RIGHT_EMBEDDING. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = (byte)LEFT_TO_RIGHT_EMBEDDING; + + /** + * Directional type LRO + * @stable ICU 2.1 + */ + public static final int LEFT_TO_RIGHT_OVERRIDE = 12; + + /** + * JDK-compatible synonum for LEFT_TO_RIGHT_OVERRIDE. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = (byte)LEFT_TO_RIGHT_OVERRIDE; + + /** + * Directional type AL + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_ARABIC = 13; + + /** + * JDK-compatible synonum for RIGHT_TO_LEFT_ARABIC. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. 
+ */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = (byte)RIGHT_TO_LEFT_ARABIC; + + /** + * Directional type RLE + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_EMBEDDING = 14; + + /** + * JDK-compatible synonum for RIGHT_TO_LEFT_EMBEDDING. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = (byte)RIGHT_TO_LEFT_EMBEDDING; + + /** + * Directional type RLO + * @stable ICU 2.1 + */ + public static final int RIGHT_TO_LEFT_OVERRIDE = 15; + + /** + * JDK-compatible synonum for RIGHT_TO_LEFT_OVERRIDE. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = (byte)RIGHT_TO_LEFT_OVERRIDE; + + /** + * Directional type PDF + * @stable ICU 2.1 + */ + public static final int POP_DIRECTIONAL_FORMAT = 16; + + /** + * JDK-compatible synonum for POP_DIRECTIONAL_FORMAT. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = (byte)POP_DIRECTIONAL_FORMAT; + + /** + * Directional type NSM + * @stable ICU 2.1 + */ + public static final int DIR_NON_SPACING_MARK = 17; + + /** + * JDK-compatible synonum for DIR_NON_SPACING_MARK. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of ICU. + */ + @Deprecated + public static final byte DIRECTIONALITY_NON_SPACING_MARK = (byte)DIR_NON_SPACING_MARK; + + /** + * Directional type BN + * @stable ICU 2.1 + */ + public static final int BOUNDARY_NEUTRAL = 18; + + /** + * JDK-compatible synonum for BOUNDARY_NEUTRAL. + * @draft ICU 3.0 + * @deprecated This is a draft API and might change in a future release of