changeset 58545:ba672c242599

8237599: Greedy matching against supplementary chars fails to respect the region
Reviewed-by: rriggs
author igerasim
date Wed, 25 Mar 2020 08:46:31 -0700
parents 5d3f6f0582fe
children ee707e1ced80
files src/java.base/share/classes/java/util/regex/Pattern.java test/jdk/java/util/regex/RegExTest.java
diffstat 2 files changed, 56 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/src/java.base/share/classes/java/util/regex/Pattern.java	Wed Mar 25 14:41:52 2020 +0100
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java	Wed Mar 25 08:46:31 2020 -0700
@@ -4340,14 +4340,22 @@
             this.cmin = cmin;
         }
         boolean match(Matcher matcher, int i, CharSequence seq) {
+            int starti = i;
             int n = 0;
             int to = matcher.to;
             // greedy, all the way down
             while (i < to) {
                 int ch = Character.codePointAt(seq, i);
+                int len = Character.charCount(ch);
+                if (i + len > to) {
+                    // the region cut off the high half of a surrogate pair
+                    matcher.hitEnd = true;
+                    ch = seq.charAt(i);
+                    len = 1;
+                }
                 if (!predicate.is(ch))
-                   break;
-                i += Character.charCount(ch);
+                    break;
+                i += len;
                 n++;
             }
             if (i >= to) {
@@ -4358,9 +4366,10 @@
                     return true;
                 if (n == cmin)
                     return false;
-                 // backing off if match fails
+                // backing off if match fails
                 int ch = Character.codePointBefore(seq, i);
-                i -= Character.charCount(ch);
+                // check if the region cut off the low half of a surrogate pair
+                i = Math.max(starti, i - Character.charCount(ch));
                 n--;
             }
             return false;
--- a/test/jdk/java/util/regex/RegExTest.java	Wed Mar 25 14:41:52 2020 +0100
+++ b/test/jdk/java/util/regex/RegExTest.java	Wed Mar 25 08:46:31 2020 -0700
@@ -36,7 +36,7 @@
  * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
  * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
  * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
- * 8216332 8214245
+ * 8216332 8214245 8237599
  *
  * @library /test/lib
  * @library /lib/testlibrary/java/lang
@@ -195,6 +195,7 @@
         surrogatePairWithCanonEq();
         lineBreakWithQuantifier();
         caseInsensitivePMatch();
+        surrogatePairOverlapRegion();
 
         if (failure) {
             throw new
@@ -5155,4 +5156,45 @@
         }
         report("caseInsensitivePMatch");
     }
+
+    // This test is for 8237599: a region boundary that splits a surrogate pair
+    private static void surrogatePairOverlapRegion() {
+        String input = "\ud801\udc37"; // one high + one low surrogate char
+
+        Pattern p = Pattern.compile(".+");
+        Matcher m = p.matcher(input);
+        m.region(0, 1); // region covers only the high surrogate
+
+        boolean ok = m.find();
+        if (!ok || !m.group(0).equals(input.substring(0, 1)))
+        {
+            failCount++;
+            System.out.println("Input \"" + input + "\".substr(0, 1)" +
+                    " expected to match pattern \"" + p + "\"");
+            if (ok) {
+                System.out.println("group(0): \"" + m.group(0) + "\"");
+            }
+        } else if (!m.hitEnd()) { // a correct match must also report hitEnd
+            failCount++;
+            System.out.println("Expected m.hitEnd() == true");
+        }
+
+        p = Pattern.compile(".*(.)");
+        m = p.matcher(input);
+        m.region(1, 2); // region covers only the low surrogate
+
+        ok = m.find();
+        if (!ok || !m.group(0).equals(input.substring(1, 2))
+                || !m.group(1).equals(input.substring(1, 2)))
+        {
+            failCount++;
+            System.out.println("Input \"" + input + "\".substr(1, 2)" +
+                    " expected to match pattern \"" + p + "\"");
+            if (ok) {
+                System.out.println("group(0): \"" + m.group(0) + "\"");
+                System.out.println("group(1): \"" + m.group(1) + "\"");
+            }
+        }
+        report("surrogatePairOverlapRegion");
+    }
 }