From 50fa5e44c79841852b06d963a9dac19406456fb3 Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Wed, 18 Dec 2024 21:02:23 -0800 Subject: [PATCH 01/10] STDREGEX --- build.sbt | 1 + build.sc | 3 +- sjsonnet/src-js/sjsonnet/Platform.scala | 12 +++ sjsonnet/src-jvm/sjsonnet/Platform.scala | 10 +++ sjsonnet/src-native/sjsonnet/Platform.scala | 9 +++ .../src/sjsonnet/PrettyYamlRenderer.scala | 4 +- sjsonnet/src/sjsonnet/Std.scala | 28 +++---- sjsonnet/src/sjsonnet/StdRegex.scala | 73 +++++++++++++++++++ sjsonnet/src/sjsonnet/TomlRenderer.scala | 3 +- sjsonnet/src/sjsonnet/YamlRenderer.scala | 19 ++--- .../test/src/sjsonnet/OldYamlRenderer.scala | 2 +- .../test/src/sjsonnet/StdRegexTests.scala | 38 ++++++++++ 12 files changed, 167 insertions(+), 35 deletions(-) create mode 100644 sjsonnet/src/sjsonnet/StdRegex.scala create mode 100644 sjsonnet/test/src/sjsonnet/StdRegexTests.scala diff --git a/build.sbt b/build.sbt index c68d7cc7..1a6a9913 100644 --- a/build.sbt +++ b/build.sbt @@ -21,6 +21,7 @@ lazy val main = (project in file("sjsonnet")) "org.scala-lang.modules" %% "scala-collection-compat" % "2.11.0", "org.tukaani" % "xz" % "1.8", "org.yaml" % "snakeyaml" % "1.33", + "com.google.re2j" % "re2j" % "1.7", ), libraryDependencies ++= Seq( "com.lihaoyi" %% "utest" % "0.8.2", diff --git a/build.sc b/build.sc index 8d690590..041b1876 100644 --- a/build.sc +++ b/build.sc @@ -108,7 +108,8 @@ object sjsonnet extends Module { ivy"org.json:json:20240303", ivy"org.tukaani:xz::1.10", ivy"org.lz4:lz4-java::1.8.0", - ivy"org.yaml:snakeyaml::1.33" + ivy"org.yaml:snakeyaml::1.33", + ivy"com.google.re2j:re2j:1.7", ) def scalacOptions = Seq("-opt:l:inline", "-opt-inline-from:sjsonnet.**") diff --git a/sjsonnet/src-js/sjsonnet/Platform.scala b/sjsonnet/src-js/sjsonnet/Platform.scala index 45c2ede3..f5cac641 100644 --- a/sjsonnet/src-js/sjsonnet/Platform.scala +++ b/sjsonnet/src-js/sjsonnet/Platform.scala @@ -1,5 +1,10 @@ package sjsonnet + import java.io.File +import java.util +import 
java.util.regex.Pattern + + object Platform { def gzipBytes(s: Array[Byte]): String = { throw new Exception("GZip not implemented in Scala.js") @@ -34,4 +39,11 @@ object Platform { def hashFile(file: File): String = { throw new Exception("hashFile not implemented in Scala.js") } + + private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { + override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 + } + def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) + + def regexQuote(s: String): String = Pattern.quote(s) } diff --git a/sjsonnet/src-jvm/sjsonnet/Platform.scala b/sjsonnet/src-jvm/sjsonnet/Platform.scala index 867d6b03..da9e4cba 100644 --- a/sjsonnet/src-jvm/sjsonnet/Platform.scala +++ b/sjsonnet/src-jvm/sjsonnet/Platform.scala @@ -1,14 +1,17 @@ package sjsonnet import java.io.{BufferedInputStream, ByteArrayOutputStream, File, FileInputStream} +import java.util import java.util.Base64 import java.util.zip.GZIPOutputStream +import com.google.re2j.Pattern import net.jpountz.xxhash.{StreamingXXHash64, XXHashFactory} import org.json.{JSONArray, JSONObject} import org.tukaani.xz.LZMA2Options import org.tukaani.xz.XZOutputStream import org.yaml.snakeyaml.{LoaderOptions, Yaml} import org.yaml.snakeyaml.constructor.SafeConstructor + import scala.jdk.CollectionConverters._ object Platform { @@ -107,4 +110,11 @@ object Platform { hash.getValue.toString } + + private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { + override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 + } + def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) + + def regexQuote(s: String): String = Pattern.quote(s) } diff --git a/sjsonnet/src-native/sjsonnet/Platform.scala b/sjsonnet/src-native/sjsonnet/Platform.scala index dd88e7c8..7baaea2f 100644 --- 
a/sjsonnet/src-native/sjsonnet/Platform.scala +++ b/sjsonnet/src-native/sjsonnet/Platform.scala @@ -1,8 +1,10 @@ package sjsonnet import java.io.{ByteArrayOutputStream, File} +import java.util import java.util.Base64 import java.util.zip.GZIPOutputStream +import java.util.regex.Pattern object Platform { def gzipBytes(b: Array[Byte]): String = { @@ -50,4 +52,11 @@ object Platform { // File hashes in Scala Native are just the file content scala.io.Source.fromFile(file).mkString } + + private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { + override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 + } + def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) + + def regexQuote(s: String): String = Pattern.quote(s) } diff --git a/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala b/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala index a2829fa2..de8b4f08 100644 --- a/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala +++ b/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala @@ -1,10 +1,8 @@ package sjsonnet import java.io.{StringWriter, Writer} -import java.util.regex.Pattern import upickle.core.{ArrVisitor, ObjVisitor} -import fastparse.IndexedParserInput import scala.collection.mutable /** @@ -240,7 +238,7 @@ object PrettyYamlRenderer{ */ def writeBlockString(str: String, out: Writer, depth: Int, indent: Int, lineComment: String) = { val len = str.length() - val splits = YamlRenderer.newlinePattern.split(str, -1) + val splits = Platform.getPatternFromCache("\n").split(str, -1) val blockOffsetNumeral = if (str.charAt(0) != ' ') "" else indent val (blockStyle, dropRight) = (str.charAt(len - 1), if (len > 2) Some(str.charAt(len - 2)) else None) match{ diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 46e63024..b35f3acf 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -4,12 +4,10 @@ import 
java.io.StringWriter import java.nio.charset.StandardCharsets.UTF_8 import java.util.Base64 import java.util -import java.util.regex.Pattern import sjsonnet.Expr.Member.Visibility import scala.collection.Searching._ import scala.collection.mutable -import scala.util.matching.Regex /** * The Jsonnet standard library, `std`, with each builtin function implemented @@ -19,8 +17,8 @@ import scala.util.matching.Regex class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map.empty) { private val dummyPos: Position = new Position(null, 0) private val emptyLazyArray = new Array[Lazy](0) - private val leadingWhiteSpacePattern = Pattern.compile("^[ \t\n\f\r\u0085\u00A0']+") - private val trailingWhiteSpacePattern = Pattern.compile("[ \t\n\f\r\u0085\u00A0']+$") + private val leadingWhiteSpacePattern = Platform.getPatternFromCache("^[ \t\n\f\r\u0085\u00A0']+") + private val trailingWhiteSpacePattern = Platform.getPatternFromCache("[ \t\n\f\r\u0085\u00A0']+$") private val oldNativeFunctions = Map( builtin("gzip", "v"){ (_, _, v: Val) => v match{ @@ -48,7 +46,7 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. }, ) require(oldNativeFunctions.forall(k => !additionalNativeFunctions.contains(k._1)), "Conflicting native functions") - private val nativeFunctions = oldNativeFunctions ++ additionalNativeFunctions + private val nativeFunctions = oldNativeFunctions ++ additionalNativeFunctions ++ StdRegex.functions private object AssertEqual extends Val.Builtin2("assertEqual", "a", "b") { def evalRhs(v1: Val, v2: Val, ev: EvalScope, pos: Position): Val = { @@ -474,26 +472,24 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. 
Val.Str(pos, str.asString.replaceAll(from.asString, to.asString)) override def specialize(args: Array[Expr]) = args match { case Array(str, from: Val.Str, to) => - try { (new SpecFrom(Pattern.compile(from.value)), Array(str, to)) } catch { case _: Exception => null } + try { (new SpecFrom(from.value), Array(str, to)) } catch { case _: Exception => null } case _ => null } - private class SpecFrom(from: Pattern) extends Val.Builtin2("strReplaceAll", "str", "to") { + private class SpecFrom(from: String) extends Val.Builtin2("strReplaceAll", "str", "to") { def evalRhs(str: Val, to: Val, ev: EvalScope, pos: Position): Val = - Val.Str(pos, from.matcher(str.asString).replaceAll(to.asString)) + Val.Str(pos, Platform.getPatternFromCache(from).matcher(str.asString).replaceAll(to.asString)) } } private object StripUtils { - private def getLeadingPattern(chars: String): Pattern = - Pattern.compile("^[" + Regex.quote(chars) + "]+") + private def getLeadingPattern(chars: String): String = "^[" + Platform.regexQuote(chars) + "]+" - private def getTrailingPattern(chars: String): Pattern = - Pattern.compile("[" + Regex.quote(chars) + "]+$") + private def getTrailingPattern(chars: String): String = "[" + Platform.regexQuote(chars) + "]+$" def unspecializedStrip(str: String, chars: String, left: Boolean, right: Boolean): String = { var s = str - if (right) s = getTrailingPattern(chars).matcher(s).replaceAll("") - if (left) s = getLeadingPattern(chars).matcher(s).replaceAll("") + if (right) s = Platform.getPatternFromCache(getTrailingPattern(chars)).matcher(s).replaceAll("") + if (left) s = Platform.getPatternFromCache(getLeadingPattern(chars)).matcher(s).replaceAll("") s } @@ -508,8 +504,8 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. 
def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = { var s = str.asString - if (right) s = rightPattern.matcher(s).replaceAll("") - if (left) s = leftPattern.matcher(s).replaceAll("") + if (right) s = Platform.getPatternFromCache(rightPattern).matcher(s).replaceAll("") + if (left) s = Platform.getPatternFromCache(leftPattern).matcher(s).replaceAll("") Val.Str(pos, s) } } diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala new file mode 100644 index 00000000..391e42e3 --- /dev/null +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -0,0 +1,73 @@ +package sjsonnet + +import sjsonnet.Expr.Member.Visibility +import sjsonnet.Val.Obj + +object StdRegex { + def functions: Map[String, Val.Builtin] = Map( + "regexPartialMatch" -> new Val.Builtin2("regexPartialMatch", "pattern", "str") { + override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = { + val compiledPattern = Platform.getPatternFromCache(pattern.asString) + val matcher = compiledPattern.matcher(str.asString) + var returnStr: Val = Val.Null(pos.noOffset) + val captures = Array.newBuilder[Val.Str] + val groupCount = matcher.groupCount() + while (matcher.find()) { + if (returnStr.isInstanceOf[Val.Null]) { + returnStr = Val.Str(pos.noOffset, matcher.group(0)) + } + for (i <- 1 to groupCount) { + captures += Val.Str(pos.noOffset, matcher.group(i)) + } + } + val result = captures.result() + Val.Obj.mk(pos.noOffset, + "string" -> new Obj.ConstMember(true, Visibility.Normal, returnStr), + "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result)) + ) + } + }, + "regexFullMatch" -> new Val.Builtin2("regexFullMatch", "pattern", "str") { + override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = { + val compiledPattern = Platform.getPatternFromCache(pattern.asString) + val matcher = compiledPattern.matcher(str.asString) + if (!matcher.matches()) { + Val.Obj.mk(pos.noOffset, + "string" -> new 
Obj.ConstMember(true, Visibility.Normal, Val.Null(pos.noOffset)), + "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, Array.empty[Lazy])) + ) + } else { + val captures = Array.newBuilder[Val.Str] + val groupCount = matcher.groupCount() + for (i <- 0 to groupCount) { + captures += Val.Str(pos.noOffset, matcher.group(i)) + } + val result = captures.result() + Val.Obj.mk(pos.noOffset, + "string" -> new Obj.ConstMember(true, Visibility.Normal, result.head), + "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result.drop(1))) + ) + } + } + }, + "regexGlobalReplace" -> new Val.Builtin3("regexGlobalReplace", "str", "pattern", "to") { + override def evalRhs(str: Val, pattern: Val, to: Val, ev: EvalScope, pos: Position): Val = { + val compiledPattern = Platform.getPatternFromCache(pattern.asString) + val matcher = compiledPattern.matcher(str.asString) + Val.Str(pos.noOffset, matcher.replaceAll(to.asString)) + } + }, + "regexReplace" -> new Val.Builtin3("regexGlobalReplace", "str", "pattern", "to") { + override def evalRhs(str: Val, pattern: Val, to: Val, ev: EvalScope, pos: Position): Val = { + val compiledPattern = Platform.getPatternFromCache(pattern.asString) + val matcher = compiledPattern.matcher(str.asString) + Val.Str(pos.noOffset, matcher.replaceFirst(to.asString)) + } + }, + "regexQuoteMeta" -> new Val.Builtin1("regexQuoteMeta", "str") { + override def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = { + Val.Str(pos.noOffset, Platform.regexQuote(str.asString)) + } + } + ) +} diff --git a/sjsonnet/src/sjsonnet/TomlRenderer.scala b/sjsonnet/src/sjsonnet/TomlRenderer.scala index 04fb284b..4f467b65 100644 --- a/sjsonnet/src/sjsonnet/TomlRenderer.scala +++ b/sjsonnet/src/sjsonnet/TomlRenderer.scala @@ -3,7 +3,6 @@ package sjsonnet import upickle.core.{ArrVisitor, CharBuilder, ObjVisitor, SimpleVisitor, Visitor} import java.io.StringWriter -import java.util.regex.Pattern class TomlRenderer(out: 
StringWriter = new java.io.StringWriter(), cumulatedIndent: String, indent: String) extends SimpleVisitor[StringWriter, StringWriter]{ @@ -117,7 +116,7 @@ class TomlRenderer(out: StringWriter = new java.io.StringWriter(), cumulatedInde } object TomlRenderer { - private val bareAllowed = Pattern.compile("[A-Za-z0-9_-]+") + private val bareAllowed = Platform.getPatternFromCache("[A-Za-z0-9_-]+") def escapeKey(key: String): String = if (bareAllowed.matcher(key).matches()) key else { val out = new StringWriter() BaseRenderer.escape(out, key, unicode = true) diff --git a/sjsonnet/src/sjsonnet/YamlRenderer.scala b/sjsonnet/src/sjsonnet/YamlRenderer.scala index d2b2b3c2..570689a6 100644 --- a/sjsonnet/src/sjsonnet/YamlRenderer.scala +++ b/sjsonnet/src/sjsonnet/YamlRenderer.scala @@ -1,12 +1,8 @@ package sjsonnet import java.io.StringWriter -import java.util.regex.Pattern import upickle.core.{ArrVisitor, ObjVisitor, SimpleVisitor, Visitor} -import scala.util.Try - - class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayInObject: Boolean = false, quoteKeys: Boolean = true, indent: Int = 2) extends BaseCharRenderer(_out, indent){ @@ -52,7 +48,7 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI elemBuilder.append('"') elemBuilder.append('"') } else if (s.charAt(len - 1) == '\n') { - val splits = YamlRenderer.newlinePattern.split(s) + val splits = Platform.getPatternFromCache("\n").split(s.toString) elemBuilder.append('|') depth += 1 splits.foreach { split => @@ -174,15 +170,14 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI } } object YamlRenderer{ - val newlinePattern: Pattern = Pattern.compile("\n") - private val safeYamlKeyPattern = Pattern.compile("^[a-zA-Z0-9/._-]+$") + private val safeYamlKeyPattern = Platform.getPatternFromCache("^[a-zA-Z0-9/._-]+$") private val yamlReserved = Set("true", "false", "null", "yes", "no", "on", "off", "y", "n", ".nan", "+.inf", "-.inf", ".inf", 
"null", "-", "---", "''") - private val yamlTimestampPattern = Pattern.compile("^(?:[0-9]*-){2}[0-9]*$") - private val yamlBinaryPattern = Pattern.compile("^[-+]?0b[0-1_]+$") - private val yamlHexPattern = Pattern.compile("[-+]?0x[0-9a-fA-F_]+") - private val yamlFloatPattern = Pattern.compile( "^-?([0-9_]*)*(\\.[0-9_]*)?(e[-+][0-9_]+)?$" ) - private val yamlIntPattern = Pattern.compile("^[-+]?[0-9_]+$") + private val yamlTimestampPattern = Platform.getPatternFromCache("^(?:[0-9]*-){2}[0-9]*$") + private val yamlBinaryPattern = Platform.getPatternFromCache("^[-+]?0b[0-1_]+$") + private val yamlHexPattern = Platform.getPatternFromCache("[-+]?0x[0-9a-fA-F_]+") + private val yamlFloatPattern = Platform.getPatternFromCache( "^-?([0-9_]*)*(\\.[0-9_]*)?(e[-+][0-9_]+)?$" ) + private val yamlIntPattern = Platform.getPatternFromCache("^[-+]?[0-9_]+$") private def isSafeBareKey(k: String) = { val l = k.toLowerCase diff --git a/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala b/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala index 533f4ee4..20861a54 100644 --- a/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala +++ b/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala @@ -20,7 +20,7 @@ class OldYamlRenderer(out: StringWriter = new java.io.StringWriter(), indentArra val len = s.length() if (len == 0) out.append("\"\"") else if (s.charAt(len - 1) == '\n') { - val splits = YamlRenderer.newlinePattern.split(s) + val splits = Platform.getPatternFromCache("\n").split(s.toString) out.append('|') depth += 1 splits.foreach { split => diff --git a/sjsonnet/test/src/sjsonnet/StdRegexTests.scala b/sjsonnet/test/src/sjsonnet/StdRegexTests.scala new file mode 100644 index 00000000..0f6e6bf9 --- /dev/null +++ b/sjsonnet/test/src/sjsonnet/StdRegexTests.scala @@ -0,0 +1,38 @@ +package sjsonnet + +import sjsonnet.TestUtils.eval +import utest._ + +object StdRegexTests extends TestSuite { + def tests: Tests = Tests { + test("std.native - regex") { + eval("""std.native("regexPartialMatch")("a(b)c", 
"cabc")""") ==> ujson.Obj( + "string" -> "abc", + "captures" -> ujson.Arr("b") + ) + eval("""std.native("regexPartialMatch")("a(b)c", "def")""") ==> ujson.Obj( + "string" -> ujson.Null, + "captures" -> ujson.Arr() + ) + eval("""std.native("regexPartialMatch")("a(b)c", "abcabc")""") ==> ujson.Obj( + "string" -> "abc", + "captures" -> ujson.Arr("b", "b") + ) + eval("""std.native("regexFullMatch")("a(b)c", "abc")""") ==> ujson.Obj( + "string" -> "abc", + "captures" -> ujson.Arr("b") + ) + eval("""std.native("regexFullMatch")("a(b)c", "cabc")""") ==> ujson.Obj( + "string" -> ujson.Null, + "captures" -> ujson.Arr() + ) + eval("""std.native("regexFullMatch")("a(b)c", "def")""") ==> ujson.Obj( + "string" -> ujson.Null, + "captures" -> ujson.Arr() + ) + eval("""std.native("regexGlobalReplace")("abcbbb", "b", "d")""") ==> ujson.Str("adcddd") + eval("""std.native("regexReplace")("abcbbb", "b", "d")""") ==> ujson.Str("adcbbb") + eval("""std.native("regexQuoteMeta")("a.b")""") ==> ujson.Str(Platform.regexQuote("a.b")) + } + } +} From 699a255b8999921a7bada0566c27cb7a946e9ab2 Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Thu, 19 Dec 2024 16:45:20 -0800 Subject: [PATCH 02/10] Handle null matches properly --- sjsonnet/src/sjsonnet/StdRegex.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala index 391e42e3..77759df8 100644 --- a/sjsonnet/src/sjsonnet/StdRegex.scala +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -40,7 +40,12 @@ object StdRegex { val captures = Array.newBuilder[Val.Str] val groupCount = matcher.groupCount() for (i <- 0 to groupCount) { - captures += Val.Str(pos.noOffset, matcher.group(i)) + val m = matcher.group(i) + if (m == null) { + captures += Val.Null(pos.noOffset) + } else { + captures += Val.Str(pos.noOffset, m) + } } val result = captures.result() Val.Obj.mk(pos.noOffset, From a2f9f92c5a59f733e06ba716f6b870641fff561b Mon Sep 17 00:00:00 2001 From: Stephen 
Amar Date: Thu, 19 Dec 2024 16:46:05 -0800 Subject: [PATCH 03/10] Handle null matches properly --- sjsonnet/src/sjsonnet/StdRegex.scala | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala index 77759df8..b60aa405 100644 --- a/sjsonnet/src/sjsonnet/StdRegex.scala +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -17,7 +17,12 @@ object StdRegex { returnStr = Val.Str(pos.noOffset, matcher.group(0)) } for (i <- 1 to groupCount) { - captures += Val.Str(pos.noOffset, matcher.group(i)) + val m = matcher.group(i) + if (m == null) { + captures += Val.Null(pos.noOffset) + } else { + captures += Val.Str(pos.noOffset, m) + } } } val result = captures.result() From 65a822dc0d872d9df5cd85f2b3ed9044fe71402d Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Thu, 19 Dec 2024 16:48:55 -0800 Subject: [PATCH 04/10] Handle null matches properly for partialMatch --- sjsonnet/src/sjsonnet/StdRegex.scala | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala index b60aa405..b7a33b08 100644 --- a/sjsonnet/src/sjsonnet/StdRegex.scala +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -9,12 +9,17 @@ object StdRegex { override def evalRhs(pattern: Val, str: Val, ev: EvalScope, pos: Position): Val = { val compiledPattern = Platform.getPatternFromCache(pattern.asString) val matcher = compiledPattern.matcher(str.asString) - var returnStr: Val = Val.Null(pos.noOffset) + var returnStr: Val = null val captures = Array.newBuilder[Val.Str] val groupCount = matcher.groupCount() while (matcher.find()) { - if (returnStr.isInstanceOf[Val.Null]) { - returnStr = Val.Str(pos.noOffset, matcher.group(0)) + if (returnStr == null) { + val m = matcher.group(0) + if (m != null) { + returnStr = Val.Str(pos.noOffset, matcher.group(0)) + } else { + returnStr = Val.Null(pos.noOffset) + } } for (i <- 1 to groupCount) { val m = 
matcher.group(i) @@ -27,7 +32,8 @@ object StdRegex { } val result = captures.result() Val.Obj.mk(pos.noOffset, - "string" -> new Obj.ConstMember(true, Visibility.Normal, returnStr), + "string" -> new Obj.ConstMember(true, Visibility.Normal, + if (returnStr == null) Val.Null(pos.noOffset) else returnStr), "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, result)) ) } From 39c7bceb02203ea0ee9b6069dafcc153881487ba Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Thu, 19 Dec 2024 16:55:38 -0800 Subject: [PATCH 05/10] fix types --- sjsonnet/src/sjsonnet/StdRegex.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala index b7a33b08..1213fea0 100644 --- a/sjsonnet/src/sjsonnet/StdRegex.scala +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -10,7 +10,7 @@ object StdRegex { val compiledPattern = Platform.getPatternFromCache(pattern.asString) val matcher = compiledPattern.matcher(str.asString) var returnStr: Val = null - val captures = Array.newBuilder[Val.Str] + val captures = Array.newBuilder[Val] val groupCount = matcher.groupCount() while (matcher.find()) { if (returnStr == null) { @@ -48,7 +48,7 @@ object StdRegex { "captures" -> new Obj.ConstMember(true, Visibility.Normal, new Val.Arr(pos.noOffset, Array.empty[Lazy])) ) } else { - val captures = Array.newBuilder[Val.Str] + val captures = Array.newBuilder[Val] val groupCount = matcher.groupCount() for (i <- 0 to groupCount) { val m = matcher.group(i) From e101c10c2d2759f5b093b40860c1bb22dbe184fb Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Thu, 19 Dec 2024 22:37:02 -0800 Subject: [PATCH 06/10] Move to concurrenthashmap for the pattern cache --- sjsonnet/src-js/sjsonnet/Platform.scala | 4 +--- sjsonnet/src-jvm/sjsonnet/Platform.scala | 4 +--- sjsonnet/src-native/sjsonnet/Platform.scala | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git 
a/sjsonnet/src-js/sjsonnet/Platform.scala b/sjsonnet/src-js/sjsonnet/Platform.scala index f5cac641..280a7882 100644 --- a/sjsonnet/src-js/sjsonnet/Platform.scala +++ b/sjsonnet/src-js/sjsonnet/Platform.scala @@ -40,9 +40,7 @@ object Platform { throw new Exception("hashFile not implemented in Scala.js") } - private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { - override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 - } + private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern] def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) def regexQuote(s: String): String = Pattern.quote(s) diff --git a/sjsonnet/src-jvm/sjsonnet/Platform.scala b/sjsonnet/src-jvm/sjsonnet/Platform.scala index da9e4cba..9d7d6d27 100644 --- a/sjsonnet/src-jvm/sjsonnet/Platform.scala +++ b/sjsonnet/src-jvm/sjsonnet/Platform.scala @@ -111,9 +111,7 @@ object Platform { hash.getValue.toString } - private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { - override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 - } + private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern] def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) def regexQuote(s: String): String = Pattern.quote(s) diff --git a/sjsonnet/src-native/sjsonnet/Platform.scala b/sjsonnet/src-native/sjsonnet/Platform.scala index 7baaea2f..6bcfed33 100644 --- a/sjsonnet/src-native/sjsonnet/Platform.scala +++ b/sjsonnet/src-native/sjsonnet/Platform.scala @@ -53,9 +53,7 @@ object Platform { scala.io.Source.fromFile(file).mkString } - private val regexCache = new util.LinkedHashMap[String, Pattern](100, 0.75f, true) { - override def removeEldestEntry(eldest: util.Map.Entry[String, Pattern]): Boolean = size() > 100 - } + private val regexCache = new 
util.concurrent.ConcurrentHashMap[String, Pattern] def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) def regexQuote(s: String): String = Pattern.quote(s) From df59feef41c88477c87f11cc9e7e002c82ac8c41 Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Sat, 28 Dec 2024 20:00:19 -0800 Subject: [PATCH 07/10] Fix performance regression in stripChars --- sjsonnet/src/sjsonnet/Std.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index b35f3acf..fab31cbb 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -499,13 +499,13 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. right: Boolean, functionName: String ) extends Val.Builtin1(functionName, "str") { - private[this] val leftPattern = getLeadingPattern(chars) - private[this] val rightPattern = getTrailingPattern(chars) + private[this] val leftPattern = Platform.getPatternFromCache(getLeadingPattern(chars)) + private[this] val rightPattern = Platform.getPatternFromCache(getTrailingPattern(chars)) def evalRhs(str: Val, ev: EvalScope, pos: Position): Val = { var s = str.asString - if (right) s = Platform.getPatternFromCache(rightPattern).matcher(s).replaceAll("") - if (left) s = Platform.getPatternFromCache(leftPattern).matcher(s).replaceAll("") + if (right) s = rightPattern.matcher(s).replaceAll("") + if (left) s = leftPattern.matcher(s).replaceAll("") Val.Str(pos, s) } } From 599bcccc71f64bc77b65cfaee14633bcfe068a65 Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Mon, 30 Dec 2024 16:43:23 -0800 Subject: [PATCH 08/10] address comments --- sjsonnet/src-js/sjsonnet/Platform.scala | 3 +++ sjsonnet/src-native/sjsonnet/Platform.scala | 2 ++ sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala | 4 +++- sjsonnet/src/sjsonnet/StdRegex.scala | 2 +- sjsonnet/src/sjsonnet/YamlRenderer.scala | 3 ++- 
sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala | 2 +- 6 files changed, 12 insertions(+), 4 deletions(-) diff --git a/sjsonnet/src-js/sjsonnet/Platform.scala b/sjsonnet/src-js/sjsonnet/Platform.scala index 280a7882..c950f6d5 100644 --- a/sjsonnet/src-js/sjsonnet/Platform.scala +++ b/sjsonnet/src-js/sjsonnet/Platform.scala @@ -41,6 +41,9 @@ object Platform { } private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern] + + // Scala.js does not use RE2; see https://www.scala-js.org/doc/regular-expressions.html. + // Expect some differences in regex behavior compared to the JVM build. def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) def regexQuote(s: String): String = Pattern.quote(s) diff --git a/sjsonnet/src-native/sjsonnet/Platform.scala b/sjsonnet/src-native/sjsonnet/Platform.scala index 6bcfed33..7e3ac5d0 100644 --- a/sjsonnet/src-native/sjsonnet/Platform.scala +++ b/sjsonnet/src-native/sjsonnet/Platform.scala @@ -54,6 +54,8 @@ object Platform { } private val regexCache = new util.concurrent.ConcurrentHashMap[String, Pattern] + // Scala Native's java.util.regex is based on RE2, per https://scala-native.org/en/latest/lib/javalib.html#regular-expressions-java-util-regexp + // It should behave similarly to the JVM implementation. 
def getPatternFromCache(pat: String) : Pattern = regexCache.computeIfAbsent(pat, _ => Pattern.compile(pat)) def regexQuote(s: String): String = Pattern.quote(s) diff --git a/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala b/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala index de8b4f08..a2829fa2 100644 --- a/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala +++ b/sjsonnet/src/sjsonnet/PrettyYamlRenderer.scala @@ -1,8 +1,10 @@ package sjsonnet import java.io.{StringWriter, Writer} +import java.util.regex.Pattern import upickle.core.{ArrVisitor, ObjVisitor} +import fastparse.IndexedParserInput import scala.collection.mutable /** @@ -238,7 +240,7 @@ object PrettyYamlRenderer{ */ def writeBlockString(str: String, out: Writer, depth: Int, indent: Int, lineComment: String) = { val len = str.length() - val splits = Platform.getPatternFromCache("\n").split(str, -1) + val splits = YamlRenderer.newlinePattern.split(str, -1) val blockOffsetNumeral = if (str.charAt(0) != ' ') "" else indent val (blockStyle, dropRight) = (str.charAt(len - 1), if (len > 2) Some(str.charAt(len - 2)) else None) match{ diff --git a/sjsonnet/src/sjsonnet/StdRegex.scala b/sjsonnet/src/sjsonnet/StdRegex.scala index 1213fea0..18a355bb 100644 --- a/sjsonnet/src/sjsonnet/StdRegex.scala +++ b/sjsonnet/src/sjsonnet/StdRegex.scala @@ -73,7 +73,7 @@ object StdRegex { Val.Str(pos.noOffset, matcher.replaceAll(to.asString)) } }, - "regexReplace" -> new Val.Builtin3("regexGlobalReplace", "str", "pattern", "to") { + "regexReplace" -> new Val.Builtin3("regexReplace", "str", "pattern", "to") { override def evalRhs(str: Val, pattern: Val, to: Val, ev: EvalScope, pos: Position): Val = { val compiledPattern = Platform.getPatternFromCache(pattern.asString) val matcher = compiledPattern.matcher(str.asString) diff --git a/sjsonnet/src/sjsonnet/YamlRenderer.scala b/sjsonnet/src/sjsonnet/YamlRenderer.scala index 570689a6..f2e6ee0b 100644 --- a/sjsonnet/src/sjsonnet/YamlRenderer.scala +++ 
b/sjsonnet/src/sjsonnet/YamlRenderer.scala @@ -48,7 +48,7 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI elemBuilder.append('"') elemBuilder.append('"') } else if (s.charAt(len - 1) == '\n') { - val splits = Platform.getPatternFromCache("\n").split(s.toString) + val splits = YamlRenderer.newlinePattern.split(s.toString) elemBuilder.append('|') depth += 1 splits.foreach { split => @@ -170,6 +170,7 @@ class YamlRenderer(_out: StringWriter = new java.io.StringWriter(), indentArrayI } } object YamlRenderer{ + private[sjsonnet] val newlinePattern = Platform.getPatternFromCache("\n") private val safeYamlKeyPattern = Platform.getPatternFromCache("^[a-zA-Z0-9/._-]+$") private val yamlReserved = Set("true", "false", "null", "yes", "no", "on", "off", "y", "n", ".nan", "+.inf", "-.inf", ".inf", "null", "-", "---", "''") diff --git a/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala b/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala index 20861a54..e698d8b3 100644 --- a/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala +++ b/sjsonnet/test/src/sjsonnet/OldYamlRenderer.scala @@ -20,7 +20,7 @@ class OldYamlRenderer(out: StringWriter = new java.io.StringWriter(), indentArra val len = s.length() if (len == 0) out.append("\"\"") else if (s.charAt(len - 1) == '\n') { - val splits = Platform.getPatternFromCache("\n").split(s.toString) + val splits = YamlRenderer.newlinePattern.split(s.toString) out.append('|') depth += 1 splits.foreach { split => From 2d317f6e4f18fe2ea25d343256f938ae2ea79a94 Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Mon, 30 Dec 2024 16:45:50 -0800 Subject: [PATCH 09/10] Add regex functions to the "builtinnativefunctions" --- sjsonnet/src/sjsonnet/Std.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index fab31cbb..625ebcb4 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -19,7 +19,7 @@ class 
Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. private val emptyLazyArray = new Array[Lazy](0) private val leadingWhiteSpacePattern = Platform.getPatternFromCache("^[ \t\n\f\r\u0085\u00A0']+") private val trailingWhiteSpacePattern = Platform.getPatternFromCache("[ \t\n\f\r\u0085\u00A0']+$") - private val oldNativeFunctions = Map( + private val builtinNativeFunctions = Map( builtin("gzip", "v"){ (_, _, v: Val) => v match{ case Val.Str(_, value) => Platform.gzipString(value) @@ -44,9 +44,9 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. case x => Error.fail("Cannot xz encode " + x.prettyName) } }, - ) - require(oldNativeFunctions.forall(k => !additionalNativeFunctions.contains(k._1)), "Conflicting native functions") - private val nativeFunctions = oldNativeFunctions ++ additionalNativeFunctions ++ StdRegex.functions + ) ++ StdRegex.functions + require(builtinNativeFunctions.forall(k => !additionalNativeFunctions.contains(k._1)), "Conflicting native functions") + private val nativeFunctions = builtinNativeFunctions ++ additionalNativeFunctions private object AssertEqual extends Val.Builtin2("assertEqual", "a", "b") { def evalRhs(v1: Val, v2: Val, ev: EvalScope, pos: Position): Val = { @@ -1518,7 +1518,7 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. 
Error.fail("Native function " + name + " not found", pos)(ev) } }, - ) ++ oldNativeFunctions + ) ++ builtinNativeFunctions private def toSetArrOrString(args: Array[Val], idx: Int, pos: Position, ev: EvalScope) = { args(idx) match { From fb9f800224774b2efeaf0f21ce5281b55c57dbcc Mon Sep 17 00:00:00 2001 From: Stephen Amar Date: Mon, 30 Dec 2024 20:43:18 -0800 Subject: [PATCH 10/10] Fix specialization --- sjsonnet/src/sjsonnet/Std.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sjsonnet/src/sjsonnet/Std.scala b/sjsonnet/src/sjsonnet/Std.scala index 625ebcb4..31382d6d 100644 --- a/sjsonnet/src/sjsonnet/Std.scala +++ b/sjsonnet/src/sjsonnet/Std.scala @@ -476,8 +476,9 @@ class Std(private val additionalNativeFunctions: Map[String, Val.Builtin] = Map. case _ => null } private class SpecFrom(from: String) extends Val.Builtin2("strReplaceAll", "str", "to") { + private[this] val pattern = Platform.getPatternFromCache(from) def evalRhs(str: Val, to: Val, ev: EvalScope, pos: Position): Val = - Val.Str(pos, Platform.getPatternFromCache(from).matcher(str.asString).replaceAll(to.asString)) + Val.Str(pos, pattern.matcher(str.asString).replaceAll(to.asString)) } }