Skip to content

Commit 1f2ec92

Browse files
committed
Add support for isBlockName character classes
1 parent f085ae3 commit 1f2ec92

File tree

4 files changed

+100
-30
lines changed

4 files changed

+100
-30
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# Changelog for Unicode Set 0.12.0
2+
3+
This is the changelog for Unicode Set 0.12.0 released on February 23rd, 2021. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_set/tags)
4+
5+
## Enhancements
6+
7+
* Adds support for the "isBlockName" Perl and POSIX regex syntax. It is used in a regex as `[[:isLatin1:]]` or `\p{isLatin1}`, or in the inverse forms `[[:^isLatin1:]]` and `\P{isLatin1}`.
8+
19
# Changelog for Unicode Set 0.11.0
210

311
This is the changelog for Unicode Set 0.11.0 released on October 5th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_set/tags)

lib/set/parser.ex

Lines changed: 51 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,11 @@ defmodule Unicode.Set.Parser do
4848
|> repeat(set_operator() |> parsec(:one_set))
4949
end
5050

51-
52-
@debug_functions []
51+
@debug_functions [:reduce_property]
5352

5453
defmacrop tracer(step, a) do
5554
{caller, _} = __CALLER__.function
56-
57-
if Mix.env() == :dev and caller in @debug_functions do
55+
if Mix.env() in [:dev] and caller in @debug_functions do
5856
quote do
5957
IO.inspect("#{unquote(caller)}", label: "Step #{unquote(step)}")
6058
IO.inspect(unquote(a), label: "argument")
@@ -205,8 +203,7 @@ defmodule Unicode.Set.Parser do
205203
def posix_property do
206204
ignore(string("[:"))
207205
|> optional(ascii_char([?^]) |> replace(:not))
208-
|> concat(property_name())
209-
|> optional(operator() |> ignore(optional(whitespace())) |> concat(value_2()))
206+
|> property_expression([{:not, ?:}])
210207
|> ignore(string(":]"))
211208
|> label("posix property")
212209
end
@@ -216,8 +213,7 @@ defmodule Unicode.Set.Parser do
216213
ignore(ascii_char([?\\]))
217214
|> choice([ascii_char([?P]) |> replace(:not), ignore(ascii_char([?p]))])
218215
|> ignore(ascii_char([?{]))
219-
|> concat(property_name())
220-
|> optional(operator() |> ignore(optional(whitespace())) |> concat(value_1()))
216+
|> property_expression([{:not, ?}}])
221217
|> ignore(ascii_char([?}]))
222218
|> label("perl property")
223219
end
@@ -231,16 +227,44 @@ defmodule Unicode.Set.Parser do
231227
end
232228

233229
@doc false
234-
def reduce_property(_rest, [value, :in, property, :not], context, _line, _offset) do
230+
def property_expression(combinator \\ empty(), fence) do
231+
combinator
232+
|> choice([
233+
is_block()
234+
|> ignore(optional(whitespace()))
235+
|> concat(value(fence)),
236+
property_name()
237+
|> optional(operator() |> ignore(optional(whitespace())) |> concat(value(fence)))
238+
])
239+
end
240+
241+
@doc false
242+
def reduce_property(_rest, [value, "block" = property], context, _line, _offset) do
243+
tracer(0, [value, :in, property])
244+
case fetch_property!(property, value) do
245+
%{parsed: parsed} -> {[{:in, parsed}], context}
246+
ranges -> {[{:in, ranges}], context}
247+
end
248+
end
249+
250+
def reduce_property(_rest, [value, "block" = property, :not], context, _line, _offset) do
235251
tracer(1, [value, :in, property, :not])
252+
case fetch_property!(property, value) do
253+
%{parsed: parsed} -> {[{:not_in, parsed}], context}
254+
ranges -> {[{:not_in, ranges}], context}
255+
end
256+
end
257+
258+
# Reduces a negated `property=value` expression, i.e. an accumulator of the
# shape `[value, :in, property, :not]`.
#
# The leading negation combined with the `:in` operator yields a `:not_in`
# entry. The result is `{acc, context}` where `acc` is a one-element list
# holding `{:not_in, data}`; `data` is either the `:parsed` field of the map
# returned by `fetch_property!/2` or, for any other return value, that value
# itself (bound as `ranges`).
def reduce_property(_rest, [value, :in, property, :not], context, _line, _offset) do
  tracer(2, [value, :in, property, :not])

  case fetch_property!(property, value) do
    # Wrapped in a list so that both branches return the same accumulator
    # shape, matching the other `reduce_property/5` clauses that emit
    # `[{operator, parsed}]`. Previously this branch returned a bare tuple.
    # NOTE(review): presumably the parser combinator expects the accumulator
    # to be a list — confirm against the NimbleParsec version in use.
    %{parsed: parsed} -> {[{:not_in, parsed}], context}
    ranges -> {[{:not_in, ranges}], context}
  end
end
241265

242266
def reduce_property(_rest, [value, :not_in, property, :not], context, _line, _offset) do
243-
tracer(2, [value, :not_in, property, :not])
267+
tracer(3, [value, :not_in, property, :not])
244268
case fetch_property!(property, value) do
245269
%{parsed: parsed} -> {parsed, context}
246270
ranges -> {[{:in, ranges}], context}
@@ -249,15 +273,15 @@ defmodule Unicode.Set.Parser do
249273

250274
def reduce_property(_rest, [value, operator, property], context, _line, _offset)
251275
when operator in [:in, :not_in] do
252-
tracer(3, [value, operator, property])
276+
tracer(4, [value, operator, property])
253277
case fetch_property!(property, value) do
254278
%{parsed: parsed} -> {[{operator, parsed}], context}
255279
ranges -> {[{operator, ranges}], context}
256280
end
257281
end
258282

259283
def reduce_property(_rest, [value, :not], context, _line, _offset) do
260-
tracer(4, [value, :not])
284+
tracer(5, [value, :not])
261285
case fetch_property!(:script_or_category, value) do
262286
%{parsed: [{:not_in, parsed}]} -> {[{:in, parsed}], context}
263287
%{parsed: [{:in, parsed}]} -> {[{:not_in, parsed}], context}
@@ -267,7 +291,7 @@ defmodule Unicode.Set.Parser do
267291
end
268292

269293
def reduce_property(_rest, [value], context, _line, _offset) do
270-
tracer(5, [value])
294+
tracer(6, [value])
271295
case fetch_property!(:script_or_category, value) do
272296
%{parsed: [{:not_in, parsed}]} -> {[{:not_in, parsed}], context}
273297
%{parsed: [{:in, parsed}]} -> {[{:in, parsed}], context}
@@ -276,35 +300,33 @@ defmodule Unicode.Set.Parser do
276300
end
277301
end
278302

303+
@doc false
304+
def is_block do
305+
choice([
306+
string("is") |> replace("block"),
307+
string("Is") |> replace("block"),
308+
string("iS") |> replace("block"),
309+
string("IS") |> replace("block")
310+
])
311+
|> label("property name")
312+
end
313+
279314
@doc false
280315
@alphanumeric [?a..?z, ?A..?Z, ?0..?9]
281316
def property_name do
282-
ignore(optional(whitespace()))
283-
|> ascii_char(@alphanumeric)
317+
ascii_char(@alphanumeric)
284318
|> repeat(ascii_char(@alphanumeric ++ [?_, ?\s]))
285319
|> ignore(optional(whitespace()))
286320
|> reduce(:to_lower_string)
287321
|> label("property name")
288322
end
289323

290324
@doc false
291-
def value_1 do
292-
times(
293-
choice([
294-
ignore(ascii_char([?\\])) |> concat(quoted()),
295-
ascii_char([{:not, ?}}])
296-
]),
297-
min: 1
298-
)
299-
|> reduce(:to_lower_string)
300-
end
301-
302-
@doc false
303-
def value_2 do
325+
def value(gate) do
304326
times(
305327
choice([
306328
ignore(ascii_char([?\\])) |> concat(quoted()),
307-
ascii_char([{:not, ?:}])
329+
ascii_char(gate)
308330
]),
309331
min: 1
310332
)

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
defmodule UnicodeSet.MixProject do
22
use Mix.Project
33

4-
@version "0.11.0"
4+
@version "0.12.0"
55

66
def project do
77
[

test/unicode_set_test.exs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,4 +261,44 @@ defmodule UnicodeSetTest do
261261
Unicode.Set.compile_pattern("[^{ab}]")
262262
end
263263
end
264+
265+
test "[[:IsBasicLatin:]] property syntax" do
266+
basic_latin = Unicode.Regex.compile!("[[:block=BasicLatin:]]")
267+
assert Unicode.Regex.compile!("[[:IsBasicLatin:]]") == basic_latin
268+
assert Unicode.Regex.compile!("[[:Is BasicLatin:]]") == basic_latin
269+
assert Unicode.Regex.compile!("[[:IsBasic_Latin:]]") == basic_latin
270+
assert Unicode.Regex.compile!("[[:Is Basic_Latin:]]") == basic_latin
271+
assert Unicode.Regex.compile!("[[:Is Basic Latin:]]") == basic_latin
272+
assert Unicode.Regex.compile!("[[:is basic latin:]]") == basic_latin
273+
end
274+
275+
test "[[:^IsBasicLatin:]] property syntax" do
276+
basic_latin = Unicode.Regex.compile!("[[:^block=BasicLatin:]]")
277+
assert Unicode.Regex.compile!("[[:^IsBasicLatin:]]") == basic_latin
278+
assert Unicode.Regex.compile!("[[:^Is BasicLatin:]]") == basic_latin
279+
assert Unicode.Regex.compile!("[[:^IsBasic_Latin:]]") == basic_latin
280+
assert Unicode.Regex.compile!("[[:^Is Basic_Latin:]]") == basic_latin
281+
assert Unicode.Regex.compile!("[[:^Is Basic Latin:]]") == basic_latin
282+
assert Unicode.Regex.compile!("[[:^is basic latin:]]") == basic_latin
283+
end
284+
285+
test "\\p{isBlockName} property syntax" do
286+
basic_latin = Unicode.Regex.compile!("[[:block=BasicLatin:]]")
287+
assert Unicode.Regex.compile!("\\p{IsBasicLatin}") == basic_latin
288+
assert Unicode.Regex.compile!("\\p{Is BasicLatin}") == basic_latin
289+
assert Unicode.Regex.compile!("\\p{IsBasic_Latin}") == basic_latin
290+
assert Unicode.Regex.compile!("\\p{Is Basic_Latin}") == basic_latin
291+
assert Unicode.Regex.compile!("\\p{Is Basic Latin}") == basic_latin
292+
assert Unicode.Regex.compile!("\\p{is basic latin}") == basic_latin
293+
end
294+
295+
test "\\P{isBlockName} property syntax" do
296+
basic_latin = Unicode.Regex.compile!("[[:^block=BasicLatin:]]")
297+
assert Unicode.Regex.compile!("\\P{IsBasicLatin}") == basic_latin
298+
assert Unicode.Regex.compile!("\\P{Is BasicLatin}") == basic_latin
299+
assert Unicode.Regex.compile!("\\P{IsBasic_Latin}") == basic_latin
300+
assert Unicode.Regex.compile!("\\P{Is Basic_Latin}") == basic_latin
301+
assert Unicode.Regex.compile!("\\P{Is Basic Latin}") == basic_latin
302+
assert Unicode.Regex.compile!("\\P{is basic latin}") == basic_latin
303+
end
264304
end

0 commit comments

Comments
 (0)