From b64fcfaa6115eb4a5f65c12d37f95d842136ae35 Mon Sep 17 00:00:00 2001 From: Adam Scarr Date: Wed, 9 Aug 2017 22:14:27 +1000 Subject: [PATCH] Add a unicode whitespace parser --- parser.go | 5 ++++- parser_test.go | 9 +++++++++ state.go | 50 +++++++++++++++++++++++++++++++------------------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/parser.go b/parser.go index f084af4..7590ffa 100644 --- a/parser.go +++ b/parser.go @@ -25,6 +25,9 @@ type Result struct { // - A parser that consumed some input should advance state.Pos type Parser func(*State) Result +// VoidParser is a special type of parser that never returns anything but can still consume input +type VoidParser func(*State) + // Parserish types are any type that can be turned into a Parser by Parsify // These currently include *Parser and string literals. // @@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser { // WS will consume whitespace, it should only be needed when AutoWS is turned off func WS() Parser { return NewParser("AutoWS", func(ps *State) Result { - ps.WS() + ps.WS(ps) return Result{} }) } diff --git a/parser_test.go b/parser_test.go index 09cf7f4..bc2867e 100644 --- a/parser_test.go +++ b/parser_test.go @@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) { require.Equal(t, "hello", result.Child[1].Token) require.Equal(t, "", ps.Get()) }) + + t.Run("unicode whitespace", func(t *testing.T) { + ps := NewState(" \u202f hello") + ps.WS = UnicodeWhitespace + + result := Exact("hello")(ps) + require.Equal(t, "hello", result.Token) + require.False(t, ps.Errored()) + }) } func runParser(input string, parser Parser) (Result, *State) { diff --git a/state.go b/state.go index 1860bc4..3c7de43 100644 --- a/state.go +++ b/state.go @@ -2,6 +2,8 @@ package goparsify import ( "fmt" + "unicode" + "unicode/utf8" ) // Error represents a parse error. 
These will often be set, the parser will back up a little and @@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos } // Error satisfies the golang error interface func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) } -// WSFunc matches a byte and returns true if it is whitespace -type WSFunc func(c byte) bool - // State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse. type State struct { // The full input string @@ -30,21 +29,41 @@ type State struct { // in backtracking that it has been inlined to avoid allocations. Error Error // Called to determine what to ignore when WS is called, or when AutoWS fires - WSFunc WSFunc + WS VoidParser NoAutoWS bool } +// ASCIIWhitespace matches any of the standard whitespace characters. It is faster +// than the UnicodeWhitespace parser as it does not need to decode unicode runes. +func ASCIIWhitespace(s *State) { + for s.Pos < len(s.Input) { + switch s.Input[s.Pos] { + case '\t', '\n', '\v', '\f', '\r', ' ': + s.Pos++ + default: + return + } + } +} + +// UnicodeWhitespace matches any unicode space character. It's a little slower +// than the ascii parser because it matches a rune at a time. +func UnicodeWhitespace(s *State) { + for s.Pos < len(s.Input) { + r, w := utf8.DecodeRuneInString(s.Get()) + if !unicode.IsSpace(r) { + return + } + s.Pos += w + } + +} + // NewState creates a new State from a string func NewState(input string) *State { return &State{ Input: input, - WSFunc: func(b byte) bool { - switch b { - case '\t', '\n', '\v', '\f', '\r', ' ': - return true - } - return false - }, + WS: ASCIIWhitespace, } } @@ -58,14 +77,7 @@ func (s *State) AutoWS() { if s.NoAutoWS { return } - s.WS() -} - -// WS consumes all whitespace and advances Pos. -func (s *State) WS() { - for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) { - s.Pos++ - } + s.WS(s) } // Get the remaining input.