summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- parser.go 5
-rw-r--r-- parser_test.go 9
-rw-r--r-- state.go 50
3 files changed, 44 insertions, 20 deletions
diff --git a/parser.go b/parser.go
index f084af4..7590ffa 100644
--- a/parser.go
+++ b/parser.go
@@ -25,6 +25,9 @@ type Result struct {
// - A parser that consumed some input should advance state.Pos
type Parser func(*State) Result
+// VoidParser is a special type of parser that returns no Result but can still consume input.
+type VoidParser func(*State)
+
// Parserish types are any type that can be turned into a Parser by Parsify
// These currently include *Parser and string literals.
//
@@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser {
// WS will consume whitespace, it should only be needed when AutoWS is turned off
func WS() Parser {
return NewParser("AutoWS", func(ps *State) Result {
- ps.WS()
+ ps.WS(ps)
return Result{}
})
}
diff --git a/parser_test.go b/parser_test.go
index 09cf7f4..bc2867e 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) {
require.Equal(t, "hello", result.Child[1].Token)
require.Equal(t, "", ps.Get())
})
+
+ t.Run("unicode whitespace", func(t *testing.T) {
+ ps := NewState(" \u202f hello")
+ ps.WS = UnicodeWhitespace
+
+ result := Exact("hello")(ps)
+ require.Equal(t, "hello", result.Token)
+ require.False(t, ps.Errored())
+ })
}
func runParser(input string, parser Parser) (Result, *State) {
diff --git a/state.go b/state.go
index 1860bc4..3c7de43 100644
--- a/state.go
+++ b/state.go
@@ -2,6 +2,8 @@ package goparsify
import (
"fmt"
+ "unicode"
+ "unicode/utf8"
)
// Error represents a parse error. These will often be set, the parser will back up a little and
@@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos }
// Error satisfies the golang error interface
func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
-// WSFunc matches a byte and returns true if it is whitespace
-type WSFunc func(c byte) bool
-
// State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
type State struct {
// The full input string
@@ -30,21 +29,41 @@ type State struct {
// in backtracking that it has been inlined to avoid allocations.
Error Error
// Called to determine what to ignore when WS is called, or when AutoWS fires
- WSFunc WSFunc
+ WS VoidParser
NoAutoWS bool
}
+// ASCIIWhitespace matches any of the standard whitespace characters. It is faster
+// than the UnicodeWhitespace parser as it does not need to decode unicode runes.
+func ASCIIWhitespace(s *State) {
+ for s.Pos < len(s.Input) {
+ switch s.Input[s.Pos] {
+ case '\t', '\n', '\v', '\f', '\r', ' ':
+ s.Pos++
+ default:
+ return
+ }
+ }
+}
+
+// UnicodeWhitespace matches any unicode space character. It's a little slower
+// than the ASCII parser because it decodes and matches one rune at a time.
+func UnicodeWhitespace(s *State) {
+ for s.Pos < len(s.Input) {
+ r, w := utf8.DecodeRuneInString(s.Get())
+ if !unicode.IsSpace(r) {
+ return
+ }
+ s.Pos += w
+ }
+
+}
+
// NewState creates a new State from a string
func NewState(input string) *State {
return &State{
Input: input,
- WSFunc: func(b byte) bool {
- switch b {
- case '\t', '\n', '\v', '\f', '\r', ' ':
- return true
- }
- return false
- },
+ WS: ASCIIWhitespace,
}
}
@@ -58,14 +77,7 @@ func (s *State) AutoWS() {
if s.NoAutoWS {
return
}
- s.WS()
-}
-
-// WS consumes all whitespace and advances Pos.
-func (s *State) WS() {
- for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) {
- s.Pos++
- }
+ s.WS(s)
}
// Get the remaining input.