Add a unicode whitespace parser

2017-08-09 22:14:27 +10:00 · 2017-08-09 22:14:27 +10:00 · b64fcfaa61
commit b64fcfaa61
parent d285a0badc
3 changed files with 44 additions and 20 deletions
--- a/parser.go
+++ b/parser.go
@ -25,6 +25,9 @@ type Result struct {
 //  - A parser that consumed some input should advance state.Pos
 type Parser func(*State) Result
 // VoidParser is a special type of parser that never returns anything but can still consume input
 type VoidParser func(*State)
 // Parserish types are any type that can be turned into a Parser by Parsify
 // These currently include *Parser and string literals.
 //
@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser {
 // WS will consume whitespace, it should only be needed when AutoWS is turned off
 func WS() Parser {
 	return NewParser("AutoWS", func(ps *State) Result {
-		ps.WS()
+		ps.WS(ps)
 		return Result{}
 	})
 }
--- a/parser_test.go
+++ b/parser_test.go
@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) {
 		require.Equal(t, "hello", result.Child[1].Token)
 		require.Equal(t, "", ps.Get())
 	})
 	t.Run("unicode whitespace", func(t *testing.T) {
 		ps := NewState(" \u202f hello")
 		ps.WS = UnicodeWhitespace
 		result := Exact("hello")(ps)
 		require.Equal(t, "hello", result.Token)
 		require.False(t, ps.Errored())
 	})
 }
 func runParser(input string, parser Parser) (Result, *State) {
--- a/state.go
+++ b/state.go
@ -2,6 +2,8 @@ package goparsify
 import (
 	"fmt"
 	"unicode"
 	"unicode/utf8"
 )
 // Error represents a parse error. These will often be set, the parser will back up a little and
@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos }
 // Error satisfies the golang error interface
 func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
 // WSFunc matches a byte and returns true if it is whitespace
 type WSFunc func(c byte) bool
 // State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
 type State struct {
 	// The full input string
@ -30,21 +29,41 @@ type State struct {
 	// in backtracking that it has been inlined to avoid allocations.
 	Error Error
 	// Called to determine what to ignore when WS is called, or when AutoWS fires
-	WSFunc   WSFunc
+	WS       VoidParser
 	NoAutoWS bool
 }
 // ASCIIWhitespace matches any of the standard whitespace characters. It is faster
 // than the UnicodeWhitespace parser as it does not need to decode unicode runes.
 func ASCIIWhitespace(s *State) {
 	for s.Pos < len(s.Input) {
 		switch s.Input[s.Pos] {
 		case '\t', '\n', '\v', '\f', '\r', ' ':
 			s.Pos++
 		default:
 			return
 		}
 	}
 }
 // UnicodeWhitespace matches any unicode space character. Its a little slower
 // than the ascii parser because it matches a rune at a time.
 func UnicodeWhitespace(s *State) {
 	for s.Pos < len(s.Input) {
 		r, w := utf8.DecodeRuneInString(s.Get())
 		if !unicode.IsSpace(r) {
 			return
 		}
 		s.Pos += w
 	}
 }
 // NewState creates a new State from a string
 func NewState(input string) *State {
 	return &State{
 		Input: input,
-		WSFunc: func(b byte) bool {
+		WS:    ASCIIWhitespace,
 			switch b {
 			case '\t', '\n', '\v', '\f', '\r', ' ':
 				return true
 			}
 			return false
 		},
 	}
 }
@ -58,14 +77,7 @@ func (s *State) AutoWS() {
 	if s.NoAutoWS {
 		return
 	}
-	s.WS()
+	s.WS(s)
 }
 // WS consumes all whitespace and advances Pos.
 func (s *State) WS() {
 	for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) {
 		s.Pos++
 	}
 }
 // Get the remaining input.