Add a unicode whitespace parser

This commit is contained in:
Adam Scarr 2017-08-09 22:14:27 +10:00
parent d285a0badc
commit b64fcfaa61
3 changed files with 44 additions and 20 deletions

View File

@ -25,6 +25,9 @@ type Result struct {
// - A parser that consumed some input should advance state.Pos // - A parser that consumed some input should advance state.Pos
type Parser func(*State) Result type Parser func(*State) Result
// VoidParser is a parser-like function that receives the parse State and
// returns nothing. It produces no Result, but it may still consume input
// by advancing the State it is given.
type VoidParser func(*State)
// Parserish types are any type that can be turned into a Parser by Parsify // Parserish types are any type that can be turned into a Parser by Parsify
// These currently include *Parser and string literals. // These currently include *Parser and string literals.
// //
@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser {
// WS will consume whitespace, it should only be needed when AutoWS is turned off // WS will consume whitespace, it should only be needed when AutoWS is turned off
func WS() Parser { func WS() Parser {
return NewParser("AutoWS", func(ps *State) Result { return NewParser("AutoWS", func(ps *State) Result {
ps.WS() ps.WS(ps)
return Result{} return Result{}
}) })
} }

View File

@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) {
require.Equal(t, "hello", result.Child[1].Token) require.Equal(t, "hello", result.Child[1].Token)
require.Equal(t, "", ps.Get()) require.Equal(t, "", ps.Get())
}) })
t.Run("unicode whitespace", func(t *testing.T) {
ps := NewState(" \u202f hello")
ps.WS = UnicodeWhitespace
result := Exact("hello")(ps)
require.Equal(t, "hello", result.Token)
require.False(t, ps.Errored())
})
} }
func runParser(input string, parser Parser) (Result, *State) { func runParser(input string, parser Parser) (Result, *State) {

View File

@ -2,6 +2,8 @@ package goparsify
import ( import (
"fmt" "fmt"
"unicode"
"unicode/utf8"
) )
// Error represents a parse error. These will often be set, the parser will back up a little and // Error represents a parse error. These will often be set, the parser will back up a little and
@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos }
// Error satisfies the golang error interface // Error satisfies the golang error interface
func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) } func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
// WSFunc matches a byte and returns true if it is whitespace
type WSFunc func(c byte) bool
// State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse. // State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
type State struct { type State struct {
// The full input string // The full input string
@ -30,21 +29,41 @@ type State struct {
// in backtracking that it has been inlined to avoid allocations. // in backtracking that it has been inlined to avoid allocations.
Error Error Error Error
// Called to determine what to ignore when WS is called, or when AutoWS fires // Called to determine what to ignore when WS is called, or when AutoWS fires
WSFunc WSFunc WS VoidParser
NoAutoWS bool NoAutoWS bool
} }
// ASCIIWhitespace advances s.Pos past any run of ASCII whitespace bytes
// (space, tab, newline, vertical tab, form feed, carriage return) found at
// the current position. It stops at the first other byte or at the end of
// the input. It looks at one byte at a time and never decodes runes, so
// multi-byte Unicode spaces are not skipped; use UnicodeWhitespace for those.
func ASCIIWhitespace(s *State) {
	for ; s.Pos < len(s.Input); s.Pos++ {
		switch s.Input[s.Pos] {
		case ' ', '\t', '\n', '\v', '\f', '\r':
			// Still inside the whitespace run: step to the next byte.
			continue
		}
		// First non-whitespace byte: leave Pos pointing at it.
		return
	}
}
// UnicodeWhitespace advances s.Pos past any run of Unicode space characters
// (as reported by unicode.IsSpace) found at the current position. It decodes
// one rune per step, so it does more work per character than ASCIIWhitespace,
// and it advances by the encoded width of each rune it skips. It stops at the
// first rune that is not a space, or at the end of the input.
func UnicodeWhitespace(s *State) {
	for s.Pos < len(s.Input) {
		char, size := utf8.DecodeRuneInString(s.Get())
		if unicode.IsSpace(char) {
			// Skip the whole encoded rune, not just one byte.
			s.Pos += size
			continue
		}
		break
	}
}
// NewState creates a new State from a string // NewState creates a new State from a string
func NewState(input string) *State { func NewState(input string) *State {
return &State{ return &State{
Input: input, Input: input,
WSFunc: func(b byte) bool { WS: ASCIIWhitespace,
switch b {
case '\t', '\n', '\v', '\f', '\r', ' ':
return true
}
return false
},
} }
} }
@ -58,14 +77,7 @@ func (s *State) AutoWS() {
if s.NoAutoWS { if s.NoAutoWS {
return return
} }
s.WS() s.WS(s)
}
// WS consumes all whitespace and advances Pos.
func (s *State) WS() {
for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) {
s.Pos++
}
} }
// Get the remaining input. // Get the remaining input.