Add a unicode whitespace parser
This commit is contained in:
parent
d285a0badc
commit
b64fcfaa61
@ -25,6 +25,9 @@ type Result struct {
|
||||
// - A parser that consumed some input should advance state.Pos
|
||||
type Parser func(*State) Result
|
||||
|
||||
// VoidParser is a special type of parser that never returns anything but can still consume input.
|
||||
type VoidParser func(*State)
|
||||
|
||||
// Parserish types are any type that can be turned into a Parser by Parsify
|
||||
// These currently include *Parser and string literals.
|
||||
//
|
||||
@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser {
|
||||
// WS will consume whitespace, it should only be needed when AutoWS is turned off
|
||||
func WS() Parser {
|
||||
return NewParser("AutoWS", func(ps *State) Result {
|
||||
ps.WS()
|
||||
ps.WS(ps)
|
||||
return Result{}
|
||||
})
|
||||
}
|
||||
|
@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) {
|
||||
require.Equal(t, "hello", result.Child[1].Token)
|
||||
require.Equal(t, "", ps.Get())
|
||||
})
|
||||
|
||||
t.Run("unicode whitespace", func(t *testing.T) {
|
||||
ps := NewState(" \u202f hello")
|
||||
ps.WS = UnicodeWhitespace
|
||||
|
||||
result := Exact("hello")(ps)
|
||||
require.Equal(t, "hello", result.Token)
|
||||
require.False(t, ps.Errored())
|
||||
})
|
||||
}
|
||||
|
||||
func runParser(input string, parser Parser) (Result, *State) {
|
||||
|
50
state.go
50
state.go
@ -2,6 +2,8 @@ package goparsify
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Error represents a parse error. These will often be set, the parser will back up a little and
|
||||
@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos }
|
||||
// Error satisfies the golang error interface
|
||||
func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
|
||||
|
||||
// WSFunc matches a byte and returns true if it is whitespace
|
||||
type WSFunc func(c byte) bool
|
||||
|
||||
// State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
|
||||
type State struct {
|
||||
// The full input string
|
||||
@ -30,21 +29,41 @@ type State struct {
|
||||
// in backtracking that it has been inlined to avoid allocations.
|
||||
Error Error
|
||||
// Called to determine what to ignore when WS is called, or when AutoWS fires
|
||||
WSFunc WSFunc
|
||||
WS VoidParser
|
||||
NoAutoWS bool
|
||||
}
|
||||
|
||||
// ASCIIWhitespace matches any of the standard whitespace characters. It is faster
|
||||
// than the UnicodeWhitespace parser as it does not need to decode unicode runes.
|
||||
func ASCIIWhitespace(s *State) {
|
||||
for s.Pos < len(s.Input) {
|
||||
switch s.Input[s.Pos] {
|
||||
case '\t', '\n', '\v', '\f', '\r', ' ':
|
||||
s.Pos++
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// UnicodeWhitespace matches any unicode space character. Its a little slower
|
||||
// than the ascii parser because it matches a rune at a time.
|
||||
func UnicodeWhitespace(s *State) {
|
||||
for s.Pos < len(s.Input) {
|
||||
r, w := utf8.DecodeRuneInString(s.Get())
|
||||
if !unicode.IsSpace(r) {
|
||||
return
|
||||
}
|
||||
s.Pos += w
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// NewState creates a new State from a string
|
||||
func NewState(input string) *State {
|
||||
return &State{
|
||||
Input: input,
|
||||
WSFunc: func(b byte) bool {
|
||||
switch b {
|
||||
case '\t', '\n', '\v', '\f', '\r', ' ':
|
||||
return true
|
||||
}
|
||||
return false
|
||||
},
|
||||
WS: ASCIIWhitespace,
|
||||
}
|
||||
}
|
||||
|
||||
@ -58,14 +77,7 @@ func (s *State) AutoWS() {
|
||||
if s.NoAutoWS {
|
||||
return
|
||||
}
|
||||
s.WS()
|
||||
}
|
||||
|
||||
// WS consumes all whitespace and advances Pos.
|
||||
func (s *State) WS() {
|
||||
for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) {
|
||||
s.Pos++
|
||||
}
|
||||
s.WS(s)
|
||||
}
|
||||
|
||||
// Get the remaining input.
|
||||
|
Loading…
Reference in New Issue
Block a user