Add a unicode whitespace parser
This commit is contained in:
parent
d285a0badc
commit
b64fcfaa61
@ -25,6 +25,9 @@ type Result struct {
|
|||||||
// - A parser that consumed some input should advance state.Pos
|
// - A parser that consumed some input should advance state.Pos
|
||||||
type Parser func(*State) Result
|
type Parser func(*State) Result
|
||||||
|
|
||||||
|
// VoidParser is a parser variant that returns no Result but can still consume input by advancing the State.
|
||||||
|
type VoidParser func(*State)
|
||||||
|
|
||||||
// Parserish types are any type that can be turned into a Parser by Parsify
|
// Parserish types are any type that can be turned into a Parser by Parsify
|
||||||
// These currently include *Parser and string literals.
|
// These currently include *Parser and string literals.
|
||||||
//
|
//
|
||||||
@ -76,7 +79,7 @@ func ParsifyAll(parsers ...Parserish) []Parser {
|
|||||||
// WS will consume whitespace, it should only be needed when AutoWS is turned off
|
// WS will consume whitespace, it should only be needed when AutoWS is turned off
|
||||||
func WS() Parser {
|
func WS() Parser {
|
||||||
return NewParser("AutoWS", func(ps *State) Result {
|
return NewParser("AutoWS", func(ps *State) Result {
|
||||||
ps.WS()
|
ps.WS(ps)
|
||||||
return Result{}
|
return Result{}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -203,6 +203,15 @@ func TestAutoWS(t *testing.T) {
|
|||||||
require.Equal(t, "hello", result.Child[1].Token)
|
require.Equal(t, "hello", result.Child[1].Token)
|
||||||
require.Equal(t, "", ps.Get())
|
require.Equal(t, "", ps.Get())
|
||||||
})
|
})
|
||||||
|
|
||||||
|
t.Run("unicode whitespace", func(t *testing.T) {
|
||||||
|
ps := NewState(" \u202f hello")
|
||||||
|
ps.WS = UnicodeWhitespace
|
||||||
|
|
||||||
|
result := Exact("hello")(ps)
|
||||||
|
require.Equal(t, "hello", result.Token)
|
||||||
|
require.False(t, ps.Errored())
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
func runParser(input string, parser Parser) (Result, *State) {
|
func runParser(input string, parser Parser) (Result, *State) {
|
||||||
|
50
state.go
50
state.go
@ -2,6 +2,8 @@ package goparsify
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"unicode"
|
||||||
|
"unicode/utf8"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Error represents a parse error. These will often be set, the parser will back up a little and
|
// Error represents a parse error. These will often be set, the parser will back up a little and
|
||||||
@ -17,9 +19,6 @@ func (e Error) Pos() int { return e.pos }
|
|||||||
// Error satisfies the golang error interface
|
// Error satisfies the golang error interface
|
||||||
func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
|
func (e Error) Error() string { return fmt.Sprintf("offset %d: expected %s", e.pos, e.expected) }
|
||||||
|
|
||||||
// WSFunc matches a byte and returns true if it is whitespace
|
|
||||||
type WSFunc func(c byte) bool
|
|
||||||
|
|
||||||
// State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
|
// State is the current parse state. It is entirely public because parsers are expected to mutate it during the parse.
|
||||||
type State struct {
|
type State struct {
|
||||||
// The full input string
|
// The full input string
|
||||||
@ -30,21 +29,41 @@ type State struct {
|
|||||||
// in backtracking that it has been inlined to avoid allocations.
|
// in backtracking that it has been inlined to avoid allocations.
|
||||||
Error Error
|
Error Error
|
||||||
// Called to determine what to ignore when WS is called, or when AutoWS fires
|
// Called to determine what to ignore when WS is called, or when AutoWS fires
|
||||||
WSFunc WSFunc
|
WS VoidParser
|
||||||
NoAutoWS bool
|
NoAutoWS bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ASCIIWhitespace consumes a run of ASCII whitespace bytes starting at s.Pos. It is faster
|
||||||
|
// than the UnicodeWhitespace parser as it does not need to decode unicode runes.
// Only the bytes '\t', '\n', '\v', '\f', '\r' and ' ' are skipped; any other byte,
// including the first byte of a multi-byte Unicode space, ends the run.
// On return s.Pos indexes the first non-matching byte, or equals len(s.Input)
// when the rest of the input was whitespace.
|
||||||
|
func ASCIIWhitespace(s *State) {
|
||||||
|
// Scan byte by byte until the input is exhausted.
for s.Pos < len(s.Input) {
|
||||||
|
switch s.Input[s.Pos] {
|
||||||
|
case '\t', '\n', '\v', '\f', '\r', ' ':
|
||||||
|
// Whitespace byte: consume it and keep scanning.
s.Pos++
|
||||||
|
// Anything else terminates the whitespace run without being consumed.
default:
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// UnicodeWhitespace consumes a run of Unicode space characters starting at s.Pos. It's a little slower
|
||||||
|
// than the ASCII parser because it decodes and matches one rune at a time.
// A rune counts as whitespace when unicode.IsSpace reports true for it.
// On return s.Pos indexes the first non-space rune, or equals len(s.Input)
// when the rest of the input was whitespace.
|
||||||
|
func UnicodeWhitespace(s *State) {
|
||||||
|
for s.Pos < len(s.Input) {
|
||||||
|
// Decode the next rune from the remaining input; w is its width in bytes.
// Invalid UTF-8 decodes to utf8.RuneError, which is not a space, so it ends the run.
r, w := utf8.DecodeRuneInString(s.Get())
|
||||||
|
if !unicode.IsSpace(r) {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
// Advance by the full encoded width so Pos stays on a rune boundary.
s.Pos += w
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// NewState creates a new State from a string
|
// NewState creates a new State from a string
|
||||||
func NewState(input string) *State {
|
func NewState(input string) *State {
|
||||||
return &State{
|
return &State{
|
||||||
Input: input,
|
Input: input,
|
||||||
WSFunc: func(b byte) bool {
|
WS: ASCIIWhitespace,
|
||||||
switch b {
|
|
||||||
case '\t', '\n', '\v', '\f', '\r', ' ':
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,14 +77,7 @@ func (s *State) AutoWS() {
|
|||||||
if s.NoAutoWS {
|
if s.NoAutoWS {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
s.WS()
|
s.WS(s)
|
||||||
}
|
|
||||||
|
|
||||||
// WS consumes all whitespace and advances Pos.
|
|
||||||
func (s *State) WS() {
|
|
||||||
for s.Pos < len(s.Input) && s.WSFunc(s.Input[s.Pos]) {
|
|
||||||
s.Pos++
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get the remaining input.
|
// Get the remaining input.
|
||||||
|
Loading…
Reference in New Issue
Block a user