2017-08-07 12:07:29 +02:00
|
|
|
package goparsify
|
2017-08-06 06:31:35 +02:00
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2017-08-09 13:41:57 +02:00
|
|
|
"regexp"
|
2017-08-06 07:43:23 +02:00
|
|
|
"strings"
|
|
|
|
"unicode/utf8"
|
2017-08-06 06:31:35 +02:00
|
|
|
)
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Result is the output of a parser. Usually only one of its fields will be set, so it should be
|
|
|
|
// thought of more as a union type. Having it avoids interface{} littered all through the parsing
|
|
|
|
// code and makes it easy to do the two most common operations: getting a token and finding a child.
|
|
|
|
type Result struct {
|
2017-08-08 15:11:47 +02:00
|
|
|
Token string // matched text; the leaf parsers in this file (Exact, Regex, Chars) set this
|
2017-08-09 13:18:14 +02:00
|
|
|
Child []Result // nested results; presumably filled in by combinators defined elsewhere - confirm
|
2017-08-08 15:11:47 +02:00
|
|
|
Result interface{} // arbitrary value attached to the result; this is what Run hands back to its caller
|
2017-08-07 10:25:23 +02:00
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Parser is the workhorse of goparsify. A parser takes a State and returns a Result, consuming
|
|
|
|
// some of the State's input in the process.
|
|
|
|
// Because the State is shared between parsers, there are a few rules that must be followed:
|
|
|
|
//  - A parser that errors must set state.Error.
|
|
|
|
//  - A parser that errors must not change state.Pos.
|
|
|
|
//  - A parser that consumed some input should advance state.Pos.
|
|
|
|
type Parser func(*State) Result
|
2017-08-06 06:31:35 +02:00
|
|
|
|
2017-08-13 04:56:46 +02:00
|
|
|
// Map is the method form of the package-level Map function: p.Map(f) is shorthand for Map(p, f).
|
|
|
|
func (p Parser) Map(f func(n Result) Result) Parser {
|
|
|
|
return Map(p, f)
|
|
|
|
}
|
|
|
|
|
2017-08-09 14:14:27 +02:00
|
|
|
// VoidParser is a special type of parser that never returns a Result but can still consume input.
// Run accepts one as its optional whitespace handler and stores it in State.WS.
|
|
|
|
type VoidParser func(*State)
|
|
|
|
|
2017-08-06 06:31:35 +02:00
|
|
|
// Parserish types are any type that can be turned into a Parser by Parsify.
|
|
|
|
// These currently include Parser, func(*State) Result, *Parser and string literals.
|
|
|
|
//
|
|
|
|
// This makes recursive grammars cleaner and allows string literals to be used directly in most contexts.
|
|
|
|
// For example, matching balanced parens:
|
2017-08-10 16:54:27 +02:00
|
|
|
//   var group Parser
|
|
|
|
//   group = Seq("(", Maybe(&group), ")")
|
2017-08-06 06:31:35 +02:00
|
|
|
// vs
|
2017-08-10 16:54:27 +02:00
|
|
|
//   var group ParserPtr{}
|
|
|
|
//   group.P = Seq(Exact("("), Maybe(group.Parse), Exact(")"))
|
2017-08-06 06:31:35 +02:00
|
|
|
type Parserish interface{}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Parsify takes a Parserish and makes a Parser out of it. It should be called by
|
|
|
|
// any Parser that accepts a Parser as an argument. It should never be called during
|
|
|
|
// instead call it during parser creation so there is no runtime cost.
|
|
|
|
//
|
|
|
|
// See Parserish for details.
|
2017-08-06 06:31:35 +02:00
|
|
|
func Parsify(p Parserish) Parser {
|
|
|
|
switch p := p.(type) {
|
2017-08-09 13:18:14 +02:00
|
|
|
case func(*State) Result:
|
2017-08-10 13:04:14 +02:00
|
|
|
return p
|
2017-08-06 06:31:35 +02:00
|
|
|
case Parser:
|
|
|
|
return p
|
|
|
|
case *Parser:
|
|
|
|
// Todo: Maybe capture this stack and on nil show it? Is there a good error library to do this?
|
2017-08-09 13:18:14 +02:00
|
|
|
return func(ptr *State) Result {
|
2017-08-06 06:31:35 +02:00
|
|
|
return (*p)(ptr)
|
|
|
|
}
|
|
|
|
case string:
|
|
|
|
return Exact(p)
|
|
|
|
default:
|
|
|
|
panic(fmt.Errorf("cant turn a `%T` into a parser", p))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// ParsifyAll calls Parsify on all parsers
|
2017-08-06 06:31:35 +02:00
|
|
|
func ParsifyAll(parsers ...Parserish) []Parser {
|
|
|
|
ret := make([]Parser, len(parsers))
|
|
|
|
for i, parser := range parsers {
|
|
|
|
ret[i] = Parsify(parser)
|
|
|
|
}
|
|
|
|
return ret
|
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Run applies some input to a parser and returns the result, failing if the input isnt fully consumed.
|
|
|
|
// It is a convenience method for the most common way to invoke a parser.
|
2017-08-10 14:06:08 +02:00
|
|
|
func Run(parser Parserish, input string, ws ...VoidParser) (result interface{}, err error) {
|
2017-08-06 07:43:23 +02:00
|
|
|
p := Parsify(parser)
|
2017-08-09 13:18:14 +02:00
|
|
|
ps := NewState(input)
|
2017-08-10 14:06:08 +02:00
|
|
|
if len(ws) > 0 {
|
|
|
|
ps.WS = ws[0]
|
|
|
|
}
|
2017-08-07 13:20:30 +02:00
|
|
|
|
2017-08-07 10:25:23 +02:00
|
|
|
ret := p(ps)
|
2017-08-07 13:20:30 +02:00
|
|
|
ps.AutoWS()
|
2017-08-06 07:43:23 +02:00
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
if ps.Error.expected != "" {
|
2017-08-13 08:56:12 +02:00
|
|
|
return ret.Result, &ps.Error
|
2017-08-09 13:18:14 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if ps.Get() != "" {
|
2017-08-13 08:56:12 +02:00
|
|
|
return ret.Result, UnparsedInputError{ps.Get()}
|
2017-08-06 07:43:23 +02:00
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
return ret.Result, nil
|
2017-08-06 07:43:23 +02:00
|
|
|
}
|
|
|
|
|
2017-08-10 14:06:08 +02:00
|
|
|
// WS will consume whitespace, it should only be needed when AutoWS is turned off
|
|
|
|
func WS() Parser {
|
|
|
|
return NewParser("AutoWS", func(ps *State) Result {
|
|
|
|
ps.WS(ps)
|
|
|
|
return Result{}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// Cut prevents backtracking beyond this point. Usually used after keywords when you
|
|
|
|
// are sure this is the correct path. Improves performance and error reporting.
|
2017-08-10 14:10:30 +02:00
|
|
|
func Cut() Parser {
|
|
|
|
return func(ps *State) Result {
|
|
|
|
ps.Cut = ps.Pos
|
|
|
|
return Result{}
|
|
|
|
}
|
2017-08-10 14:06:08 +02:00
|
|
|
}
|
|
|
|
|
2017-08-09 13:41:57 +02:00
|
|
|
// Regex returns a match if the regex successfully matches
|
|
|
|
func Regex(pattern string) Parser {
|
|
|
|
re := regexp.MustCompile("^" + pattern)
|
|
|
|
return NewParser(pattern, func(ps *State) Result {
|
|
|
|
ps.AutoWS()
|
|
|
|
if match := re.FindString(ps.Get()); match != "" {
|
|
|
|
ps.Advance(len(match))
|
|
|
|
return Result{Token: match}
|
|
|
|
}
|
|
|
|
ps.ErrorHere(pattern)
|
|
|
|
return Result{}
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Exact will fully match the exact string supplied, or error. The match will be stored in .Token
|
2017-08-06 06:31:35 +02:00
|
|
|
func Exact(match string) Parser {
|
2017-08-07 13:57:12 +02:00
|
|
|
if len(match) == 1 {
|
|
|
|
matchByte := match[0]
|
2017-08-09 13:18:14 +02:00
|
|
|
return NewParser(match, func(ps *State) Result {
|
2017-08-07 13:57:12 +02:00
|
|
|
ps.AutoWS()
|
2017-08-07 14:38:34 +02:00
|
|
|
if ps.Pos >= len(ps.Input) || ps.Input[ps.Pos] != matchByte {
|
2017-08-07 13:57:12 +02:00
|
|
|
ps.ErrorHere(match)
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{}
|
2017-08-07 13:57:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
ps.Advance(1)
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{Token: match}
|
2017-08-07 13:57:12 +02:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
return NewParser(match, func(ps *State) Result {
|
2017-08-07 13:20:30 +02:00
|
|
|
ps.AutoWS()
|
2017-08-06 15:32:10 +02:00
|
|
|
if !strings.HasPrefix(ps.Get(), match) {
|
|
|
|
ps.ErrorHere(match)
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{}
|
2017-08-06 06:31:35 +02:00
|
|
|
}
|
|
|
|
|
2017-08-06 15:32:10 +02:00
|
|
|
ps.Advance(len(match))
|
2017-08-06 07:43:23 +02:00
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{Token: match}
|
2017-08-07 12:07:29 +02:00
|
|
|
})
|
2017-08-06 06:31:35 +02:00
|
|
|
}
|
|
|
|
|
2017-08-06 15:32:10 +02:00
|
|
|
// parseRepetition resolves the optional repetition arguments of a matcher into
// a (min, max) pair. With no arguments both defaults are returned, one argument
// overrides min, and two arguments override min and max. Any other count panics.
func parseRepetition(defaultMin, defaultMax int, repetition ...int) (min int, max int) {
	if n := len(repetition); n > 2 {
		panic(fmt.Errorf("Dont know what %d repetition args mean", n))
	}

	min, max = defaultMin, defaultMax
	if len(repetition) >= 1 {
		min = repetition[0]
	}
	if len(repetition) == 2 {
		max = repetition[1]
	}
	return min, max
}
|
2017-08-06 07:43:23 +02:00
|
|
|
|
2017-08-06 15:32:10 +02:00
|
|
|
// parseMatcher turns a string in the format a-f01234A-F into:
//  - an alphabet of individually matched runes, string(01234)
//  - a set of inclusive ranges, [][]rune{{'a', 'f'}, {'A', 'F'}}
//
// A backslash escapes the rune that follows it, so `\-` contributes a literal
// '-' to the alphabet instead of forming a range. Ranges written backwards
// ("z-a") are normalised so the lower bound comes first.
func parseMatcher(matcher string) (alphabet string, ranges [][]rune) {
	runes := []rune(matcher)
	literals := make([]rune, 0, len(runes))

	for i := 0; i < len(runes); i++ {
		switch {
		case i+2 < len(runes) && runes[i+1] == '-' && runes[i] != '\\':
			start, end := runes[i], runes[i+2]
			if start > end {
				start, end = end, start
			}
			ranges = append(ranges, []rune{start, end})
			// Skip the hyphen and the range end; otherwise both would be
			// re-scanned and leak into the alphabet as literals.
			i += 2
		case i+1 < len(runes) && runes[i] == '\\':
			literals = append(literals, runes[i+1])
			// Skip the escaped rune so it cannot be re-read as a literal or
			// as the start of a range.
			i++
		default:
			literals = append(literals, runes[i])
		}
	}

	return string(literals), ranges
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// Chars is the swiss army knife of character matches. It can match:
|
|
|
|
// - ranges: Chars("a-z") will match one or more lowercase letter
|
|
|
|
// - alphabets: Chars("abcd") will match one or more of the letters abcd in any order
|
|
|
|
// - min and max: Chars("a-z0-9", 4, 6) will match 4-6 lowercase alphanumeric characters
|
|
|
|
// the above can be combined in any order
|
2017-08-06 15:32:10 +02:00
|
|
|
func Chars(matcher string, repetition ...int) Parser {
|
2017-08-07 12:07:29 +02:00
|
|
|
return NewParser("["+matcher+"]", charsImpl(matcher, false, repetition...))
|
2017-08-06 15:32:10 +02:00
|
|
|
}
|
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
// NotChars accepts the full range of input from Chars, but it will stop when any
|
|
|
|
// character matches.
|
2017-08-06 15:32:10 +02:00
|
|
|
func NotChars(matcher string, repetition ...int) Parser {
|
2017-08-07 12:07:29 +02:00
|
|
|
return NewParser("!["+matcher+"]", charsImpl(matcher, true, repetition...))
|
2017-08-06 15:32:10 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func charsImpl(matcher string, stopOn bool, repetition ...int) Parser {
|
|
|
|
min, max := parseRepetition(1, -1, repetition...)
|
2017-08-09 13:18:14 +02:00
|
|
|
alphabet, ranges := parseMatcher(matcher)
|
2017-08-06 15:32:10 +02:00
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
return func(ps *State) Result {
|
2017-08-07 13:20:30 +02:00
|
|
|
ps.AutoWS()
|
2017-08-06 07:43:23 +02:00
|
|
|
matched := 0
|
2017-08-06 15:32:10 +02:00
|
|
|
for ps.Pos+matched < len(ps.Input) {
|
2017-08-06 07:43:23 +02:00
|
|
|
if max != -1 && matched >= max {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
2017-08-06 15:32:10 +02:00
|
|
|
r, w := utf8.DecodeRuneInString(ps.Input[ps.Pos+matched:])
|
2017-08-06 07:43:23 +02:00
|
|
|
|
2017-08-09 13:18:14 +02:00
|
|
|
anyMatched := strings.ContainsRune(alphabet, r)
|
2017-08-06 15:32:10 +02:00
|
|
|
if !anyMatched {
|
|
|
|
for _, rng := range ranges {
|
|
|
|
if r >= rng[0] && r <= rng[1] {
|
|
|
|
anyMatched = true
|
|
|
|
}
|
2017-08-06 07:43:23 +02:00
|
|
|
}
|
|
|
|
}
|
2017-08-06 15:32:10 +02:00
|
|
|
|
|
|
|
if anyMatched == stopOn {
|
2017-08-06 07:43:23 +02:00
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
matched += w
|
|
|
|
}
|
|
|
|
|
|
|
|
if matched < min {
|
2017-08-06 15:32:10 +02:00
|
|
|
ps.ErrorHere(matcher)
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{}
|
2017-08-06 07:43:23 +02:00
|
|
|
}
|
|
|
|
|
2017-08-06 15:32:10 +02:00
|
|
|
result := ps.Input[ps.Pos : ps.Pos+matched]
|
|
|
|
ps.Advance(matched)
|
2017-08-09 13:18:14 +02:00
|
|
|
return Result{Token: result}
|
2017-08-06 07:43:23 +02:00
|
|
|
}
|
2017-08-06 06:31:35 +02:00
|
|
|
}
|