goparsify/parser.go

304 lines
6.0 KiB
Go
Raw Normal View History

2017-08-07 12:07:29 +02:00
package goparsify
2017-08-06 06:31:35 +02:00
import (
2017-08-06 09:28:34 +02:00
"bytes"
2017-08-06 06:31:35 +02:00
"fmt"
2017-08-06 07:43:23 +02:00
"strings"
"unicode/utf8"
2017-08-06 06:31:35 +02:00
)
2017-08-07 10:25:23 +02:00
// Node is the value a Parser hands back after running against a State.
type Node struct {
	// Token is the text captured for this node; the leaf parsers in this
	// file (Exact, Chars, NotChars, String) fill it with what they matched.
	Token string
	// Children holds nested nodes. Nothing in this file populates it;
	// presumably combinator parsers defined elsewhere do.
	Children []Node
	// Result is an arbitrary value attached to the node. ParseString
	// returns the Result of the top-level node to its caller.
	Result interface{}
}
2017-08-07 13:45:12 +02:00
// Parser is a function that is run against a *State and returns the Node it
// produced. Parsers in this file signal failure through the State (via
// ErrorHere or the State's Error field) and return an empty Node in that case.
type Parser func(*State) Node
2017-08-06 06:31:35 +02:00
// Parserish is any value that Parsify can turn into a Parser.
// Parsify currently accepts nil, Parser, *Parser, func(*State) Node and
// string literals; any other type makes Parsify panic.
//
// Accepting *Parser and plain strings keeps recursive grammars clean and lets
// string literals be used directly in most contexts.
// For example, matching balanced parens:
// ```go
// var group Parser
// group = And("(", Maybe(&group), ")")
// ```
// versus the more verbose form that would be needed without Parserish
// (illustrative only):
// ```go
// var group ParserPtr
// group.P = And(Exact("("), Maybe(group.Parse), Exact(")"))
// ```
type Parserish interface{}
func Parsify(p Parserish) Parser {
switch p := p.(type) {
2017-08-07 13:38:46 +02:00
case nil:
return nil
2017-08-07 13:45:12 +02:00
case func(*State) Node:
2017-08-07 13:20:30 +02:00
return NewParser("anonymous func", p)
2017-08-06 06:31:35 +02:00
case Parser:
return p
case *Parser:
// Todo: Maybe capture this stack and on nil show it? Is there a good error library to do this?
2017-08-07 13:45:12 +02:00
return func(ptr *State) Node {
2017-08-06 06:31:35 +02:00
return (*p)(ptr)
}
case string:
return Exact(p)
default:
panic(fmt.Errorf("cant turn a `%T` into a parser", p))
}
}
// ParsifyAll runs Parsify over every argument and returns the resulting
// Parsers in the same order. It panics if Parsify panics on any element.
func ParsifyAll(parsers ...Parserish) []Parser {
	converted := make([]Parser, 0, len(parsers))
	for _, candidate := range parsers {
		converted = append(converted, Parsify(candidate))
	}
	return converted
}
2017-08-07 13:20:30 +02:00
func WS() Parser {
2017-08-07 13:45:12 +02:00
return NewParser("AutoWS", func(ps *State) Node {
2017-08-07 13:20:30 +02:00
ps.WS()
2017-08-07 13:45:12 +02:00
return Node{}
2017-08-07 13:20:30 +02:00
})
}
2017-08-06 15:32:10 +02:00
func ParseString(parser Parserish, input string) (result interface{}, remaining string, err error) {
2017-08-06 07:43:23 +02:00
p := Parsify(parser)
2017-08-07 13:20:30 +02:00
ps := InputString(input)
2017-08-07 10:25:23 +02:00
ret := p(ps)
2017-08-07 13:20:30 +02:00
ps.AutoWS()
2017-08-06 07:43:23 +02:00
2017-08-06 15:32:10 +02:00
if ps.Error.Expected != "" {
return nil, ps.Get(), ps.Error
2017-08-06 07:43:23 +02:00
}
2017-08-07 10:25:23 +02:00
return ret.Result, ps.Get(), nil
2017-08-06 07:43:23 +02:00
}
2017-08-06 06:31:35 +02:00
func Exact(match string) Parser {
2017-08-07 13:57:12 +02:00
if len(match) == 1 {
matchByte := match[0]
return NewParser(match, func(ps *State) Node {
ps.AutoWS()
2017-08-07 14:38:34 +02:00
if ps.Pos >= len(ps.Input) || ps.Input[ps.Pos] != matchByte {
2017-08-07 13:57:12 +02:00
ps.ErrorHere(match)
return Node{}
}
ps.Advance(1)
return Node{Token: match}
})
}
2017-08-07 13:45:12 +02:00
return NewParser(match, func(ps *State) Node {
2017-08-07 13:20:30 +02:00
ps.AutoWS()
2017-08-06 15:32:10 +02:00
if !strings.HasPrefix(ps.Get(), match) {
ps.ErrorHere(match)
2017-08-07 13:45:12 +02:00
return Node{}
2017-08-06 06:31:35 +02:00
}
2017-08-06 15:32:10 +02:00
ps.Advance(len(match))
2017-08-06 07:43:23 +02:00
2017-08-07 13:45:12 +02:00
return Node{Token: match}
2017-08-07 12:07:29 +02:00
})
2017-08-06 06:31:35 +02:00
}
2017-08-06 15:32:10 +02:00
// parseRepetition interprets the optional variadic repetition arguments.
//
//   - no arguments: (defaultMin, defaultMax)
//   - one argument: (repetition[0], defaultMax)
//   - two arguments: (repetition[0], repetition[1])
//
// More than two arguments is a programming error and panics.
func parseRepetition(defaultMin, defaultMax int, repetition ...int) (min int, max int) {
	count := len(repetition)
	if count > 2 {
		panic(fmt.Errorf("Dont know what %d repetion args mean", count))
	}

	min, max = defaultMin, defaultMax
	if count >= 1 {
		min = repetition[0]
	}
	if count == 2 {
		max = repetition[1]
	}
	return min, max
}
2017-08-06 07:43:23 +02:00
2017-08-06 15:32:10 +02:00
// parseMatcher turns a character-class string in the format a-f01234A-F into:
//   - a string of individually matched runes: "01234"
//   - a set of inclusive ranges: [][]rune{{'a', 'f'}, {'A', 'F'}}
//
// Rules, applied left to right over the runes of matcher:
//   - A backslash followed by another rune adds that rune literally, so `\-`
//     matches a hyphen and `\\` matches a backslash. A trailing lone
//     backslash is taken literally.
//   - x-y adds the inclusive range between x and y; reversed bounds are swapped.
//   - Anything else is a literal rune. A hyphen that is first, last, or directly
//     after a completed range is therefore literal.
func parseMatcher(matcher string) (matches string, ranges [][]rune) {
	runes := []rune(matcher)
	var literal []rune

	for i := 0; i < len(runes); i++ {
		switch {
		case runes[i] == '\\' && i+1 < len(runes):
			// Escapes are checked before ranges so `\-z` is not mistaken for
			// a range that starts at the backslash.
			literal = append(literal, runes[i+1])
			i++ // skip the escaped rune as well
		case i+2 < len(runes) && runes[i+1] == '-':
			lo, hi := runes[i], runes[i+2]
			if lo > hi {
				lo, hi = hi, lo
			}
			ranges = append(ranges, []rune{lo, hi})
			// Consume the hyphen and the upper bound too; otherwise both would
			// be re-read as literal matches on the following iterations.
			i += 2
		default:
			literal = append(literal, runes[i])
		}
	}

	return string(literal), ranges
}
func Chars(matcher string, repetition ...int) Parser {
2017-08-07 12:07:29 +02:00
return NewParser("["+matcher+"]", charsImpl(matcher, false, repetition...))
2017-08-06 15:32:10 +02:00
}
func NotChars(matcher string, repetition ...int) Parser {
2017-08-07 12:07:29 +02:00
return NewParser("!["+matcher+"]", charsImpl(matcher, true, repetition...))
2017-08-06 15:32:10 +02:00
}
func charsImpl(matcher string, stopOn bool, repetition ...int) Parser {
min, max := parseRepetition(1, -1, repetition...)
matches, ranges := parseMatcher(matcher)
2017-08-07 13:45:12 +02:00
return func(ps *State) Node {
2017-08-07 13:20:30 +02:00
ps.AutoWS()
2017-08-06 07:43:23 +02:00
matched := 0
2017-08-06 15:32:10 +02:00
for ps.Pos+matched < len(ps.Input) {
2017-08-06 07:43:23 +02:00
if max != -1 && matched >= max {
break
}
2017-08-06 15:32:10 +02:00
r, w := utf8.DecodeRuneInString(ps.Input[ps.Pos+matched:])
2017-08-06 07:43:23 +02:00
2017-08-06 15:32:10 +02:00
anyMatched := strings.ContainsRune(matches, r)
if !anyMatched {
for _, rng := range ranges {
if r >= rng[0] && r <= rng[1] {
anyMatched = true
}
2017-08-06 07:43:23 +02:00
}
}
2017-08-06 15:32:10 +02:00
if anyMatched == stopOn {
2017-08-06 07:43:23 +02:00
break
}
matched += w
}
if matched < min {
2017-08-06 15:32:10 +02:00
ps.ErrorHere(matcher)
2017-08-07 13:45:12 +02:00
return Node{}
2017-08-06 07:43:23 +02:00
}
2017-08-06 15:32:10 +02:00
result := ps.Input[ps.Pos : ps.Pos+matched]
ps.Advance(matched)
2017-08-07 13:45:12 +02:00
return Node{Token: result}
2017-08-06 07:43:23 +02:00
}
2017-08-06 06:31:35 +02:00
}
2017-08-08 11:24:49 +02:00
func String(allowedQuotes string) Parser {
2017-08-07 13:45:12 +02:00
return NewParser("string", func(ps *State) Node {
2017-08-07 13:20:30 +02:00
ps.AutoWS()
2017-08-08 11:24:49 +02:00
for i := 0; i < len(allowedQuotes); i++ {
if ps.Input[ps.Pos] == allowedQuotes[i] {
}
}
if !stringContainsByte(allowedQuotes, ps.Input[ps.Pos]) {
ps.ErrorHere(allowedQuotes)
2017-08-07 13:45:12 +02:00
return Node{}
2017-08-06 09:28:34 +02:00
}
2017-08-08 11:24:49 +02:00
quote := ps.Input[ps.Pos]
var end int = ps.Pos + 1
2017-08-06 09:28:34 +02:00
2017-08-08 11:24:49 +02:00
inputLen := len(ps.Input)
var buf *bytes.Buffer
2017-08-06 09:28:34 +02:00
2017-08-08 11:24:49 +02:00
for end < inputLen {
switch ps.Input[end] {
case '\\':
if end+1 >= inputLen {
ps.ErrorHere(string(quote))
return Node{}
}
2017-08-06 09:28:34 +02:00
if buf == nil {
buf = bytes.NewBufferString(ps.Input[ps.Pos+1 : end])
}
2017-08-08 11:24:49 +02:00
c := ps.Input[end+1]
if c == 'u' {
if end+6 >= inputLen {
ps.Error.Expected = "[a-f0-9]{4}"
ps.Error.pos = end + 2
return Node{}
}
2017-08-06 09:28:34 +02:00
2017-08-08 11:24:49 +02:00
r, ok := unhex(ps.Input[end+2 : end+6])
if !ok {
ps.Error.Expected = "[a-f0-9]"
ps.Error.pos = end + 2
return Node{}
}
buf.WriteRune(r)
2017-08-08 11:24:49 +02:00
end += 6
} else {
buf.WriteByte(c)
2017-08-08 11:24:49 +02:00
end += 2
}
case quote:
if buf == nil {
result := ps.Input[ps.Pos+1 : end]
ps.Pos = end + 1
return Node{Token: result}
}
2017-08-08 11:24:49 +02:00
ps.Pos = end + 1
return Node{Token: buf.String()}
2017-08-08 11:24:49 +02:00
default:
r, w := utf8.DecodeRuneInString(ps.Input[end:])
end += w
if buf != nil {
buf.WriteRune(r)
}
2017-08-06 09:28:34 +02:00
}
}
2017-08-08 11:24:49 +02:00
ps.ErrorHere(string(quote))
2017-08-07 13:45:12 +02:00
return Node{}
2017-08-07 12:07:29 +02:00
})
2017-08-06 09:28:34 +02:00
}
2017-08-08 11:24:49 +02:00
// stringContainsByte reports whether the byte b occurs anywhere in s.
func stringContainsByte(s string, b byte) bool {
	return strings.IndexByte(s, b) >= 0
}
// unhex interprets b as a sequence of hexadecimal digits (upper or lower
// case) and returns the value they encode, most significant digit first.
// ok is false, and v is 0, as soon as a non-hex character is encountered.
// An empty string yields (0, true).
func unhex(b string) (v rune, ok bool) {
	var acc rune
	for _, digit := range b {
		var nibble rune
		switch {
		case digit >= '0' && digit <= '9':
			nibble = digit - '0'
		case digit >= 'a' && digit <= 'f':
			nibble = digit - 'a' + 10
		case digit >= 'A' && digit <= 'F':
			nibble = digit - 'A' + 10
		default:
			return 0, false
		}
		// Make room for the next four bits, then merge them in.
		acc = acc<<4 | nibble
	}
	return acc, true
}