From 68cde88125e1f016c5706ca8d0b3db6ba06624a2 Mon Sep 17 00:00:00 2001
From: Adam Scarr
Date: Sun, 6 Aug 2017 14:31:35 +1000
Subject: [PATCH] Initial commit

---
 .editorconfig      |   9 ++
 .gitignore         |   2 +
 combinator.go      | 132 ++++++++++++++++++++++++++++
 combinator_test.go | 215 +++++++++++++++++++++++++++++++++++++++++++++
 examples/html.go   |  37 ++++++++
 nodes.go           |  54 ++++++++++++
 parser.go          | 132 ++++++++++++++++++++++++++++
 parser_test.go     | 107 ++++++++++++++++++++++
 pointer.go         | 103 ++++++++++++++++++++++
 pointer_test.go    |  86 ++++++++++++++++++
 10 files changed, 877 insertions(+)
 create mode 100644 .editorconfig
 create mode 100644 .gitignore
 create mode 100644 combinator.go
 create mode 100644 combinator_test.go
 create mode 100644 examples/html.go
 create mode 100644 nodes.go
 create mode 100644 parser.go
 create mode 100644 parser_test.go
 create mode 100644 pointer.go
 create mode 100644 pointer_test.go

diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..eef10cc
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,9 @@
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+trim_trailing_whitespace = true
+
+[*.go]
+indent_style = tab
+indent_size = 4
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..29b636a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea
+*.iml
\ No newline at end of file
diff --git a/combinator.go b/combinator.go
new file mode 100644
index 0000000..2b6b8a3
--- /dev/null
+++ b/combinator.go
@@ -0,0 +1,132 @@
+package parsec
+
+// Nil is a parser that always succeeds: it returns a nil node and consumes nothing.
+func Nil(p Pointer) (Node, Pointer) {
+	return nil, p
+}
+
+// Never is a parser that always fails with an Error at the current position.
+func Never(p Pointer) (Node, Pointer) {
+	return Error{p.pos, "Never matches"}, p
+}
+
+// And runs the parsers in order and collects their non-nil results into a Sequence.
+// The first Error is returned as-is together with the original, unadvanced pointer.
+func And(parsers ...Parserish) Parser {
+	if len(parsers) == 0 {
+		return Nil
+	}
+
+	ps := ParsifyAll(parsers...)
+
+	return func(p Pointer) (Node, Pointer) {
+		var nodes = make([]Node, 0, len(ps))
+		var node Node
+		newP := p
+		for _, parser := range ps {
+			node, newP = parser(newP)
+			if node == nil {
+				continue
+			}
+			if IsError(node) {
+				return node, p
+			}
+			nodes = append(nodes, node)
+		}
+		return NewSequence(p.pos, nodes...), newP
+	}
+}
+
+// Any returns the result of the first parser that does not produce an Error.
+// If every parser fails, the Error with the greatest position is returned.
+func Any(parsers ...Parserish) Parser {
+	if len(parsers) == 0 {
+		return Nil
+	}
+
+	ps := ParsifyAll(parsers...)
+
+	return func(p Pointer) (Node, Pointer) {
+		errors := []Error{}
+		for _, parser := range ps {
+			node, newP := parser(p)
+			if err, isErr := node.(Error); isErr {
+				errors = append(errors, err)
+				continue
+			}
+			return node, newP
+		}
+
+		longestError := errors[0]
+		for _, e := range errors[1:] {
+			if e.pos > longestError.pos {
+				longestError = e
+			}
+		}
+
+		return longestError, p
+	}
+}
+
+// Kleene matches opScan zero or more times, separated by sepScan[0] when given.
+func Kleene(opScan Parserish, sepScan ...Parserish) Parser {
+	return manyImpl(0, opScan, Never, sepScan...)
+}
+
+// KleeneUntil is Kleene that additionally stops as soon as untilScan matches.
+func KleeneUntil(opScan Parserish, untilScan Parserish, sepScan ...Parserish) Parser {
+	return manyImpl(0, opScan, untilScan, sepScan...)
+}
+
+// Many matches opScan one or more times, separated by sepScan[0] when given.
+func Many(opScan Parserish, sepScan ...Parserish) Parser {
+	return manyImpl(1, opScan, Never, sepScan...)
+}
+
+// ManyUntil is Many that additionally stops as soon as untilScan matches.
+func ManyUntil(opScan Parserish, untilScan Parserish, sepScan ...Parserish) Parser {
+	return manyImpl(1, opScan, untilScan, sepScan...)
+}
+
+// manyImpl repeats op, with sep[0] between matches when given, until until matches or
+// op or sep fails. Stopping on until or op with fewer than min nodes yields an Error.
+// An iteration that consumes no input ends the loop so it cannot spin forever.
+func manyImpl(min int, op Parserish, until Parserish, sep ...Parserish) Parser {
+	opParser := Parsify(op)
+	untilParser := Parsify(until)
+	sepParser := Nil
+	if len(sep) > 0 {
+		sepParser = Parsify(sep[0])
+	}
+
+	return func(p Pointer) (Node, Pointer) {
+		var node Node
+		nodes := make([]Node, 0)
+		newP := p
+		for {
+			iterStart := newP.pos
+			if node, _ := untilParser(newP); !IsError(node) {
+				if len(nodes) < min {
+					return NewError(newP.pos, "Unexpected input"), p
+				}
+				break
+			}
+
+			if node, newP = opParser(newP); IsError(node) {
+				if len(nodes) < min {
+					return node, p
+				}
+				break
+			}
+			nodes = append(nodes, node)
+			if node, newP = sepParser(newP); IsError(node) {
+				break
+			}
+			if newP.pos == iterStart {
+				// Neither op nor sep consumed input, so another pass could never end.
+				break
+			}
+		}
+		return NewSequence(p.pos, nodes...), newP
+	}
+}
diff --git a/combinator_test.go b/combinator_test.go
new file mode 100644
index 0000000..8c0bab0
--- /dev/null
+++ b/combinator_test.go
@@ -0,0 +1,215 @@
+package parsec
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestNil(t *testing.T) {
+	p := Pointer{"hello world", 0}
+
+	node, p2 := Nil(p)
+	require.Equal(t, nil, node)
+	require.Equal(t, p, p2)
+}
+
+func TestAnd(t *testing.T) {
+	p := Pointer{"hello world", 0}
+
+	t.Run("matches sequence", func(t *testing.T) {
+		node, p2 := And("hello", WS, "world")(p)
+		require.Equal(t, NewSequence(0, NewToken(0, "hello"), NewToken(6, "world")), node)
+		require.Equal(t, 0, p2.Remaining())
+	})
+
+	t.Run("returns errors", func(t *testing.T) {
+		e, p3 := And("hello", WS, "there")(p)
+		require.Equal(t, NewError(6, "Expected there"), e)
+		require.Equal(t, 0, p3.pos)
+	})
+
+	t.Run("No parsers", func(t *testing.T) {
+		assertNilParser(t, And())
+	})
+}
+
+func TestAny(t *testing.T) {
+	p := Pointer{"hello world!", 0}
+
+	t.Run("Matches any", func(t *testing.T) {
+		node, p2 := Any("hello", "world")(p)
+		require.Equal(t, NewToken(0, "hello"), node)
+		require.Equal(t, 5, p2.pos)
+	})
+
+	t.Run("Returns longest error", func(t *testing.T) {
+		err, p2 := Any(
+			Exact("nope"),
+			And(Exact("hello"), WS, Exact("world"), Exact(".")),
+			And(Exact("hello"), WS, Exact("brother")),
+		)(p)
+		require.Equal(t, NewError(11, "Expected ."), err)
+		require.Equal(t, 0, p2.pos)
+	})
+
+	t.Run("Accepts nil matches", func(t *testing.T) {
+		node, p2 := Any(Exact("ffffff"), WS)(p)
+		require.Equal(t, nil, node)
+		require.Equal(t, 0, p2.pos)
+	})
+
+	t.Run("No parsers", func(t *testing.T) {
+		assertNilParser(t, Any())
+	})
+}
+
+func TestKleene(t *testing.T) {
+	p := Pointer{"a,b,c,d,e,", 0}
+
+	t.Run("Matches sequence with sep", func(t *testing.T) {
+		node, p2 := Kleene(CharRun("abcdefg"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+			NewToken(6, "d"),
+			NewToken(8, "e"),
+		), node)
+		require.Equal(t, 10, p2.pos)
+	})
+
+	t.Run("Matches sequence without sep", func(t *testing.T) {
+		node, p2 := Kleene(Any(CharRun("abcdefg"), Exact(",")))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(1, ","),
+			NewToken(2, "b"),
+			NewToken(3, ","),
+			NewToken(4, "c"),
+			NewToken(5, ","),
+			NewToken(6, "d"),
+			NewToken(7, ","),
+			NewToken(8, "e"),
+			NewToken(9, ","),
+		), node)
+		require.Equal(t, 10, p2.pos)
+	})
+
+	t.Run("Stops on error", func(t *testing.T) {
+		node, p2 := Kleene(CharRun("abc"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+		), node)
+		require.Equal(t, 6, p2.pos)
+		require.Equal(t, "d,e,", p2.Get())
+	})
+}
+
+func TestMany(t *testing.T) {
+	p := Pointer{"a,b,c,d,e,", 0}
+
+	t.Run("Matches sequence with sep", func(t *testing.T) {
+		node, p2 := Many(CharRun("abcdefg"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+			NewToken(6, "d"),
+			NewToken(8, "e"),
+		), node)
+		require.Equal(t, 10, p2.pos)
+	})
+
+	t.Run("Matches sequence without sep", func(t *testing.T) {
+		node, p2 := Many(Any(CharRun("abcdefg"), Exact(",")))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(1, ","),
+			NewToken(2, "b"),
+			NewToken(3, ","),
+			NewToken(4, "c"),
+			NewToken(5, ","),
+			NewToken(6, "d"),
+			NewToken(7, ","),
+			NewToken(8, "e"),
+			NewToken(9, ","),
+		), node)
+		require.Equal(t, 10, p2.pos)
+	})
+
+	t.Run("Stops on error", func(t *testing.T) {
+		node, p2 := Many(CharRun("abc"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+		), node)
+		require.Equal(t, 6, p2.pos)
+		require.Equal(t, "d,e,", p2.Get())
+	})
+
+	t.Run("Returns error if nothing matches", func(t *testing.T) {
+		node, p2 := Many(CharRun("def"), Exact(","))(p)
+		require.Equal(t, NewError(0, "Expected some of def"), node)
+		require.Equal(t, 0, p2.pos)
+		require.Equal(t, "a,b,c,d,e,", p2.Get())
+	})
+}
+
+func TestKleeneUntil(t *testing.T) {
+	p := Pointer{"a,b,c,d,e,fg", 0}
+
+	t.Run("Matches sequence with sep", func(t *testing.T) {
+		node, p2 := KleeneUntil(CharRun("abcde"), CharRun("d"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+		), node)
+		require.Equal(t, 6, p2.pos)
+	})
+
+	t.Run("Breaks if separator does not match", func(t *testing.T) {
+		node, p2 := KleeneUntil(Char("abcdefg"), Char("y"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+			NewToken(6, "d"),
+			NewToken(8, "e"),
+			NewToken(10, "f"),
+		), node)
+		require.Equal(t, 11, p2.pos)
+	})
+}
+
+func TestManyUntil(t *testing.T) {
+	p := Pointer{"a,b,c,d,e,", 0}
+
+	t.Run("Matches sequence until", func(t *testing.T) {
+		node, p2 := ManyUntil(CharRun("abcdefg"), Char("d"), Exact(","))(p)
+		require.Equal(t, NewSequence(0,
+			NewToken(0, "a"),
+			NewToken(2, "b"),
+			NewToken(4, "c"),
+		), node)
+		require.Equal(t, 6, p2.pos)
+	})
+
+	t.Run("Returns error until matches early", func(t *testing.T) {
+		node, p2 := ManyUntil(CharRun("abc"), Exact("a"), Exact(","))(p)
+		require.Equal(t, NewError(0, "Unexpected input"), node)
+		require.Equal(t, 0, p2.pos)
+		require.Equal(t, "a,b,c,d,e,", p2.Get())
+	})
+}
+
+func assertNilParser(t *testing.T, parser Parser) {
+	p := Pointer{"fff", 0}
+	node, p2 := parser(p)
+	require.Equal(t, nil, node)
+	require.Equal(t, p, p2)
+}
diff --git a/examples/html.go b/examples/html.go
new file mode 100644
index 0000000..88a3913
--- /dev/null
+++ b/examples/html.go
@@ -0,0 +1,37 @@
+package main
+
+import (
+	"fmt"
+
+	. "github.com/vektah/goparsify"
+)
+
+// html parses one text run or one nested element from p using a tiny tag grammar.
+func html(p Pointer) (Node, Pointer) {
+	opentag := Exact("<")
+	closetag := Exact(">")
+	equal := Exact("=")
+	slash := Exact("/")
+	identifier := And(Char(Range("a-z")), CharRun(Range("a-zA-Z0-9")))
+	text := CharRunUntil("<>")
+
+	var tag Parser
+
+	element := Any(text, &tag)
+	elements := Kleene(element)
+	//attr := And(identifier, equal, String())
+	attr := And(identifier, equal, Exact(`"test"`))
+	attrws := And(attr, WS)
+	attrs := Kleene(attrws)
+	tstart := And(opentag, identifier, attrs, closetag)
+	tend := And(opentag, slash, identifier, closetag)
+	tag = And(tstart, elements, tend)
+
+	return element(p)
+}
+
+func main() {
+	// NOTE(review): the sample markup was lost in transit; this input is a reconstruction - confirm.
+	node, _ := html(Input("<h1>hello world</h1>"))
+	fmt.Printf("%#v\n", node)
+}
diff --git a/nodes.go b/nodes.go
new file mode 100644
index 0000000..c51db6f
--- /dev/null
+++ b/nodes.go
@@ -0,0 +1,54 @@
+package parsec
+
+// Node is any parse result; Pos reports the position stored in it.
+type Node interface {
+	Pos() int
+}
+
+// Token is a Node holding the text that a parser matched.
+type Token struct {
+	pos   int
+	Value string
+}
+
+// Pos returns the position the token was created with.
+func (e Token) Pos() int { return e.pos }
+
+// NewToken builds a Token for value at position pos.
+func NewToken(pos int, value string) Token {
+	return Token{pos, value}
+}
+
+// Error is the Node returned by a parser that failed to match.
+type Error struct {
+	pos   int
+	Error string
+}
+
+// Pos returns the position the error was created with.
+func (e Error) Pos() int { return e.pos }
+
+// NewError builds an Error with the given message at position pos.
+func NewError(pos int, message string) Error {
+	return Error{pos, message}
+}
+
+// IsError reports whether n is an Error node.
+func IsError(n Node) bool {
+	_, isErr := n.(Error)
+	return isErr
+}
+
+// Sequence is a Node grouping the results of several parsers.
+type Sequence struct {
+	pos   int
+	Nodes []Node
+}
+
+// Pos returns the position the sequence was created with.
+func (e Sequence) Pos() int { return e.pos }
+
+// NewSequence builds a Sequence of the nodes n at position pos.
+func NewSequence(pos int, n ...Node) Sequence {
+	return Sequence{pos, n}
+}
diff --git a/parser.go b/parser.go
new file mode 100644
index 0000000..8066526
--- /dev/null
+++ b/parser.go
@@ -0,0 +1,132 @@
+package parsec
+
+import (
+	"fmt"
+)
+
+// Parser consumes input at a Pointer and returns the resulting Node and the new Pointer.
+type Parser func(Pointer) (Node, Pointer)
+
+// Parserish types are any type that can be turned into a Parser by Parsify.
+// These currently include *Parser and string literals.
+//
+// This makes recursive grammars cleaner and allows string literals to be used directly in most contexts.
+// eg, matching balanced paren:
+// ```go
+// var group Parser
+// group = And("(", Maybe(&group), ")")
+// ```
+// vs
+// ```go
+// var group ParserPtr
+// group.P = And(Exact("("), Maybe(group.Parse), Exact(")"))
+// ```
+type Parserish interface{}
+
+// Parsify converts a Parserish value into a Parser: parser functions are used as-is,
+// a *Parser is dereferenced lazily on each call, and a string becomes Exact(string).
+// It panics for any other type.
+func Parsify(p Parserish) Parser {
+	switch p := p.(type) {
+	case func(Pointer) (Node, Pointer):
+		return Parser(p)
+	case Parser:
+		return p
+	case *Parser:
+		// Todo: Maybe capture this stack and on nil show it? Is there a good error library to do this?
+		return func(ptr Pointer) (Node, Pointer) {
+			return (*p)(ptr)
+		}
+	case string:
+		return Exact(p)
+	default:
+		panic(fmt.Errorf("cant turn a `%T` into a parser", p))
+	}
+}
+
+// ParsifyAll applies Parsify to every argument, preserving order.
+func ParsifyAll(parsers ...Parserish) []Parser {
+	ret := make([]Parser, len(parsers))
+	for i, parser := range parsers {
+		ret[i] = Parsify(parser)
+	}
+	return ret
+}
+
+// Exact matches the literal string match at the current position.
+func Exact(match string) Parser {
+	return func(p Pointer) (Node, Pointer) {
+		if !p.HasPrefix(match) {
+			return NewError(p.pos, "Expected "+match), p
+		}
+
+		return NewToken(p.pos, match), p.Advance(len(match))
+	}
+}
+
+// Char matches a single character that appears in match.
+func Char(match string) Parser {
+	return func(p Pointer) (Node, Pointer) {
+		r, p2 := p.Accept(match)
+		if r == "" {
+			return NewError(p.pos, "Expected one of "+match), p
+		}
+
+		return NewToken(p.pos, r), p2
+	}
+}
+
+// CharRun matches one or more consecutive characters that appear in match.
+func CharRun(match string) Parser {
+	return func(p Pointer) (Node, Pointer) {
+		s, p2 := p.AcceptRun(match)
+		if s == "" {
+			return NewError(p.pos, "Expected some of "+match), p
+		}
+
+		return NewToken(p.pos, s), p2
+	}
+}
+
+// CharRunUntil matches one or more consecutive characters that do not appear in match.
+func CharRunUntil(match string) Parser {
+	return func(p Pointer) (Node, Pointer) {
+		s, p2 := p.AcceptUntil(match)
+		if s == "" {
+			return NewError(p.pos, "Expected some of "+match), p
+		}
+
+		return NewToken(p.pos, s), p2
+	}
+}
+
+// Range expands a spec such as "a-z0-9" into a string of every rune in those ranges.
+// The spec is read three runes at a time (start, separator, end); reversed bounds are swapped.
+func Range(r string) string {
+	runes := []rune(r)
+	if len(runes)%3 != 0 {
+		panic("ranges should be in the form a-z0-9")
+	}
+
+	var match []rune
+	for i := 0; i < len(runes); i += 3 {
+		start, end := runes[i], runes[i+2]
+		if start > end {
+			start, end = end, start
+		}
+		for c := start; c <= end; c++ {
+			match = append(match, c)
+		}
+	}
+
+	return string(match)
+}
+
+// WS skips any run of whitespace and always succeeds with a nil node.
+// U+0085 and U+00A0 are written as \u escapes: the \x85 and \xA0 byte escapes are not
+// valid UTF-8 on their own and would never match the decoded runes.
+func WS(p Pointer) (Node, Pointer) {
+	_, p2 := p.AcceptRun("\t\n\v\f\r \u0085\u00A0")
+
+	return nil, p2
+}
diff --git a/parser_test.go b/parser_test.go
new file mode 100644
index 0000000..7c22a76
--- /dev/null
+++ b/parser_test.go
@@ -0,0 +1,107 @@
+package parsec
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestParsify(t *testing.T) {
+	p := Pointer{"ffooo", 0}
+
+	t.Run("strings", func(t *testing.T) {
+		node, _ := Parsify("ff")(p)
+		require.Equal(t, NewToken(0, "ff"), node)
+	})
+
+	t.Run("parsers", func(t *testing.T) {
+		node, _ := Parsify(CharRun("f"))(p)
+		require.Equal(t, NewToken(0, "ff"), node)
+	})
+
+	t.Run("*parsers", func(t *testing.T) {
+		var parser Parser
+		parserfied := Parsify(&parser)
+		parser = CharRun("f")
+
+		node, _ := parserfied(p)
+		require.Equal(t, NewToken(0, "ff"), node)
+	})
+}
+
+func TestExact(t *testing.T) {
+	p := Pointer{"fooo", 0}
+
+	t.Run("success", func(t *testing.T) {
+		node, p2 := Exact("fo")(p)
+		require.Equal(t, NewToken(0, "fo"), node)
+		require.Equal(t, p.Advance(2), p2)
+	})
+
+	t.Run("error", func(t *testing.T) {
+		node, p2 := Exact("bar")(p)
+		require.Equal(t, NewError(0, "Expected bar"), node)
+		require.Equal(t, 0, p2.pos)
+	})
+}
+
+func TestChar(t *testing.T) {
+	p := Pointer{"foobar", 0}
+
+	t.Run("success", func(t *testing.T) {
+		node, p2 := Char("fo")(p)
+		require.Equal(t, NewToken(0, "f"), node)
+		require.Equal(t, p.Advance(1), p2)
+	})
+
+	t.Run("error", func(t *testing.T) {
+		node, p2 := Char("bar")(p)
+		require.Equal(t, NewError(0, "Expected one of bar"), node)
+		require.Equal(t, 0, p2.pos)
+	})
+}
+
+func TestCharRun(t *testing.T) {
+	p := Pointer{"foobar", 0}
+
+	t.Run("success", func(t *testing.T) {
+		node, p2 := CharRun("fo")(p)
+		require.Equal(t, NewToken(0, "foo"), node)
+		require.Equal(t, p.Advance(3), p2)
+	})
+
+	t.Run("error", func(t *testing.T) {
+		node, p2 := CharRun("bar")(p)
+		require.Equal(t, NewError(0, "Expected some of bar"), node)
+		require.Equal(t, 0, p2.pos)
+	})
+}
+
+func TestCharUntil(t *testing.T) {
+	p := Pointer{"foobar", 0}
+
+	t.Run("success", func(t *testing.T) {
+		node, p2 := CharRunUntil("z")(p)
+		require.Equal(t, NewToken(0, "foobar"), node)
+		require.Equal(t, p.Advance(6), p2)
+	})
+
+	t.Run("error", func(t *testing.T) {
+		node, p2 := CharRunUntil("f")(p)
+		require.Equal(t, NewError(0, "Expected some of f"), node)
+		require.Equal(t, 0, p2.pos)
+	})
+}
+
+func TestWS(t *testing.T) {
+	p := Pointer{"  fooo", 0}
+
+	node, p2 := WS(p)
+	require.Equal(t, nil, node)
+	require.Equal(t, p.Advance(2), p2)
+}
+
+func TestRange(t *testing.T) {
+	require.Equal(t, "abcdefg", Range("a-g"))
+	require.Equal(t, "01234abcd", Range("0-4a-d"))
+}
diff --git a/pointer.go b/pointer.go
new file mode 100644
index 0000000..92b2bcb
--- /dev/null
+++ b/pointer.go
@@ -0,0 +1,103 @@
+package parsec
+
+import (
+	"strings"
+	"unicode/utf8"
+)
+
+const (
+	// EOF is the rune returned by Pointer.Next when no input remains.
+	EOF rune = -1
+)
+
+// Input returns a Pointer positioned at the start of s.
+func Input(s string) Pointer {
+	return Pointer{s, 0}
+}
+
+// Pointer is a cursor into the input: the full string plus a byte offset.
+type Pointer struct {
+	input string
+	pos   int
+}
+
+// Advance returns a copy of p moved forward by i bytes; it does not bounds-check.
+func (p Pointer) Advance(i int) Pointer {
+	return Pointer{p.input, p.pos + i}
+}
+
+// rest returns the unconsumed input, or "" once pos is at or past the end.
+// Advance can move pos beyond len(input), so slicing directly could panic.
+func (p Pointer) rest() string {
+	if p.pos >= len(p.input) {
+		return ""
+	}
+	return p.input[p.pos:]
+}
+
+// Get returns the unconsumed input.
+func (p Pointer) Get() string {
+	return p.rest()
+}
+
+// Remaining returns the number of unconsumed bytes, never less than zero.
+func (p Pointer) Remaining() int {
+	remaining := len(p.input) - p.pos
+	if remaining < 0 {
+		return 0
+	}
+	return remaining
+}
+
+// Next decodes one rune and returns it with the advanced Pointer, or EOF and p at the end.
+func (p Pointer) Next() (rune, Pointer) {
+	if p.pos >= len(p.input) {
+		return EOF, p
+	}
+	r, w := utf8.DecodeRuneInString(p.input[p.pos:])
+	return r, p.Advance(w)
+}
+
+// HasPrefix reports whether the unconsumed input starts with s.
+func (p Pointer) HasPrefix(s string) bool {
+	return strings.HasPrefix(p.rest(), s)
+}
+
+// Accept consumes one rune if it appears in valid; otherwise it returns "" and p unchanged.
+func (p Pointer) Accept(valid string) (string, Pointer) {
+	c, newP := p.Next()
+	if strings.ContainsRune(valid, c) {
+		return string(c), newP
+	}
+	return "", p
+}
+
+// AcceptRun consumes the longest prefix made only of runes that appear in valid.
+func (p Pointer) AcceptRun(valid string) (string, Pointer) {
+	rest := p.rest()
+	matched := 0
+	for matched < len(rest) {
+		r, w := utf8.DecodeRuneInString(rest[matched:])
+		if !strings.ContainsRune(valid, r) {
+			break
+		}
+		matched += w
+	}
+
+	return rest[:matched], p.Advance(matched)
+}
+
+// AcceptUntil consumes the longest prefix containing no rune that appears in invalid.
+func (p Pointer) AcceptUntil(invalid string) (string, Pointer) {
+	rest := p.rest()
+	matched := 0
+	for matched < len(rest) {
+		r, w := utf8.DecodeRuneInString(rest[matched:])
+		if strings.ContainsRune(invalid, r) {
+			break
+		}
+		matched += w
+	}
+
+	return rest[:matched], p.Advance(matched)
+}
diff --git a/pointer_test.go b/pointer_test.go
new file mode 100644
index 0000000..e8076d7
--- /dev/null
+++ b/pointer_test.go
@@ -0,0 +1,86 @@
+package parsec
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestPointer(t *testing.T) {
+	p := Pointer{"fooo", 0}
+
+	t.Run("Advances", func(t *testing.T) {
+		p2 := p.Advance(2)
+		require.Equal(t, Pointer{"fooo", 2}, p2)
+		require.Equal(t, Pointer{"fooo", 0}, p)
+		require.Equal(t, Pointer{"fooo", 3}, p2.Advance(1))
+	})
+
+	t.Run("Get", func(t *testing.T) {
+		require.Equal(t, "fooo", p.Get())
+		require.Equal(t, "ooo", p.Advance(1).Get())
+	})
+
+	t.Run("Remaining", func(t *testing.T) {
+		require.Equal(t, 4, p.Remaining())
+		require.Equal(t, 0, p.Advance(4).Remaining())
+		require.Equal(t, 0, p.Advance(10).Remaining())
+	})
+
+	t.Run("Next takes one character", func(t *testing.T) {
+		s, p2 := p.Next()
+		require.Equal(t, p.Advance(1), p2)
+		require.Equal(t, 'f', s)
+	})
+
+	t.Run("Next handles EOF", func(t *testing.T) {
+		s, p2 := p.Advance(5).Next()
+		require.Equal(t, p.Advance(5), p2)
+		require.Equal(t, EOF, s)
+	})
+
+	t.Run("HasPrefix", func(t *testing.T) {
+		require.True(t, p.HasPrefix("fo"))
+		require.False(t, p.HasPrefix("ooo"))
+		require.True(t, p.Advance(1).HasPrefix("ooo"))
+		require.False(t, p.Advance(1).HasPrefix("oooo"))
+	})
+
+	t.Run("Accept", func(t *testing.T) {
+		s, p2 := p.Accept("abcdef")
+		require.Equal(t, "f", s)
+		require.Equal(t, p.Advance(1), p2)
+
+		s, p2 = p.Accept("ooooo")
+		require.Equal(t, "", s)
+		require.Equal(t, p.Advance(0), p2)
+
+		s, p2 = p.Advance(4).Accept("ooooo")
+		require.Equal(t, "", s)
+		require.Equal(t, p.Advance(4), p2)
+	})
+
+	t.Run("AcceptRun", func(t *testing.T) {
+		s, p2 := p.AcceptRun("f")
+		require.Equal(t, "f", s)
+		require.Equal(t, p.Advance(1), p2)
+
+		s, p3 := p.AcceptRun("fo")
+		require.Equal(t, "fooo", s)
+		require.Equal(t, p.Advance(4), p3)
+
+		s, p4 := p3.AcceptRun("fo")
+		require.Equal(t, "", s)
+		require.Equal(t, p.Advance(4), p4)
+	})
+
+	t.Run("AcceptUntil", func(t *testing.T) {
+		s, p2 := p.AcceptUntil("o")
+		require.Equal(t, "f", s)
+		require.Equal(t, p.Advance(1), p2)
+
+		s, p3 := p2.AcceptRun("o")
+		require.Equal(t, "ooo", s)
+		require.Equal(t, p.Advance(4), p3)
+	})
+}