Get the HTML parser working

This commit is contained in:
Adam Scarr 2017-08-06 17:02:39 +10:00
parent 8b343d6360
commit a65a9325aa
9 changed files with 216 additions and 147 deletions

View File

@ -1,5 +1,10 @@
package parsec
import (
"bytes"
"fmt"
)
// Nil is a parser that matches nothing: it returns a nil node and hands back
// the pointer it was given, unchanged.
func Nil(p Pointer) (Node, Pointer) {
return nil, p
}
@ -29,7 +34,7 @@ func And(parsers ...Parserish) Parser {
}
nodes = append(nodes, node)
}
return NewSequence(p.pos, nodes...), newP
return nodes, newP
}
}
@ -109,6 +114,53 @@ func manyImpl(min int, op Parserish, until Parserish, sep ...Parserish) Parser {
break
}
}
return NewSequence(p.pos, nodes...), newP
return nodes, newP
}
}
// Maybe makes parser optional.
//
// When the wrapped parser succeeds, its node and advanced pointer are passed
// through. When it yields an Error, the error is discarded and the result is
// a nil node together with the original, un-advanced pointer.
func Maybe(parser Parserish) Parser {
	inner := Parsify(parser)

	return func(p Pointer) (Node, Pointer) {
		if result, next := inner(p); !IsError(result) {
			return result, next
		}
		// Failure is not an error here: report "nothing matched" instead.
		return nil, p
	}
}
// Map runs parser and, on success, replaces its node with f(node).
//
// If the wrapped parser yields an Error, f is not called: the error is
// returned as-is along with the pointer the parser was started from.
func Map(parser Parserish, f func(n Node) Node) Parser {
	inner := Parsify(parser)

	return func(ptr Pointer) (Node, Pointer) {
		result, next := inner(ptr)
		if !IsError(result) {
			return f(result), next
		}
		// Propagate the error and rewind to where this parser began.
		return result, ptr
	}
}
// flatten collapses a parse result into a single string.
//
// A string node is returned unchanged. A []Node is flattened recursively and
// its pieces are concatenated in order. Any other node type is treated as a
// programming error and causes a panic whose message names the offending
// dynamic type.
func flatten(n Node) string {
	switch v := n.(type) {
	case string:
		return v
	case []Node:
		sbuf := &bytes.Buffer{}
		for _, node := range v {
			sbuf.WriteString(flatten(node))
		}
		return sbuf.String()
	default:
		// %T prints the dynamic type of n. The boolean verb %t would render
		// noise such as "%!t(int=1)" for any non-bool value.
		panic(fmt.Errorf("Dont know how to flatten %T", n))
	}
}
// Merge wraps parser so that a successful result is passed through flatten,
// turning a string or a (possibly nested) slice of nodes into one string.
func Merge(parser Parserish) Parser {
	toString := func(n Node) Node { return flatten(n) }
	return Map(parser, toString)
}

View File

@ -19,7 +19,7 @@ func TestAnd(t *testing.T) {
t.Run("matches sequence", func(t *testing.T) {
node, p2 := And("hello", WS, "world")(p)
require.Equal(t, NewSequence(0, NewToken(0, "hello"), NewToken(6, "world")), node)
require.Equal(t, []Node{"hello", "world"}, node)
require.Equal(t, "", p2.Get())
})
@ -34,12 +34,28 @@ func TestAnd(t *testing.T) {
})
}
// TestMaybe checks both outcomes of Maybe: a match is passed through, and a
// failed match becomes a nil node without consuming any input.
func TestMaybe(t *testing.T) {
	start := Pointer{"hello world", 0}

	t.Run("matches sequence", func(t *testing.T) {
		got, rest := Maybe("hello")(start)

		require.Equal(t, "hello", got)
		require.Equal(t, " world", rest.Get())
	})

	t.Run("returns no errors", func(t *testing.T) {
		got, rest := Maybe("world")(start)

		require.Equal(t, nil, got)
		require.Equal(t, 0, rest.pos)
	})
}
func TestAny(t *testing.T) {
p := Pointer{"hello world!", 0}
t.Run("Matches any", func(t *testing.T) {
node, p2 := Any("hello", "world")(p)
require.Equal(t, NewToken(0, "hello"), node)
require.Equal(t, "hello", node)
require.Equal(t, 5, p2.pos)
})
@ -69,40 +85,19 @@ func TestKleene(t *testing.T) {
t.Run("Matches sequence with sep", func(t *testing.T) {
node, p2 := Kleene(CharRun("abcdefg"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
NewToken(6, "d"),
NewToken(8, "e"),
), node)
require.Equal(t, []Node{"a", "b", "c", "d", "e"}, node)
require.Equal(t, 10, p2.pos)
})
t.Run("Matches sequence without sep", func(t *testing.T) {
node, p2 := Kleene(Any(CharRun("abcdefg"), Exact(",")))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(1, ","),
NewToken(2, "b"),
NewToken(3, ","),
NewToken(4, "c"),
NewToken(5, ","),
NewToken(6, "d"),
NewToken(7, ","),
NewToken(8, "e"),
NewToken(9, ","),
), node)
require.Equal(t, []Node{"a", ",", "b", ",", "c", ",", "d", ",", "e", ","}, node)
require.Equal(t, 10, p2.pos)
})
t.Run("Stops on error", func(t *testing.T) {
node, p2 := Kleene(CharRun("abc"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
), node)
require.Equal(t, []Node{"a", "b", "c"}, node)
require.Equal(t, 6, p2.pos)
require.Equal(t, "d,e,", p2.Get())
})
@ -113,40 +108,19 @@ func TestMany(t *testing.T) {
t.Run("Matches sequence with sep", func(t *testing.T) {
node, p2 := Many(CharRun("abcdefg"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
NewToken(6, "d"),
NewToken(8, "e"),
), node)
require.Equal(t, []Node{"a", "b", "c", "d", "e"}, node)
require.Equal(t, 10, p2.pos)
})
t.Run("Matches sequence without sep", func(t *testing.T) {
node, p2 := Many(Any(CharRun("abcdefg"), Exact(",")))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(1, ","),
NewToken(2, "b"),
NewToken(3, ","),
NewToken(4, "c"),
NewToken(5, ","),
NewToken(6, "d"),
NewToken(7, ","),
NewToken(8, "e"),
NewToken(9, ","),
), node)
require.Equal(t, []Node{"a", ",", "b", ",", "c", ",", "d", ",", "e", ","}, node)
require.Equal(t, 10, p2.pos)
})
t.Run("Stops on error", func(t *testing.T) {
node, p2 := Many(CharRun("abc"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
), node)
require.Equal(t, []Node{"a", "b", "c"}, node)
require.Equal(t, 6, p2.pos)
require.Equal(t, "d,e,", p2.Get())
})
@ -164,24 +138,13 @@ func TestKleeneUntil(t *testing.T) {
t.Run("Matches sequence with sep", func(t *testing.T) {
node, p2 := KleeneUntil(CharRun("abcde"), CharRun("d"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
), node)
require.Equal(t, []Node{"a", "b", "c"}, node)
require.Equal(t, 6, p2.pos)
})
t.Run("Breaks if separator does not match", func(t *testing.T) {
node, p2 := KleeneUntil(Char("abcdefg"), Char("y"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
NewToken(6, "d"),
NewToken(8, "e"),
NewToken(10, "f"),
), node)
require.Equal(t, []Node{"a", "b", "c", "d", "e", "f"}, node)
require.Equal(t, 11, p2.pos)
})
}
@ -191,11 +154,7 @@ func TestManyUntil(t *testing.T) {
t.Run("Matches sequence until", func(t *testing.T) {
node, p2 := ManyUntil(CharRun("abcdefg"), Char("d"), Exact(","))(p)
require.Equal(t, NewSequence(0,
NewToken(0, "a"),
NewToken(2, "b"),
NewToken(4, "c"),
), node)
require.Equal(t, []Node{"a", "b", "c"}, node)
require.Equal(t, 6, p2.pos)
})
@ -207,6 +166,48 @@ func TestManyUntil(t *testing.T) {
})
}
// htmlTag is the value TestMap's mapping function builds from a parsed
// "<name>" sequence.
type htmlTag struct {
// Name is the text matched between the angle brackets.
Name string
}
func TestMap(t *testing.T) {
parser := Map(And("<", Range("a-zA-Z0-9"), ">"), func(n Node) Node {
return htmlTag{n.([]Node)[1].(string)}
})
t.Run("sucess", func(t *testing.T) {
result, _ := parser(Pointer{"<html>", 0})
require.Equal(t, htmlTag{"html"}, result)
})
t.Run("error", func(t *testing.T) {
result, ptr := parser(Pointer{"<html", 0})
require.Equal(t, NewError(5, "Expected >"), result)
require.Equal(t, 0, ptr.pos)
})
}
// TestMerge checks that Merge joins a nested result into one string, that
// errors pass through unchanged, and that flatten panics on unsupported nodes.
func TestMerge(t *testing.T) {
	// nested refers to itself through a pointer so that it can recurse.
	var nested Parser
	nested = And("(", Maybe(&nested), ")")
	merged := Merge(nested)

	t.Run("sucess", func(t *testing.T) {
		got, _ := merged(Pointer{"((()))", 0})

		require.Equal(t, "((()))", got)
	})

	t.Run("error", func(t *testing.T) {
		got, rest := merged(Pointer{"((())", 0})

		require.Equal(t, NewError(5, "Expected )"), got)
		require.Equal(t, 0, rest.pos)
	})

	// An int is neither a string nor a []Node, so flatten must refuse it.
	require.Panics(t, func() { flatten(1) })
}
func assertNilParser(t *testing.T, parser Parser) {
p := Pointer{"fff", 0}
node, p2 := parser(p)

View File

@ -1,34 +0,0 @@
package main
import (
"fmt"
. "github.com/vektah/goparsify"
)
// html parses a single element starting at p: either a run of text or a tag
// with optional attributes, nested elements and a closing tag. The whole
// grammar is rebuilt from combinators on every call.
func html(p Pointer) (Node, Pointer) {
// An identifier is exactly one lowercase letter followed by a run of
// letters or digits.
identifier := And(Range("a-z", 1, 1), Range("a-zA-Z0-9"))
// NOTE(review): text presumably runs up to the next '<' or '>' — confirm
// against CharRunUntil.
text := CharRunUntil("<>")
// tag is declared before it is defined and referenced through a pointer so
// that element can refer to it recursively; it is assigned further down.
var tag Parser
element := Any(text, &tag)
elements := Kleene(element)
//attr := And(identifier, equal, String())
// Attribute values are hard-coded to the literal "test" (quotes included).
attr := And(identifier, "=", `"test"`)
attrws := And(attr, WS)
attrs := Kleene(attrws)
tstart := And("<", identifier, attrs, ">")
tend := And("</", identifier, ">")
tag = And(tstart, elements, tend)
return element(p)
}
// main parses a fixed sample document with the html parser, panics if
// ParseString reports an error, and prints the result in Go syntax (%#v).
func main() {
// The second return value of ParseString is deliberately ignored here.
result, _, err := ParseString(html, "<h1>hello world</h1>")
if err != nil {
panic(err)
}
fmt.Printf("%#v\n", result)
}

4
html/README.md Normal file
View File

@ -0,0 +1,4 @@
example html parser
===
This is a **very** rudimentary html parser that should be used as an example only.

52
html/html.go Normal file
View File

@ -0,0 +1,52 @@
package html
import . "github.com/vektah/goparsify"
// Parse runs the package's tag parser over input via ParseString and returns
// its three results unchanged: the parsed node, the remaining input, and any
// error.
func Parse(input string) (result Node, remaining string, err error) {
	result, remaining, err = ParseString(tag, input)
	return result, remaining, err
}
// Tag is the value the tag parser produces for one parsed element.
type Tag struct {
// Name is the identifier taken from the opening tag.
Name string
// Attributes maps each attribute name in the opening tag to its value.
Attributes map[string]string
// Body holds the nodes parsed between the opening and closing tags.
Body []Node
}
// Grammar for the example HTML parser, built from goparsify combinators.
var (
	// tag is assigned in init; element refers to it through a pointer so the
	// grammar can be recursive.
	tag Parser

	// identifier is one lowercase letter followed by letters or digits,
	// merged into a single string.
	identifier = Merge(And(Range("a-z", 1, 1), Range("a-zA-Z0-9", 0)))

	text     = CharRunUntil("<>")
	element  = Any(text, &tag)
	elements = Kleene(element)

	//attr := And(identifier, equal, String())
	attr  = And(identifier, WS, "=", WS, `"test"`)
	attrs = Map(Kleene(attr, WS), collectAttributes)

	tstart = And("<", identifier, attrs, ">")
	tend   = And("</", identifier, ">")
)

// collectAttributes turns the list of parsed attr nodes into a name-to-value
// map. Each attr node is a slice whose element 0 is the attribute name and
// whose element 2 is the value.
func collectAttributes(node Node) Node {
	pairs := node.([]Node)
	result := make(map[string]string, len(pairs))
	for _, pair := range pairs {
		fields := pair.([]Node)
		result[fields[0].(string)] = fields[2].(string)
	}
	return result
}
func init() {
tag = Map(And(tstart, elements, tend), func(node Node) Node {
nodes := node.([]Node)
openTag := nodes[0].([]Node)
return Tag{
Name: openTag[1].(string),
Attributes: openTag[2].(map[string]string),
Body: nodes[1].([]Node),
}
})
}

17
html/html_test.go Normal file
View File

@ -0,0 +1,17 @@
package html
import (
"testing"
"github.com/stretchr/testify/require"
. "github.com/vektah/goparsify"
)
func TestParse(t *testing.T) {
result, _, err := Parse("<body>hello <b>world</b></body>")
require.NoError(t, err)
require.Equal(t, Tag{Name: "body", Attributes: map[string]string{}, Body: []Node{
"hello ",
Tag{Name: "b", Attributes: map[string]string{}, Body: []Node{"world"}},
}}, result)
}

View File

@ -3,18 +3,6 @@ package parsec
import "fmt"
type Node interface {
Pos() int
}
type Token struct {
pos int
Value string
}
func (e Token) Pos() int { return e.pos }
func NewToken(pos int, value string) Token {
return Token{pos, value}
}
type Error struct {
@ -29,18 +17,7 @@ func NewError(pos int, message string) Error {
return Error{pos, message}
}
func IsError(n Node) bool {
func IsError(n interface{}) bool {
_, isErr := n.(Error)
return isErr
}
type Sequence struct {
pos int
Nodes []Node
}
func (e Sequence) Pos() int { return e.pos }
func NewSequence(pos int, n ...Node) Sequence {
return Sequence{pos, n}
}

View File

@ -67,7 +67,7 @@ func Exact(match string) Parser {
return NewError(p.pos, "Expected "+match), p
}
return NewToken(p.pos, match), p.Advance(len(match))
return match, p.Advance(len(match))
}
}
@ -79,7 +79,7 @@ func Char(match string) Parser {
return NewError(p.pos, "Expected one of "+string(match)), p
}
return NewToken(p.pos, string(r)), p.Advance(w)
return string(r), p.Advance(w)
}
}
@ -98,7 +98,7 @@ func CharRun(match string) Parser {
return NewError(p.pos, "Expected some of "+match), p
}
return NewToken(p.pos, p.input[p.pos:p.pos+matched]), p.Advance(matched)
return p.input[p.pos : p.pos+matched], p.Advance(matched)
}
}
@ -117,7 +117,7 @@ func CharRunUntil(match string) Parser {
return NewError(p.pos, "Expected some of "+match), p
}
return NewToken(p.pos, p.input[p.pos:p.pos+matched]), p.Advance(matched)
return p.input[p.pos : p.pos+matched], p.Advance(matched)
}
}
@ -177,7 +177,7 @@ func Range(r string, repetition ...int) Parser {
return NewError(p.pos+matched, fmt.Sprintf("Expected at least %d more of %s", min-matched, r)), p
}
return NewToken(p.pos, p.input[p.pos:p.pos+matched]), p.Advance(matched)
return p.input[p.pos : p.pos+matched], p.Advance(matched)
}
}

View File

@ -11,19 +11,19 @@ func TestParsify(t *testing.T) {
t.Run("strings", func(t *testing.T) {
node, _ := Parsify("ff")(p)
require.Equal(t, NewToken(0, "ff"), node)
require.Equal(t, "ff", node)
})
t.Run("parsers", func(t *testing.T) {
node, _ := Parsify(CharRun("f"))(p)
require.Equal(t, NewToken(0, "ff"), node)
require.Equal(t, "ff", node)
})
t.Run("parser funcs", func(t *testing.T) {
node, _ := Parsify(func(p Pointer) (Node, Pointer) {
return NewToken(0, "hello"), p
return "hello", p
})(p)
require.Equal(t, NewToken(0, "hello"), node)
require.Equal(t, "hello", node)
})
t.Run("*parsers", func(t *testing.T) {
@ -32,7 +32,7 @@ func TestParsify(t *testing.T) {
parser = CharRun("f")
node, _ := parserfied(p)
require.Equal(t, NewToken(0, "ff"), node)
require.Equal(t, "ff", node)
})
require.Panics(t, func() {
@ -44,7 +44,7 @@ func TestParsifyAll(t *testing.T) {
parsers := ParsifyAll("ff", "gg")
result, _ := parsers[0](Pointer{"ffooo", 0})
require.Equal(t, NewToken(0, "ff"), result)
require.Equal(t, "ff", result)
result, _ = parsers[1](Pointer{"ffooo", 0})
require.Equal(t, NewError(0, "Expected gg"), result)
@ -55,7 +55,7 @@ func TestExact(t *testing.T) {
t.Run("success", func(t *testing.T) {
node, p2 := Exact("fo")(p)
require.Equal(t, NewToken(0, "fo"), node)
require.Equal(t, "fo", node)
require.Equal(t, p.Advance(2), p2)
})
@ -71,7 +71,7 @@ func TestChar(t *testing.T) {
t.Run("success", func(t *testing.T) {
node, p2 := Char("fo")(p)
require.Equal(t, NewToken(0, "f"), node)
require.Equal(t, "f", node)
require.Equal(t, p.Advance(1), p2)
})
@ -87,7 +87,7 @@ func TestCharRun(t *testing.T) {
t.Run("success", func(t *testing.T) {
node, p2 := CharRun("fo")(p)
require.Equal(t, NewToken(0, "foo"), node)
require.Equal(t, "foo", node)
require.Equal(t, p.Advance(3), p2)
})
@ -103,7 +103,7 @@ func TestCharUntil(t *testing.T) {
t.Run("success", func(t *testing.T) {
node, p2 := CharRunUntil("z")(p)
require.Equal(t, NewToken(0, "foobar"), node)
require.Equal(t, "foobar", node)
require.Equal(t, p.Advance(6), p2)
})
@ -125,19 +125,19 @@ func TestWS(t *testing.T) {
func TestRange(t *testing.T) {
t.Run("full match", func(t *testing.T) {
node, p := Range("a-z")(Pointer{"foobar", 0})
require.Equal(t, NewToken(0, "foobar"), node)
require.Equal(t, "foobar", node)
require.Equal(t, "", p.Get())
})
t.Run("partial match", func(t *testing.T) {
node, p := Range("1-4d-a")(Pointer{"a1b2c3d4efg", 0})
require.Equal(t, NewToken(0, "a1b2c3d4"), node)
require.Equal(t, "a1b2c3d4", node)
require.Equal(t, "efg", p.Get())
})
t.Run("limited match", func(t *testing.T) {
node, p := Range("1-4d-a", 1, 2)(Pointer{"a1b2c3d4efg", 0})
require.Equal(t, NewToken(0, "a1"), node)
require.Equal(t, "a1", node)
require.Equal(t, "b2c3d4efg", p.Get())
})
@ -165,7 +165,7 @@ func TestRange(t *testing.T) {
func TestParseString(t *testing.T) {
t.Run("partial match", func(t *testing.T) {
result, remaining, err := ParseString("hello", "hello world")
require.Equal(t, NewToken(0, "hello"), result)
require.Equal(t, "hello", result)
require.Equal(t, " world", remaining)
require.NoError(t, err)
})