From cd73181a620253f8282c744757cef57b215a3f4e Mon Sep 17 00:00:00 2001 From: Christopher Ramey Date: Mon, 18 Jan 2021 07:29:38 -0900 Subject: [PATCH] improved tokenizer, added tests --- config/config.go | 2 +- config/parser.go | 78 ++++---- config/testdata/comments-inline.tok | 5 + config/testdata/comments.tok | 11 +- config/testdata/quotes-multiline.tok | 6 + config/testdata/quotes.tok | 9 +- ...simple-broken.tok => simple-multiline.tok} | 0 config/testdata/simple-spaces.tok | 1 + config/testdata/simple.tok | 1 - config/tokenizer.go | 180 +++++++++++------- config/tokenizer_test.go | 40 +++- 11 files changed, 197 insertions(+), 136 deletions(-) create mode 100644 config/testdata/comments-inline.tok create mode 100644 config/testdata/quotes-multiline.tok rename config/testdata/{simple-broken.tok => simple-multiline.tok} (100%) create mode 100644 config/testdata/simple-spaces.tok delete mode 100644 config/testdata/simple.tok diff --git a/config/config.go b/config/config.go index 88d34ec..4ab5ea4 100644 --- a/config/config.go +++ b/config/config.go @@ -1,9 +1,9 @@ package config import ( + "alrm/alarm" "fmt" "time" - "alrm/alarm" ) type Config struct { diff --git a/config/parser.go b/config/parser.go index 6d014b1..3c79b56 100644 --- a/config/parser.go +++ b/config/parser.go @@ -8,13 +8,13 @@ import ( ) const ( - TK_NONE = iota - TK_SET - TK_MONITOR - TK_GROUP - TK_HOST - TK_CHECK - TK_ALARM + PR_NONE = iota + PR_SET + PR_MONITOR + PR_GROUP + PR_HOST + PR_CHECK + PR_ALARM ) type Parser struct { @@ -39,20 +39,20 @@ func (p *Parser) Parse(fn string) (*Config, error) { tk := tok.Text() stateswitch: switch p.state() { - case TK_NONE: + case PR_NONE: switch strings.ToLower(tk) { case "monitor": - p.setState(TK_MONITOR) + p.setState(PR_MONITOR) case "set": - p.setState(TK_SET) + p.setState(PR_SET) case "alarm": - p.setState(TK_ALARM) + p.setState(PR_ALARM) default: return nil, fmt.Errorf("invalid token in %s, line %d: \"%s\"", fn, tok.Line(), tk) } - case TK_SET: + case 
PR_SET: key := strings.ToLower(tk) if !tok.Scan() { return nil, fmt.Errorf("empty value name for set in %s, line %d", @@ -76,20 +76,20 @@ func (p *Parser) Parse(fn string) (*Config, error) { } p.prevState() - case TK_MONITOR: + case PR_MONITOR: switch strings.ToLower(tk) { case "host": - p.setState(TK_HOST) + p.setState(PR_HOST) case "group": - p.setState(TK_GROUP) + p.setState(PR_GROUP) default: p.prevState() goto stateswitch } - case TK_GROUP: + case PR_GROUP: if p.lastGroup == nil { p.lastGroup, err = config.NewGroup(tk) if err != nil { @@ -102,14 +102,14 @@ func (p *Parser) Parse(fn string) (*Config, error) { switch strings.ToLower(tk) { case "host": - p.setState(TK_HOST) + p.setState(PR_HOST) default: p.prevState() goto stateswitch } - case TK_HOST: + case PR_HOST: // If a host has no group, inherit the host name if p.lastGroup == nil { p.lastGroup, err = config.NewGroup(tk) @@ -139,14 +139,14 @@ func (p *Parser) Parse(fn string) (*Config, error) { p.lastHost.Address = tok.Text() case "check": - p.setState(TK_CHECK) + p.setState(PR_CHECK) default: p.prevState() goto stateswitch } - case TK_CHECK: + case PR_CHECK: if p.lastCheck == nil { p.lastCheck, err = p.lastHost.NewCheck(tk) if err != nil { @@ -166,7 +166,7 @@ func (p *Parser) Parse(fn string) (*Config, error) { goto stateswitch } - case TK_ALARM: + case PR_ALARM: if p.lastAlarm == nil { if p.lastAlarmName == "" { p.lastAlarmName = tk @@ -204,19 +204,19 @@ func (p *Parser) Parse(fn string) (*Config, error) { func (p *Parser) state() int { if len(p.states) < 1 { - return TK_NONE + return PR_NONE } return p.states[len(p.states)-1] } func (p *Parser) setState(state int) { switch state { - case TK_SET, TK_MONITOR: + case PR_SET, PR_MONITOR: fallthrough - case TK_GROUP: + case PR_GROUP: p.lastGroup = nil fallthrough - case TK_HOST: + case PR_HOST: p.lastHost = nil p.lastCheck = nil } @@ -239,20 +239,20 @@ func (p *Parser) prevState() int { func (p *Parser) stateName() string { switch p.state() { - case TK_NONE: 
- return "TK_NONE" - case TK_SET: - return "TK_SET" - case TK_MONITOR: - return "TK_MONITOR" - case TK_GROUP: - return "TK_GROUP" - case TK_HOST: - return "TK_HOST" - case TK_CHECK: - return "TK_CHECK" - case TK_ALARM: - return "TK_ALARM" + case PR_NONE: + return "PR_NONE" + case PR_SET: + return "PR_SET" + case PR_MONITOR: + return "PR_MONITOR" + case PR_GROUP: + return "PR_GROUP" + case PR_HOST: + return "PR_HOST" + case PR_CHECK: + return "PR_CHECK" + case PR_ALARM: + return "PR_ALARM" default: return "UNKNOWN" } diff --git a/config/testdata/comments-inline.tok b/config/testdata/comments-inline.tok new file mode 100644 index 0000000..23827b0 --- /dev/null +++ b/config/testdata/comments-inline.tok @@ -0,0 +1,5 @@ +one #one +"two#three" +# "three" +four +# EOF diff --git a/config/testdata/comments.tok b/config/testdata/comments.tok index e252882..9f65962 100644 --- a/config/testdata/comments.tok +++ b/config/testdata/comments.tok @@ -1,5 +1,6 @@ -# one two three -one two three -#four five six -four five six #seven eight nine -# EOF +# one +one +#two +two + # three + three diff --git a/config/testdata/quotes-multiline.tok b/config/testdata/quotes-multiline.tok new file mode 100644 index 0000000..1a74782 --- /dev/null +++ b/config/testdata/quotes-multiline.tok @@ -0,0 +1,6 @@ +"one +two" 'three +four' + +`five + six` diff --git a/config/testdata/quotes.tok b/config/testdata/quotes.tok index e3353d8..d2b3977 100644 --- a/config/testdata/quotes.tok +++ b/config/testdata/quotes.tok @@ -1,8 +1,3 @@ -"one" "two three" +"one" 'two' `three` -"four five" - -" #six" "" "seven" "ei" "ght" - -"multi -line" +`four` 'five' "six" diff --git a/config/testdata/simple-broken.tok b/config/testdata/simple-multiline.tok similarity index 100% rename from config/testdata/simple-broken.tok rename to config/testdata/simple-multiline.tok diff --git a/config/testdata/simple-spaces.tok b/config/testdata/simple-spaces.tok new file mode 100644 index 0000000..9cc4dbe --- /dev/null +++ 
b/config/testdata/simple-spaces.tok @@ -0,0 +1 @@ +one two three four five six diff --git a/config/testdata/simple.tok b/config/testdata/simple.tok deleted file mode 100644 index 00d4c39..0000000 --- a/config/testdata/simple.tok +++ /dev/null @@ -1 +0,0 @@ -one two three four five six diff --git a/config/tokenizer.go b/config/tokenizer.go index 13e86ea..44ab978 100644 --- a/config/tokenizer.go +++ b/config/tokenizer.go @@ -2,26 +2,37 @@ package config import ( "bufio" - "fmt" + "io" "os" + "strings" + "unicode" +) + +const ( + TK_NONE = iota + TK_VAL + TK_QUOTE + TK_COMMENT ) type Tokenizer struct { - line int + curline int + repline int file *os.File - scanner *bufio.Scanner + reader *bufio.Reader + text string + err error } func NewTokenizer(fn string) (*Tokenizer, error) { var err error - tk := &Tokenizer{line: 1} + tk := &Tokenizer{curline: 1} tk.file, err = os.Open(fn) if err != nil { return nil, err } - tk.scanner = bufio.NewScanner(tk.file) - tk.scanner.Split(tk.Split) + tk.reader = bufio.NewReader(tk.file) return tk, nil } @@ -30,93 +41,114 @@ func (t *Tokenizer) Close() error { } func (t *Tokenizer) Scan() bool { - return t.scanner.Scan() -} - -func (t *Tokenizer) Text() string { - return t.scanner.Text() -} - -func (t *Tokenizer) Line() int { - return t.line -} - -func (t *Tokenizer) Err() error { - return t.scanner.Err() -} - -func (t *Tokenizer) Split(data []byte, atEOF bool) (int, []byte, error) { - if atEOF && len(data) == 0 { - return 0, nil, nil - } + t.repline = t.curline + state := TK_NONE + t.text = "" + + var b strings.Builder + var quo rune + for { + var r rune + r, _, t.err = t.reader.ReadRune() + if t.err != nil { + break + } - var ignoreline bool - var started bool - var startidx int - var quote byte - - for i := 0; i < len(data); i++ { - c := data[i] - //fmt.Printf("%c (%t) (%t)\n", c, started, ignoreline) - switch c { - case '\f', '\n', '\r': - if started { - return i, data[startidx:i], nil + switch state { + case TK_NONE: + // When 
between values, increment both the reported line + // and the current line, since there's not yet anything + // to report + if r == '\n' { + t.repline++ + t.curline++ } - t.line++ - if ignoreline { - ignoreline = false + // If we're between values and we encounter a space + // or a control character, ignore it + if unicode.IsSpace(r) || unicode.IsControl(r) { continue } - fallthrough - case ' ', '\t', '\v': - if started && quote == 0 { - return i + 1, data[startidx:i], nil + // If we're between values and we encounter a #, it's + // the beginning of a comment + if r == '#' { + state = TK_COMMENT + continue } - case '\'', '"', '`': - // When the quote ends - if quote == c { - // if we've gotten data, return it - if started { - return i + 1, data[startidx:i], nil - } - // if we haven't return nothing - return i + 1, []byte{}, nil + // If we're between values and we get a quote character + // treat it as the beginning of a string literal + if r == '"' || r == '\'' || r == '`' { + state = TK_QUOTE + quo = r + continue } - // start a quoted string - if !ignoreline && quote == 0 { - quote = c + b.WriteRune(r) + state = TK_VAL + + case TK_VAL: + // In values, only increment the current line, so + // if an error is reported, it reports the line + // the value starts on + if r == '\n' { + t.curline++ } - case '#': - if !started { - ignoreline = true + // If we're in a normal value and we encounter a space + // or a control character, end value + if unicode.IsSpace(r) || unicode.IsControl(r) { + goto end + } + b.WriteRune(r) + + case TK_QUOTE: + // In quotes, only increment the current line, so + // if an error is reported, it reports the line + // the quoted value starts on + if r == '\n' { + t.curline++ } - default: - if !ignoreline && !started { - started = true - startidx = i + // End this quote if it's another quote of the same rune + if r == quo { + goto end + } + b.WriteRune(r) + + case TK_COMMENT: + // Comments are ignored, until a new line is encountered + // at which
point, increment the current and reported line + if r == '\n' { + t.curline++ + t.repline++ + state = TK_NONE } + continue } } - if atEOF { - if quote != 0 { - return 0, nil, fmt.Errorf("unterminated quote") - } - - if ignoreline { - return len(data), nil, nil - } - if started { - return len(data), data[startidx:], nil +end: + if t.err == nil || t.err == io.EOF { + if b.Len() > 0 { + t.text = b.String() } } + return t.err == nil +} - return 0, nil, nil +func (t *Tokenizer) Text() string { + return t.text +} + +func (t *Tokenizer) Line() int { + return t.repline +} + +func (t *Tokenizer) Err() error { + if t.err == io.EOF { + return nil + } + return t.err } diff --git a/config/tokenizer_test.go b/config/tokenizer_test.go index b726e1f..bf6c31b 100644 --- a/config/tokenizer_test.go +++ b/config/tokenizer_test.go @@ -1,26 +1,48 @@ package config import ( - "testing" "encoding/json" + "testing" ) -func TestTokenizer(t *testing.T) { - runTest(t, "simple", +func TestSimpleSpaces(t *testing.T) { + runTest(t, "simple-spaces", `[["one","two","three","four","five","six"]]`, ) - runTest(t, "simple-broken", +} + +func TestSimpleMultiline(t *testing.T) { + runTest(t, "simple-multiline", `[["one","two","three"],["four","five"],[],[],["six"]]`, ) +} + +func TestQuotes(t *testing.T) { + runTest(t, "quotes", + `[["one","two","three"],[],["four","five","six"]]`, + ) +} + +func TestQuotesMultiline(t *testing.T) { + runTest(t, "quotes-multiline", + `[["one\ntwo"],["three\nfour"],[],[],["five\n six"]]`, + ) +} + +func TestComments(t *testing.T) { runTest(t, "comments", - `[[],["one","two","three"],[],["four","five","six"]]`, + `[[],["one"],[],["two"],[],["three"]]`, ) - runTest(t, "quotes", - `[["one","two three",[],["four five"],[],[" #six","","seven","ei","ght"],[],["multi\nline"]]`, +} + +func TestCommentsInline(t *testing.T) { + runTest(t, "comments-inline", + `[["one"],["two#three"],[],["four"]]`, ) } func runTest(t *testing.T, bn string, exp string) { + t.Logf("Running 
testdata/%s.tok.. ", bn) tok, err := NewTokenizer("testdata/" + bn + ".tok") if err != nil { t.Fatalf("%s", err.Error()) @@ -49,7 +71,7 @@ func runTest(t *testing.T, bn string, exp string) { if exp != string(out) { t.Logf("Expected: %s", exp) - t.Logf("Got: %s", out) - t.Fail() + t.Logf("Got: %s", out) + t.FailNow() } }