/*
	AMU: Custom simple markup language
	Copyright (C) 2021 Arsen Musayelyan

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

// Package scanner provides a scanner and tokenizer for AMU source code.
package scanner

import (
	"bufio"
	"bytes"
	"io"
	"unicode"
)

// Token represents a lexer token.
type Token int

// Token kinds produced by Scanner.Scan. The values are assigned by iota, so
// the order of this list determines each constant's numeric value.
const (
	// EOF is returned when no further rune can be read from the input.
	EOF Token = iota
	// EOL is a run of one or more line-ending runes ('\n' or '\r').
	EOL
	// WS is a run of whitespace that contains no line-ending runes.
	WS
	// WORD is a run of runes up to the next whitespace or punctuation rune.
	// It is also used for literals that turned out not to be a valid
	// heading or format span.
	WORD
	// HEADING is a run of one to six '#' runes.
	HEADING
	// FORMAT is a format span: text starting with one of the runes
	// * _ $ ` ~ and normally ending with the same rune.
	FORMAT
	// PUNCT is a single punctuation rune.
	PUNCT
)

// eof is the sentinel rune that Scanner.read returns when it could not read
// a rune from the input.
var eof rune = 0

// String converts a token into a string: the name of the constant for known
// tokens, or "unknown" for any other value.
func (t Token) String() string {
	switch t {
	case EOF:
		return "EOF"
	case EOL:
		return "EOL"
	case WS:
		return "WS"
	case WORD:
		return "WORD"
	case HEADING:
		return "HEADING"
	case FORMAT:
		return "FORMAT"
	case PUNCT:
		return "PUNCT"
	default:
		return "unknown"
	}
}

// Scanner implements a lexer for AMU source code.
type Scanner struct {
	// reader is the buffered source the scanner reads runes from.
	reader *bufio.Reader
}

// New creates a new Scanner.
func New(r io.Reader) *Scanner { switch r := r.(type) { case *bufio.Reader: return &Scanner{reader: r} case *bufio.ReadWriter: return &Scanner{reader: r.Reader} default: return &Scanner{reader: bufio.NewReader(r)} } } // read reads a single rule from the underlying bufio.Reader func (s *Scanner) read() rune { // Read rune from reader char, _, err := s.reader.ReadRune() if err != nil { return eof } return char } // unread unreads the last read rune from // the underlying bufio.Reader func (s *Scanner) unread() { _ = s.reader.UnreadRune() } // scanHeading attempts to scan a HEADING token func (s *Scanner) scanHeading() (Token, string) { // Create new buffer for token literal buf := &bytes.Buffer{} // Write first character to buffer buf.WriteRune(s.read()) for { // Read character char := s.read() if char == eof { break } else if char != '#' && !unicode.IsSpace(char) { // Unread character as this is not a valid heading s.unread() // Return literal as a WORD token return WORD, buf.String() } else if char != '#' { // Unread character as this is the end of the heading literal s.unread() break } else { // Write character to buffer buf.WriteRune(char) } } // If level more than 6 if buf.Len() > 6 { // Return literal as a WORD token as this is not a valid heading return WORD, buf.String() } // Return HEADING token return HEADING, buf.String() } // isEOL checks if char is an end of line character func isEOL(char rune) bool { return char == '\n' || char == '\r' } // scanEOL scans an EOL token func (s *Scanner) scanEOL() (Token, string) { // Create new buffer for token literal buf := &bytes.Buffer{} // Write first character to buffer buf.WriteRune(s.read()) for { // Read character char := s.read() if char == eof { break } else if !isEOL(char) { // Unread character as this is the beginning of the next line s.unread() break } else { // Write character to buffer buf.WriteRune(char) } } // Return EOL token return EOL, buf.String() } // scanWord scans a WORD token func (s *Scanner) 
scanWord() (Token, string) { // Create new buffer for token literal buf := &bytes.Buffer{} // Write first character to buffer buf.WriteRune(s.read()) for { // Read character char := s.read() if char == eof { break } else if unicode.IsSpace(char) || unicode.IsPunct(char) { // Unread as this is the end of the word s.unread() break } // Write character to buffer buf.WriteRune(char) } // Return WORD token return WORD, buf.String() } // scanWhitespace scans a WS token func (s *Scanner) scanWhitespace() (Token, string) { // Create new buffer for token literal buf := &bytes.Buffer{} // Write first character to the buffer buf.WriteRune(s.read()) for { // Read character char := s.read() if char == eof { break } else if !unicode.IsSpace(char) || isEOL(char) { // Unread as this is the end of the whitespace s.unread() break } else { // Write character to buffer buf.WriteRune(char) } } // Return WS token return WS, buf.String() } // isFormatRune checks whether char is a // format character func isFormatRune(char rune) bool { return char == '*' || char == '_' || char == '$' || char == '`' || char == '~' } // scanFormat scans a FORMAT token func (s *Scanner) scanFormat() (Token, string) { // Store format rule for use later formatRune := s.read() // Create new buffer for token literal buf := &bytes.Buffer{} // Write first character to buffer buf.WriteRune(formatRune) for { // Read character char := s.read() if char == eof { break } else if isEOL(char) { // Unread as this is not a valid format s.unread() // Return literal as WORD token return WORD, buf.String() } else if char == formatRune { // Write character to buffer buf.WriteRune(char) // Stop scanning as this is the end of the format break } else { // Write character to buffer buf.WriteRune(char) } } // Return FORMAt token return FORMAT, buf.String() } // scanPunct scans a PUNCT token func (s *Scanner) scanPunct() (Token, string) { return PUNCT, string(s.read()) } // Scan scans a single token from the input func (s *Scanner) 
Scan() (Token, string) { // read character char := s.read() // Unread character as it will be // needed by future functions s.unread() // Run appropriate scan function and return result if isEOL(char) { return s.scanEOL() } else if unicode.IsSpace(char) { return s.scanWhitespace() } else if char == '#' { return s.scanHeading() } else if isFormatRune(char) { return s.scanFormat() } else if unicode.IsPunct(char) { return s.scanPunct() } else if char != eof { return s.scanWord() } else { return EOF, "" } }