alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Scala example source code file (Lexer.scala)

This example Scala source code file (Lexer.scala) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example" TM.

Java - Scala tags/keywords

a, char, char, comma, delim, delim, eof, io, lexer, malformedinput, string, string, stringbuilder, token, token

The Scala Lexer.scala source code

package scala.tools.nsc.io

import java.io.{Reader, Writer, StringReader, StringWriter}
import scala.collection.mutable.{Buffer, ArrayBuffer}
import scala.math.BigInt

/** Companion object of class `Lexer` which defines tokens and some utility concepts
 *  used for tokens and lexers
 */
object Lexer {

  /** An exception raised if a if input does not correspond to what's expected
   *  @param   rdr   the lexer form which the bad input is read
   *  @param   msg   the error message
   */
  class MalformedInput(val rdr: Lexer, val msg: String) extends Exception("Malformed JSON input at "+rdr.tokenPos+": "+msg)

  /** The class of tokens, i.e. descriptions of input words (or: lexemes).
   *  @param str    the characters making up this token
   */
  class Token(val str: String) { 
    override def toString = str 
  }

  /** A subclass of `Token` representing single-character delimiters
   *  @param char the delimiter character making up this token
   */
  case class Delim(char: Char) extends Token("'"+char.toString+"'")

  /** A subclass of token representing integer literals */
  case class IntLit(override val str: String) extends Token(str)

  /** A subclass of token representing floating point literals */
  case class FloatLit(override val str: String) extends Token(str)

  /** A subclass of token representing string literals */
  case class StringLit(override val str: String) extends Token(str) { 
    override def toString = quoted(str) 
  }

  /** The `true` token */
  val TrueLit = new Token("true")

  /** The `false` token */
  val FalseLit = new Token("false")

  /** The `null` token */
  val NullLit = new Token("null")

  /** The '`(`' token */
  val LParen = new Delim('(')

  /** The '`(`' token */
  val RParen = new Delim(')')

  /** The '`{`' token */
  val LBrace = new Delim('{')

  /** The '`}`' token */
  val RBrace = new Delim('}')

  /** The '`[`' token */
  val LBracket = new Delim('[')

  /** The '`]`' token */
  val RBracket = new Delim(']')

  /** The '`,`' token */
  val Comma = new Delim(',')

  /** The '`:`' token */
  val Colon = new Delim(':')

  /** The token representing end of input */
  val EOF = new Token("<end of input>")

  private def toUDigit(ch: Int): Char = {
    val d = ch & 0xF
    (if (d < 10) d + '0' else d - 10 + 'A').toChar
  }

  private def addToStr(buf: StringBuilder, ch: Char) {
    ch match {
      case '"' => buf ++= "\\\""
      case '\b' => buf ++= "\\b"
      case '\f' => buf ++= "\\f"
      case '\n' => buf ++= "\\n"
      case '\r' => buf ++= "\\r"
      case '\t' => buf ++= "\\t"
      case '\\' => buf ++= "\\\\"
      case _ =>
        if (' ' <= ch && ch < 128) buf += ch
        else buf ++= "\\u" += toUDigit(ch >>> 12) += toUDigit(ch >>> 8) += toUDigit(ch >>> 4) += toUDigit(ch)
    }
  }

  /** Returns given string enclosed in `"`-quotes with all string characters escaped
   *  so that they correspond to the JSON standard.
   *  Characters that escaped are:  `"`, `\b`, `\f`, `\n`, `\r`, `\t`, `\`.
   *  Furthermore, every other character which is not in the ASCII range 32-127 is
   *  escaped as a four hex-digit unicode character of the form `\ u x x x x`.
   *  @param   str   the string to be quoted
   */
  def quoted(str: String): String = {
    val buf = new StringBuilder += '\"'
    str foreach (addToStr(buf, _))
    buf += '\"'
    buf.toString
  }
  
  private val BUF_SIZE = 2 << 16
}

import Lexer._

/** A simple lexer for tokens as they are used in JSON, plus parens `(`, `)`
 *  Tokens understood are:
 *
 *  `(`, `)`, `[`, `]`, `{`, `}`, `:`, `,`, `true`, `false`, `null`,
 *  strings (syntax as in JSON),
 *  integer numbers (syntax as in JSON: -?(0|\d+)
 *  floating point numbers (syntax as in JSON: -?(0|\d+)(\.\d+)?((e|E)(+|-)?\d+)?)
 *  The end of input is represented as its own token, EOF.
 *  Lexers can keep one token lookahead
 *
 * @param rd   the reader from which characters are read.
 */
class Lexer(rd: Reader) { 

  /** The last-read character */
  var ch: Char = 0

  /** The number of characters read so far */
  var pos: Long = 0

  /** The last-read token */
  var token: Token = _

  /** The number of characters read before the start of the last-read token */
  var tokenPos: Long = 0

  private var atEOF: Boolean = false
  private val buf = new Array[Char](BUF_SIZE)
  private var nread: Int = 0
  private var bp = 0

  /** Reads next character into `ch` */
  def nextChar() {
    assert(!atEOF)
    if (bp == nread) {
      nread = rd.read(buf) 
      bp = 0
      if (nread <= 0) { ch = 0; atEOF = true; return }
    }
    ch = buf(bp)
    bp += 1
    pos += 1
  }

  /** If last-read character equals given character, reads next character,
   *  otherwise raises an error
   *  @param  c   the given character to compare with last-read character
   *  @throws  MalformedInput if character does not match
   */
  def acceptChar(c: Char) = if (ch == c) nextChar() else error("'"+c+"' expected")

  private val sb = new StringBuilder

  private def putChar() {
    sb += ch; nextChar()
  }

  private def putAcceptString(str: String) { 
    str foreach acceptChar
    sb ++= str 
  }

  /** Skips whitespace and reads next lexeme into `token`
   *  @throws  MalformedInput if lexeme not recognized as a valid token
   */
  def nextToken() {
    sb.clear()
    while (!atEOF && ch <= ' ') nextChar()
    tokenPos = pos - 1
    if (atEOF) token = EOF
    else ch match {
      case '(' => putChar(); token = LParen
      case ')' => putChar(); token = RParen
      case '{' => putChar(); token = LBrace
      case '}' => putChar(); token = RBrace
      case '[' => putChar(); token = LBracket
      case ']' => putChar(); token = RBracket
      case ',' => putChar(); token = Comma
      case ':' => putChar(); token = Colon
      case 't' => putAcceptString("true"); token = TrueLit
      case 'f' => putAcceptString("false"); token = FalseLit
      case 'n' => putAcceptString("null"); token = NullLit
      case '"' => getString()
      case '-' | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => getNumber()
      case _ => error("unrecoginezed start of token: '"+ch+"'")
    }
    //println("["+token+"]")
  }

  /** Reads a string literal, and forms a `StringLit` token from it.
   *  Last-read input character `ch` must be opening `"`-quote.
   *  @throws  MalformedInput if lexeme not recognized as a string literal.
   */
  def getString() {
    def udigit() = {
      nextChar()
      if ('0' <= ch && ch <= '9') ch - '9'
      else if ('A' <= ch && ch <= 'F') ch - 'A' + 10
      else if ('a' <= ch && ch <= 'f') ch - 'a' + 10
      else error("illegal unicode escape character: '"+ch+"'")
    }
    val delim = ch
    nextChar()
    while (ch != delim && ch >= ' ') {
      if (ch == '\\') {
        nextChar()
        ch match {
          case '\'' => sb += '\'' 
          case '"' => sb += '"' 
          case '\\' => sb += '\\'
          case '/' => sb += '/'
          case 'b' => sb += '\b'
          case 'f' => sb += '\f'
          case 'n' => sb += '\n'
          case 'r' => sb += '\r'
          case 't' => sb += '\t'
          case 'u' => sb += (udigit() << 12 | udigit() << 8 | udigit() << 4 | udigit()).toChar
          case _ => error("illegal escape character: '"+ch+"'")
        }
        nextChar()
      } else {
        putChar()
      }
    }
    acceptChar(delim)
    token = StringLit(sb.toString)
  }

  /** Reads a numeric literal, and forms an `IntLit` or `FloatLit` token from it.
   *  Last-read input character `ch` must be either `-` or a digit.
   *  @throws  MalformedInput if lexeme not recognized as a numeric literal.
   */
  def getNumber() {
    def digit() =
      if ('0' <= ch && ch <= '9') putChar()
      else error("<digit> expected")
    def digits() =
      do { digit() } while ('0' <= ch && ch <= '9')
    var isFloating = false
    if (ch == '-') putChar()
    if (ch == '0') digit()
    else digits()
    if (ch == '.') {
      isFloating = true
      putChar()
      digits()
    }
    if (ch == 'e' || ch == 'E') {
      isFloating = true
      putChar()
      if (ch == '+' || ch == '-') putChar()
      digits()
    }
    token = if (isFloating) FloatLit(sb.toString) else IntLit(sb.toString)
  }

  /** If current token equals given token, reads next token, otherwise raises an error.
   *  @param  t   the given token to compare current token with
   *  @throws MalformedInput  if the two tokens do not match.
   */
  def accept(t: Token) {
    if (token == t) nextToken()
    else error(t+" expected, but "+token+" found")
  }

  /** The current token is a delimiter consisting of given character, reads next token,
   *  otherwise raises an error.
   *  @param  c   the given delimiter character to compare current token with
   *  @throws MalformedInput  if the the current token `token` is not a delimiter, or
   *                          consists of a character different from `c`.
   */
  def accept(ch: Char) {
    token match {
      case Delim(`ch`) => nextToken()
      case _ => accept(Delim(ch))
    }
  }

  /** Always throws a `MalformedInput` exception with given error message.
   *  @param msg  the error message
   */
  def error(msg: String) = throw new MalformedInput(this, msg)

  nextChar()
  nextToken()
}

Other Scala examples (source code examples)

Here is a short list of links related to this Scala Lexer.scala source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.