cobalt-lang/cobalt/lang/Scanner.java

317 lines
9.5 KiB
Java

package cobalt.lang;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static cobalt.lang.TokenType.*;
class Scanner {
private final String source;
private final List<Token> tokens = new ArrayList<>();
private int start = 0;
private int current = 0;
private int line = 0;
private static final Map<String, TokenType> keywords;
static {
keywords = new HashMap<>();
keywords.put("and", AND);
keywords.put("class", CLASS);
keywords.put("else", ELSE);
keywords.put("false", FALSE);
keywords.put("for", FOR);
keywords.put("func", FUNC);
keywords.put("if", IF);
keywords.put("nil", NIL);
keywords.put("or", OR);
keywords.put("print", PRINT);
keywords.put("return", RETURN);
keywords.put("super", SUPER);
keywords.put("this", THIS);
keywords.put("true", TRUE);
keywords.put("var", VAR);
keywords.put("while", WHILE);
}
// Constructor
// Set our source string to be the incoming character data from the
// input script
Scanner(String source) {
this.source = source;
}
// Scan the input file for all available tokens, return a Token list with all
// of our valid tokens
List<Token> scanTokens() {
while (!isAtEnd()) {
start = current;
scanToken();
}
tokens.add(new Token(EOF, "", null, line));
return tokens;
}
// Check to see if we have reached the end of the script
private boolean isAtEnd() {
return current >= source.length();
}
// Parse the current token from the scanner to see if its a valid
// lexeme. Report an error otherwise
private void scanToken() {
char c = advance();
switch (c) {
// Structural and Accessors
case '(': addToken(LEFT_PAREN); break;
case ')': addToken(RIGHT_PAREN); break;
case '{': addToken(LEFT_BRACE); break;
case '}': addToken(RIGHT_BRACE); break;
case ',': addToken(COMMA); break;
case '.': addToken(DOT); break;
case ';': addToken(SEMICOLON); break;
// Operators
case '*': addToken(STAR); break;
case '-': addToken(MINUS); break;
case '+': addToken(PLUS); break;
case '!':
addToken(match('=') ? BANG_EQUAL : BANG);
break;
case '=':
addToken(match('=') ? EQUAL_EQUAL : EQUAL);
break;
case '<':
addToken(match('=') ? LESS_EQUAL : LESS);
break;
case '>':
addToken(match('=') ? GREATER_EQUAL : GREATER);
break;
case '/':
if (match('/')) {
// A comment goes until the end of the line
while (peek() != '\n' && !isAtEnd()) advance();
} else {
addToken(SLASH);
}
break;
// Whitespace and new lines
case ' ':
case '\r':
case '\t':
//ignore whitespace characters
break;
case '\n':
line++;
break;
case '"':
string();
break;
default:
if (isDigit(c)) {
// Check to see if our incoming value is part of a number
// Switch on base prefix (hex, binary, base10...)
switch (peek()) {
case 'x':
advance(); // Advance to disregard the '0x' prefix
hex();
break;
case 'b':
advance(); // Advance to disregard the '0b' prefix
binary();
break;
case 'o':
advance(); // Advance to disregard the '0o' prefix
octal();
break;
default:
number();
break;
}
} else if (isAlpha(c)) {
// Check to see if our incoming value is part of
// a reserved word or identifier
identifier();
} else {
Cobalt.error(line, "Unexpected character.");
}
break;
}
}
// Determine if the char is a base 10 digit
private boolean isDigit(char c) {
return c >= '0' && c <= '9';
}
// Determine if the character is a valid base 16 digit
private boolean isHex(char c) {
return (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
}
// Determine if the character is a valid base 16 digit
private boolean isBinary(char c) {
return (c >= '0' && c <= '1');
}
// Determine if the character is a valid base 8 digit
private boolean isOctal(char c) {
return (c >= '0' && c <= '7');
}
// Parse and tokenize the value as a base 10 number
private void number() {
while (isDigit(peek())) advance();
// Look for a decimal place.
if (peek() == '.' && isDigit(peekNext())) {
// Consume the .
advance();
while (isDigit(peek())) advance();
}
addToken(NUMBER, Double.parseDouble(source.substring(start, current)));
}
// Parse and tokenize the value as a hexadecimal number, store in base 10
private void hex() {
while (isDigit(peek()) || isHex(peek())) advance();
addToken(NUMBER, Long.parseLong(source.substring(start + 2, current), 16));
}
// Parse and tokenize the value as a binary number, store in base 10
private void binary() {
while (isBinary(peek())) advance();
addToken(NUMBER, Long.parseLong(source.substring(start + 2, current), 2));
}
// Parse and tokenize the value as an octal number, store in base 10
private void octal() {
while (isOctal(peek())) advance();
addToken(NUMBER, Long.parseLong(source.substring(start + 2, current), 8));
}
// TODO: Lox spec supports multiline strings, we'll need to
// probably remove that support since I don't intend Cobalt's
// grammar to support that (maybe) :/
// TODO: Escape sequences are not supported atm, for the
// love of god please implement this functionality. Probably
// should make an enum for the valid escape sequences, parse them
// out like we do with operators, and inject the actual escape
// sequence in the object thats returned to the interpreter
// Process the input line if quotation marks are found
// and we have a string literal
private void string() {
while (peek() != '"' && !isAtEnd()) {
if (peek() == '\n') line++;
advance();
}
if (isAtEnd()) {
Cobalt.error(line, "Unterminated string.");
return;
}
// Get closing quotes
advance();
// Trim the
String value = source.substring(start + 1, current - 1);
addToken(STRING, value);
}
// Determine if the infoming token is alphanumeric, and
// add it to the Token list if it is valid
private void identifier() {
while (isAlphaNumeric(peek())) advance();
String text = source.substring(start, current);
TokenType type = keywords.get(text);
if (type == null) type = IDENTIFIER;
addToken(type);
}
// Checkout the next character in our input, but dont consume it
// This is mainly to process things like comments that take an entire line
private char peek() {
if (isAtEnd()) return '\0';
return source.charAt(current);
}
// Checkout the next+1 character in our input, but dont consume it
// This is mainly to process things like comments that take an entire line
private char peekNext() {
if (current + 1 >= source.length()) return '\0';
return source.charAt(current + 1);
}
// Check to see if the character passed is within
// [a-z][A-Z]
private boolean isAlpha(char c) {
return (c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
c == '_';
}
// Check to see if the character passed is within
// [a-z][A-Z][0-9]
private boolean isAlphaNumeric(char c) {
return isAlpha(c) || isDigit(c);
}
// Return a boolean based on if a char is found at the current cursor,
// then increment
private boolean match(char expected) {
if (isAtEnd()) return false;
if (source.charAt(current) != expected) return false;
current++;
return true;
}
// Advance the char pointer in the line scanner
private char advance() {
return source.charAt(current++);
}
// Add a token to the token List that does not have an object literal
// associated with it.
private void addToken(TokenType type) {
addToken(type, null);
}
// Add a token to the token List that has an object associated with it
private void addToken(TokenType type, Object literal) {
String text = source.substring(start, current);
tokens.add(new Token(type, text, literal, line));
}
}