From 6d2f1ee907eaca7bc6f951c21ea30689aecf5586 Mon Sep 17 00:00:00 2001
From: Garrett Dickinson
Date: Thu, 16 Jun 2022 17:12:43 -0500
Subject: [PATCH] Add initial language spec, Scanner almost complete

---
 cobalt/Scanner.java   | 225 ++++++++++++++++++++++++++++++++++++++++++
 cobalt/TokenType.java |   2 +-
 lang_spec.txt         |  75 +++++++++++++++
 3 files changed, 301 insertions(+), 1 deletion(-)
 create mode 100644 lang_spec.txt

diff --git a/cobalt/Scanner.java b/cobalt/Scanner.java
index 1dfe59b..add62c1 100644
--- a/cobalt/Scanner.java
+++ b/cobalt/Scanner.java
@@ -13,11 +13,38 @@ class Scanner {
     private int start = 0;
     private int current = 0;
     private int line = 0;
+
+    // Reserved words mapped to their token types; consulted by identifier()
+    private static final Map<String, TokenType> keywords;
+    static {
+        keywords = new HashMap<>();
+        keywords.put("and", AND);
+        keywords.put("class", CLASS);
+        keywords.put("else", ELSE);
+        keywords.put("false", FALSE);
+        keywords.put("for", FOR);
+        keywords.put("func", FUNC);
+        keywords.put("if", IF);
+        keywords.put("nil", NIL);
+        keywords.put("or", OR);
+        keywords.put("print", PRINT);
+        keywords.put("return", RETURN);
+        keywords.put("super", SUPER);
+        keywords.put("this", THIS);
+        keywords.put("true", TRUE);
+        keywords.put("var", VAR);
+        keywords.put("while", WHILE);
+    }
 
+    // Constructor
+    // Set our source string to be the incoming character data from the
+    // input script
     Scanner(String source) {
         this.source = source;
     }
 
+    // Scan the input file for all available tokens, return a Token list with all
+    // of our valid tokens
     List<Token> scanTokens() {
         while (!isAtEnd()) {
             start = current;
@@ -28,7 +55,205 @@ class Scanner {
         return tokens;
     }
 
+    // Check to see if we have reached the end of the script
     private boolean isAtEnd() {
         return current >= source.length();
     }
+
+    // Parse the current token from the scanner to see if it's a valid
+    // lexeme. Report an error otherwise
+    private void scanToken() {
+        char c = advance();
+        switch (c) {
+            // Structural and Accessors
+            case '(': addToken(LEFT_PAREN); break;
+            case ')': addToken(RIGHT_PAREN); break;
+            case '{': addToken(LEFT_BRACE); break;
+            case '}': addToken(RIGHT_BRACE); break;
+            case ',': addToken(COMMA); break;
+            case '.': addToken(DOT); break;
+            case ';': addToken(SEMICOLON); break;
+
+            // Operators
+            case '*': addToken(STAR); break;
+            case '-': addToken(MINUS); break;
+            case '+': addToken(PLUS); break;
+            case '!':
+                addToken(match('=') ? BANG_EQUAL : BANG);
+                break;
+            case '=':
+                addToken(match('=') ? EQUAL_EQUAL : EQUAL);
+                break;
+            case '<':
+                addToken(match('=') ? LESS_EQUAL : LESS);
+                break;
+            case '>':
+                addToken(match('=') ? GREATER_EQUAL : GREATER);
+                break;
+            case '/':
+                if (match('/')) {
+                    // A comment goes until the end of the line
+                    while (peek() != '\n' && !isAtEnd()) advance();
+                } else {
+                    addToken(SLASH);
+                }
+                break;
+
+            // Whitespace and new lines
+            case ' ':
+            case '\r':
+            case '\t':
+                // Ignore whitespace characters
+                break;
+            case '\n':
+                line++;
+                break;
+
+            // String literals
+            case '"': string(); break;
+
+            default:
+                if (isDigit(c)) {
+                    // Check to see if our incoming value is part of a number
+                    number();
+                } else if (isAlpha(c)) {
+                    // Check to see if our incoming value is part of
+                    // a reserved word or identifier
+                    identifier();
+                } else {
+                    Cobalt.error(line, "Unexpected character.");
+                }
+                break;
+        }
+    }
+
+    // Determine if the char is a base 10 digit
+    private boolean isDigit(char c) {
+        return c >= '0' && c <= '9';
+    }
+
+
+    // Consume the rest of a number literal (integer part plus an optional
+    // fractional part) and add it to the Token list as a NUMBER
+    private void number() {
+        while (isDigit(peek())) advance();
+
+        // Look for a decimal place.
+        if (peek() == '.' && isDigit(peekNext())) {
+            // Consume the "."
+            advance();
+
+            while (isDigit(peek())) advance();
+        }
+
+        addToken(NUMBER, Double.parseDouble(source.substring(start, current)));
+    }
+
+
+    // TODO: Lox spec supports multiline strings, we'll need to
+    // probably remove that support since I don't intend Cobalt's
+    // grammar to support that (maybe) :/
+
+    // TODO: Escape sequences are not supported atm, for the
+    // love of god please implement this functionality. Probably
+    // should make an enum for the valid escape sequences, parse them
+    // out like we do with operators, and inject the actual escape
+    // sequence in the object that's returned to the interpreter
+
+    // Process the input line if quotation marks are found
+    // and we have a string literal
+    private void string() {
+        while (peek() != '"' && !isAtEnd()) {
+            if (peek() == '\n') line++;
+            advance();
+        }
+
+        if (isAtEnd()) {
+            Cobalt.error(line, "Unterminated string.");
+            return;
+        }
+
+        // Get closing quotes
+        advance();
+
+        // Trim the surrounding quotes
+        String value = source.substring(start + 1, current - 1);
+        addToken(STRING, value);
+    }
+
+
+    // Consume the rest of an alphanumeric lexeme, then add it to the Token
+    // list as a reserved word if it is one, or as an identifier otherwise
+    private void identifier() {
+        while (isAlphaNumeric(peek())) advance();
+
+        // Reserved words take priority over user-defined identifiers
+        String text = source.substring(start, current);
+        TokenType type = keywords.get(text);
+        if (type == null) type = IDENTIFIER;
+        addToken(type);
+    }
+
+
+    // Check out the next character in our input, but don't consume it
+    // This is mainly to process things like comments that take an entire line
+    private char peek() {
+        if (isAtEnd()) return '\0';
+        return source.charAt(current);
+    }
+
+
+    // Check out the next+1 character in our input, but don't consume it
+    // This is mainly to look past a '.' when scanning number literals
+    private char peekNext() {
+        if (current + 1 >= source.length()) return '\0';
+        return source.charAt(current + 1);
+    }
+
+
+    // Check to see if the character passed is within
+    // [a-z][A-Z] or is an underscore
+    private boolean isAlpha(char c) {
+        return (c >= 'a' && c <= 'z') ||
+               (c >= 'A' && c <= 'Z') ||
+                c == '_';
+    }
+
+
+    // Check to see if the character passed is within
+    // [a-z][A-Z][0-9] or is an underscore
+    private boolean isAlphaNumeric(char c) {
+        return isAlpha(c) || isDigit(c);
+    }
+
+
+    // Return a boolean based on if a char is found at the current cursor,
+    // then increment
+    private boolean match(char expected) {
+        if (isAtEnd()) return false;
+        if (source.charAt(current) != expected) return false;
+
+        current++;
+        return true;
+    }
+
+
+    // Advance the char pointer in the line scanner
+    private char advance() {
+        return source.charAt(current++);
+    }
+
+
+    // Add a token to the token List that does not have an object literal
+    // associated with it.
+    private void addToken(TokenType type) {
+        addToken(type, null);
+    }
+
+
+    // Add a token to the token List that has an object associated with it
+    private void addToken(TokenType type, Object literal) {
+        String text = source.substring(start, current);
+        tokens.add(new Token(type, text, literal, line));
+    }
 }
\ No newline at end of file
diff --git a/cobalt/TokenType.java b/cobalt/TokenType.java
index c73c928..b0c5ca4 100644
--- a/cobalt/TokenType.java
+++ b/cobalt/TokenType.java
@@ -13,7 +13,7 @@ enum TokenType {
     IDENTIFIER, STRING, NUMBER,
 
     // Keywords
-    AND, CLASS, ELSE, FALSE, FUN, FOR, IF, NIL, OR,
+    AND, CLASS, ELSE, FALSE, FUNC, FOR, IF, NIL, OR,
     PRINT, RETURN, SUPER, THIS, TRUE, VAR, WHILE,
 
     EOF
diff --git a/lang_spec.txt b/lang_spec.txt
new file mode 100644
index 0000000..401737b
--- /dev/null
+++ b/lang_spec.txt
@@ -0,0 +1,75 @@
+using cobalt.std;
+using cobalt.math;
+
+// Comments
+
+/*
+    Block Comments
+*/
+
+public class MyProgram {
+
+    // *** Instance Variables ***
+
+    // (Option 1)
+    // Since mutable variables are frequently used, you should have to specify if the variable is intended
+    // to be immutable/constant
+
+    let x: int = 0;         // Mutable type
+    let y: const int = 0;   // Immutable type
+
+
+    // (Option 2)
+    // Assume all variables are immutable, and only allow them to be mutable if specified. Helpful in compiled, memory
+    // safe languages, but probably not for an interpreted language that sits in memory?
+
+    let mut x: int = 0;     // Mutable type
+    let y: int = 0;         // Immutable type
+
+
+    // *** Main and Declaring functions ***
+
+    // I think scripts should work similar to python, but not be as
+    // funky/verbose with function names
+
+    // If there is no main function defined within the script, the interpreter should
+    // process the file sequentially like Python,
+    // Otherwise, the main function is run and operates like any normal program
+
+    // Best practice would have main return an integer, however it could be
+    // void or return any other type
+
+
+    // (Option 1, C/C++/Java style)
+    private int main1() {
+        // Some code
+        return 0;
+    }
+
+    // (Option 2, Swift/Rust style)
+    private func main() => int {
+        // Some code
+        return 0;
+    }
+
+    // (Option 3, Ada style)
+    private func main() returns int {
+        // Some code
+        return 0;
+    }
+
+    // *** Handling Multiple Main Methods ***
+
+    // I feel that classes should be able to have their own main methods, and it could be
+    // determined which one is the entry point by requiring the user to provide an entry point
+    //
+    // (Example)
+    //  -Multiple classes in one file, each with their own main method
+    //  -Specify the script along with the class when opening with Cobalt, and it will run that specific class
+    //  -Attempt to run the class's main method. Error if it doesn't exist
+    //  -If there are multiple mains and one isn't specified, just error
+    //
+    //  Ex: cobalt script.cblt --main MyClass
+    //
+    // This allows for multiple "sub programs" within a single Cobalt script
+}
\ No newline at end of file