//! Forth script compiler: tokenizer and op-sequence compiler.
//! Path: Cagire/crates/forth/src/compiler.rs

use std::sync::Arc;
use super::ops::Op;
use super::types::{Dictionary, SourceSpan};
use super::words::compile_word;
/// A lexical token produced by `tokenize`, carrying the byte span
/// (`SourceSpan`) it was read from so later stages can attach source
/// locations to the ops they emit.
#[derive(Clone, Debug)]
enum Token {
    /// Integer literal (anything that parses as `i64`).
    Int(i64, SourceSpan),
    /// Float literal (parses as `f64` but not as `i64`).
    Float(f64, SourceSpan),
    /// Double-quoted string literal, with the quotes stripped.
    Str(String, SourceSpan),
    /// Any other whitespace-delimited word, including the single `;`.
    Word(String, SourceSpan),
}
/// Compile a full source string into a flat op sequence.
///
/// Tokenizes `input` and feeds the token stream to the compiler; any
/// `: name ... ;` definitions encountered along the way are inserted
/// into `dict`. Returns the first compile error as a `String`.
pub(super) fn compile_script(input: &str, dict: &Dictionary) -> Result<Vec<Op>, String> {
    compile(&tokenize(input), dict)
}
/// Split `input` into tokens, recording each token's byte span.
///
/// Lexical rules (as implemented below):
/// - whitespace separates tokens and is discarded;
/// - `(` and `)` are skipped wherever they start a token;
/// - `"..."` is a string literal — no escape syntax, and an unterminated
///   string silently runs to end of input;
/// - `;;` starts a comment running to end of line; a lone `;` becomes the
///   word token `;`;
/// - anything else is read up to the next whitespace and classified as an
///   integer, a float, or a plain word.
fn tokenize(input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut chars = input.char_indices().peekable();
    while let Some(&(pos, c)) = chars.peek() {
        // Skip whitespace between tokens.
        if c.is_whitespace() {
            chars.next();
            continue;
        }
        // Parentheses are dropped entirely; they never reach the compiler.
        if c == '(' || c == ')' {
            chars.next();
            continue;
        }
        if c == '"' {
            // String literal: collect characters up to the closing quote.
            // `end` tracks the byte offset one past the last consumed char
            // so the span covers both quotes when the string terminates.
            let start = pos;
            chars.next();
            let mut s = String::new();
            let mut end = start + 1;
            while let Some(&(i, ch)) = chars.peek() {
                end = i + ch.len_utf8();
                chars.next();
                if ch == '"' {
                    break;
                }
                s.push(ch);
            }
            tokens.push(Token::Str(s, SourceSpan { start, end }));
            continue;
        }
        if c == ';' {
            chars.next(); // consume first ;
            if let Some(&(_, ';')) = chars.peek() {
                // ;; starts a comment to end of line
                chars.next(); // consume second ;
                while let Some(&(_, ch)) = chars.peek() {
                    if ch == '\n' {
                        break;
                    }
                    chars.next();
                }
                continue;
            }
            // single ; is a word, create token
            tokens.push(Token::Word(
                ";".to_string(),
                SourceSpan {
                    start: pos,
                    end: pos + 1,
                },
            ));
            continue;
        }
        // General word: read until whitespace. Note the scan does NOT stop
        // at `"`, `(`, `)`, or `;`, so those characters are only special at
        // the start of a token.
        let start = pos;
        let mut word = String::new();
        let mut end = start;
        while let Some(&(i, ch)) = chars.peek() {
            if ch.is_whitespace() {
                break;
            }
            end = i + ch.len_utf8();
            word.push(ch);
            chars.next();
        }
        let span = SourceSpan { start, end };
        // Normalize shorthand float syntax: .25 -> 0.25, -.5 -> -0.5
        let word_to_parse = if word.starts_with('.')
            && word.len() > 1
            && word.as_bytes()[1].is_ascii_digit()
        {
            format!("0{word}")
        } else if word.starts_with("-.")
            && word.len() > 2
            && word.as_bytes()[2].is_ascii_digit()
        {
            format!("-0{}", &word[1..])
        } else {
            word.clone()
        };
        // Classify: integer first, then float, otherwise a plain word.
        // Word tokens keep the original (pre-normalization) spelling.
        if let Ok(i) = word_to_parse.parse::<i64>() {
            tokens.push(Token::Int(i, span));
        } else if let Ok(f) = word_to_parse.parse::<f64>() {
            tokens.push(Token::Float(f, span));
        } else {
            tokens.push(Token::Word(word, span));
        }
    }
    tokens
}
/// Compile a token slice into ops, consulting (and extending) `dict`.
///
/// Numeric literals may be shadowed by dictionary entries keyed on their
/// textual form; otherwise they compile to push ops. The structural words
/// `{`, `:`, and `if` are handled here; all other words are delegated to
/// `compile_word`, and an unrecognized word is a compile error.
fn compile(tokens: &[Token], dict: &Dictionary) -> Result<Vec<Op>, String> {
    let mut ops = Vec::new();
    let mut i = 0;
    while i < tokens.len() {
        match &tokens[i] {
            Token::Int(n, span) => {
                // A dictionary entry named e.g. "42" overrides the literal.
                let key = n.to_string();
                if let Some(body) = dict.lock().get(&key).cloned() {
                    ops.extend(body);
                } else {
                    ops.push(Op::PushInt(*n, Some(*span)));
                }
            }
            Token::Float(f, span) => {
                // Same override rule as Int, keyed on the f64's Display form.
                let key = f.to_string();
                if let Some(body) = dict.lock().get(&key).cloned() {
                    ops.extend(body);
                } else {
                    ops.push(Op::PushFloat(*f, Some(*span)));
                }
            }
            Token::Str(s, span) => ops.push(Op::PushStr(Arc::from(s.as_str()), Some(*span))),
            Token::Word(w, span) => {
                let word = w.as_str();
                if word == "{" {
                    // `consumed` covers the quotation body plus the closing
                    // `}`; the trailing `i += 1` below then steps past it.
                    let (quote_ops, consumed, end_span) =
                        compile_quotation(&tokens[i + 1..], dict)?;
                    i += consumed;
                    // Quotation span runs from `{` through `}` inclusive.
                    let body_span = SourceSpan {
                        start: span.start,
                        end: end_span.end,
                    };
                    ops.push(Op::Quotation(Arc::from(quote_ops), Some(body_span)));
                } else if word == "}" {
                    return Err("unexpected }".into());
                } else if word == ":" {
                    let (consumed, name, body) = compile_colon_def(&tokens[i + 1..], dict)?;
                    i += consumed;
                    dict.lock().insert(name, body);
                } else if word == ";" {
                    return Err("unexpected ;".into());
                } else if word == "if" {
                    let (then_ops, else_ops, consumed, then_span, else_span) =
                        compile_if(&tokens[i + 1..], dict)?;
                    i += consumed;
                    if else_ops.is_empty() {
                        // No else-branch: skip the then-branch on zero.
                        ops.push(Op::BranchIfZero(then_ops.len(), then_span, None));
                        ops.extend(then_ops);
                    } else {
                        // The +1 also skips the unconditional Branch that
                        // jumps the then-branch over the else-branch.
                        ops.push(Op::BranchIfZero(then_ops.len() + 1, then_span, else_span));
                        ops.extend(then_ops);
                        ops.push(Op::Branch(else_ops.len()));
                        ops.extend(else_ops);
                    }
                } else if !compile_word(word, Some(*span), &mut ops, dict) {
                    return Err(format!("unknown word: {word}"));
                }
            }
        }
        i += 1;
    }
    Ok(ops)
}
/// Compile the body of a `{ ... }` quotation.
///
/// `tokens` begins just after the opening `{`. Returns the compiled body,
/// the number of tokens consumed (body plus the closing `}`), and the span
/// of the closing `}` so the caller can compute the quotation's full span.
/// Errors if no matching `}` is found.
fn compile_quotation(
    tokens: &[Token],
    dict: &Dictionary,
) -> Result<(Vec<Op>, usize, SourceSpan), String> {
    // Scan forward for the matching `}`, tracking nested quotations, and
    // capture its index and span in one pass.
    let mut nesting: usize = 1;
    let mut close = None;
    for (idx, tok) in tokens.iter().enumerate() {
        if let Token::Word(word, span) = tok {
            if word == "{" {
                nesting += 1;
            } else if word == "}" {
                nesting -= 1;
                if nesting == 0 {
                    close = Some((idx, *span));
                    break;
                }
            }
        }
    }
    let (close_idx, close_span) = close.ok_or("missing }")?;
    let body = compile(&tokens[..close_idx], dict)?;
    Ok((body, close_idx + 1, close_span))
}
/// Return the source span attached to any token variant.
///
/// Always `Some` today; the `Option` return leaves room for span-less
/// tokens and composes with `Option`-returning callers like `tokens_span`.
fn token_span(tok: &Token) -> Option<SourceSpan> {
    let span = match tok {
        Token::Int(_, s) => s,
        Token::Float(_, s) => s,
        Token::Str(_, s) => s,
        Token::Word(_, s) => s,
    };
    Some(*span)
}
/// Compile a `: name ... ;` colon definition.
///
/// `tokens` begins just after the `:`. The first token names the word
/// (numeric and string tokens are coerced to their textual form); the
/// tokens up to the next `;` word form the body. Returns the token count
/// consumed (name, body, and terminating `;`), the name, and the compiled
/// body ops.
fn compile_colon_def(
    tokens: &[Token],
    dict: &Dictionary,
) -> Result<(usize, String, Vec<Op>), String> {
    let name = match tokens.first() {
        None => return Err("expected word name after ':'".into()),
        Some(Token::Word(w, _)) => w.clone(),
        Some(Token::Int(n, _)) => n.to_string(),
        Some(Token::Float(f, _)) => f.to_string(),
        Some(Token::Str(s, _)) => s.clone(),
    };
    // Locate the terminating `;` after the name. NOTE(review): the scan
    // does not account for nested `{ }` quotations — a `;` inside a
    // quotation would end the definition early; confirm this is intended.
    let semi_pos = tokens[1..]
        .iter()
        .position(|tok| matches!(tok, Token::Word(w, _) if w == ";"))
        .map(|rel| rel + 1)
        .ok_or("missing ';' in word definition")?;
    let body_ops = compile(&tokens[1..semi_pos], dict)?;
    Ok((semi_pos + 1, name, body_ops))
}
fn tokens_span(tokens: &[Token]) -> Option<SourceSpan> {
let first = tokens.first().and_then(token_span)?;
let last = tokens.last().and_then(token_span)?;
Some(SourceSpan {
start: first.start,
end: last.end,
})
}
/// Compile the branches of an `if ... [else ...] then` construct.
///
/// `tokens` begins just after the `if`. Returns the compiled then- and
/// else-branch ops (else empty when absent), the number of tokens consumed
/// (through the matching `then`), and the source span of each branch
/// (`None` when the branch is empty or absent).
///
/// Nested `if`s are matched by depth; `else` is only recognized at depth 1.
/// NOTE(review): a second `else` at depth 1 silently overwrites the first
/// (duplicate `else` is not reported) — confirm whether that is intended.
#[allow(clippy::type_complexity)]
fn compile_if(
    tokens: &[Token],
    dict: &Dictionary,
) -> Result<
    (
        Vec<Op>,
        Vec<Op>,
        usize,
        Option<SourceSpan>,
        Option<SourceSpan>,
    ),
    String,
> {
    let mut depth = 1;
    let mut else_pos = None;
    let mut then_pos = None;
    for (i, tok) in tokens.iter().enumerate() {
        if let Token::Word(w, _) = tok {
            match w.as_str() {
                "if" => depth += 1,
                "else" if depth == 1 => else_pos = Some(i),
                "then" => {
                    // `then` closes the innermost open `if`.
                    depth -= 1;
                    if depth == 0 {
                        then_pos = Some(i);
                        break;
                    }
                }
                _ => {}
            }
        }
    }
    let then_pos = then_pos.ok_or("missing 'then'")?;
    let (then_ops, else_ops, then_span, else_span) = if let Some(ep) = else_pos {
        // Split around `else`: [..ep] is the then-branch, (ep, then_pos)
        // is the else-branch; spans are computed before compiling.
        let then_slice = &tokens[..ep];
        let else_slice = &tokens[ep + 1..then_pos];
        let then_span = tokens_span(then_slice);
        let else_span = tokens_span(else_slice);
        let then_ops = compile(then_slice, dict)?;
        let else_ops = compile(else_slice, dict)?;
        (then_ops, else_ops, then_span, else_span)
    } else {
        let then_slice = &tokens[..then_pos];
        let then_span = tokens_span(then_slice);
        let then_ops = compile(then_slice, dict)?;
        (then_ops, Vec::new(), then_span, None)
    };
    Ok((then_ops, else_ops, then_pos + 1, then_span, else_span))
}