From cd3733b7767d6400a6b3c2ba226a0abb7f0a50c7 Mon Sep 17 00:00:00 2001
From: Windel Bouwman <windel.bouwman@demcon.com>
Date: Tue, 30 Apr 2024 22:19:55 +0200
Subject: [PATCH 1/4] First draft of lark based parsing

---
 src/plcdoc/parsing/__init__.py  |  13 +++
 src/plcdoc/parsing/lexer.py     | 145 +++++++++++++++++++++++++++++
 src/plcdoc/parsing/nodes.py     | 112 ++++++++++++++++++++++
 src/plcdoc/parsing/parser.py    | 136 +++++++++++++++++++++++++++
 src/plcdoc/parsing/transform.py | 159 ++++++++++++++++++++++++++++++++
 5 files changed, 565 insertions(+)
 create mode 100644 src/plcdoc/parsing/__init__.py
 create mode 100644 src/plcdoc/parsing/lexer.py
 create mode 100644 src/plcdoc/parsing/nodes.py
 create mode 100644 src/plcdoc/parsing/parser.py
 create mode 100644 src/plcdoc/parsing/transform.py

diff --git a/src/plcdoc/parsing/__init__.py b/src/plcdoc/parsing/__init__.py
new file mode 100644
index 0000000..99b7fa4
--- /dev/null
+++ b/src/plcdoc/parsing/__init__.py
@@ -0,0 +1,13 @@
+from .parser import parse_new
+
+
+def parse_str(text: str):
+    return parse_new(text)
+
+
+def parse_file(filename: str):
+    with open(filename, "r") as f:
+        return parse_new(f.read())
+
+
+__all__ = ["parse_str", "parse_file"]
diff --git a/src/plcdoc/parsing/lexer.py b/src/plcdoc/parsing/lexer.py
new file mode 100644
index 0000000..9c347a6
--- /dev/null
+++ b/src/plcdoc/parsing/lexer.py
@@ -0,0 +1,145 @@
+"""
+PLC lexer.
+
+"""
+
+import re
+
+from dataclasses import dataclass
+import lark.lexer
+
+
+class MyLexer(lark.lexer.Lexer):
+    """Lark lexer adapter around `tokenize` and `token_filter`."""
+
+    def __init__(self, lexer_conf):
+        pass
+
+    def lex(self, source):
+        for tok in token_filter(tokenize(source)):
+            yield lark.lexer.Token(tok.kind, tok)
+
+
+@dataclass
+class Token:
+    kind: str
+    text: str
+    row: int
+    column: int
+    comment1: str
+
+
+def tokenize(source: str):
+    """Yield a `Token` for every lexeme in `source`, followed by an `EOF` token."""
+    # Note that order is important below:
+    token_spec = [
+        ("COMMENT1", r"\(\*.*?\*\)"),
+        ("COMMENT2", r"//[^\n]*"),
+        ("OP2", r"(:=)|(==)|(<=)|(!=)|(>=)|(\.\.)"),
+        ("OP", r"[<>=:;,\.\(\)\[\]\+\-\*\/]"),
+        ("BIN_NUMBER", r"2#[0-1][0-1_]*"),
+        ("OCT_NUMBER", r"8#[0-7]+"),
+        ("HEX_NUMBER", r"16#[0-9a-fA-F][0-9a-fA-F_]*"),
+        ("TIME", r"T#[0-9a-fA-F][0-9a-fA-F_]*"),
+        ("REAL", r"[0-9]+\.[0-9]+"),
+        ("NUMBER", r"[0-9]+"),
+        ("ID", r"[A-Za-z_][A-Za-z_0-9]*"),
+        ("STRING", r"'[^']*'"),
+        ("SPACE", r"[ \t\r]+"),
+        ("ATTRIBUTE", r"\{attribute.*?\}"),
+        ("NEWLINE", r"\n"),
+        ("OTHER", r"."),
+    ]
+    op_names = {
+        ":=": "COLON_EQUALS",
+        ":": "COLON",
+        ";": "SEMI",
+        ",": "COMMA",
+        ".": "DOT",
+        "..": "DOTDOT",
+        "+": "PLUS",
+        "-": "MINUS",
+        "*": "ASTERIX",
+        "/": "SLASH",
+        "{": "BRACE_OPEN",
+        "}": "BRACE_CLOSE",
+        "(": "PARENTHESIS_OPEN",
+        ")": "PARENTHESIS_CLOSE",
+        "[": "BRACKET_OPEN",
+        "]": "BRACKET_CLOSE",
+    }
+
+    regex = "|".join(f"(?P<{name}>{pattern})" for name, pattern in token_spec)
+    row, line_start = 1, 0  # 1-based row and the offset in `source` where it starts
+    for mo in re.finditer(regex, source, re.MULTILINE | re.DOTALL):
+        kind: str = mo.lastgroup
+        value = mo.group()
+        column = mo.start() - line_start + 1  # 1-based, like `row`
+        if kind == "OP" or kind == "OP2":
+            if value not in op_names:  # lexed (e.g. "<="), but no token name yet
+                raise ValueError(f"Unsupported operator: {value}")
+            kind = op_names[value]
+        elif kind == "ID" and value in KEYWORDS:
+            kind = "KW_" + value
+        elif kind == "OTHER":
+            c = value
+            if not value.isprintable():
+                c = str(value.encode(encoding="utf-8", errors="replace"))
+            raise ValueError(f"Unexpected character: {c}")
+
+        yield Token(kind, value, row, column, "")
+        if "\n" in value:  # newline tokens, but also multi-line block comments
+            row += value.count("\n")
+            line_start = mo.start() + value.rfind("\n") + 1
+    yield Token("EOF", "EOF", row, len(source) - line_start + 1, "")
+
+
+KEYWORDS = {
+    # Identifiers listed here are emitted as `KW_<name>` tokens instead of `ID`.
+    # Declaration kinds
+    "FUNCTION",
+    "FUNCTION_BLOCK",
+    "INTERFACE",
+    "METHOD",
+    "PROGRAM",
+    "PROPERTY",
+    # Modifiers
+    "ABSTRACT", "FINAL", "INTERNAL", "PRIVATE", "PROTECTED", "PUBLIC",
+    "EXTENDS",
+    # Type definitions
+    "TYPE", "END_TYPE",
+    "STRUCT", "END_STRUCT",
+    "UNION", "END_UNION",
+    # Type expressions
+    "ARRAY",
+    "OF",
+    "POINTER",
+    "REFERENCE",
+    "STRING",
+    "TO",
+    # Variable lists
+    "VAR",
+    "VAR_GLOBAL",
+    "VAR_INPUT",
+    "VAR_OUTPUT",
+    "END_VAR",
+}
+
+
+def token_filter(tokens):
+    """Drop layout tokens and attach block comments to the token that follows.
+
+    Whitespace, newlines, `//` comments and `{attribute ...}` pragmas are
+    discarded. The text of the most recent `(* ... *)` comment is stored in the
+    `comment1` field of the next token that is passed on.
+    """
+    ignored = ("SPACE", "NEWLINE", "COMMENT2", "ATTRIBUTE")
+    pending = ""
+    for token in tokens:
+        if token.kind == "COMMENT1":
+            pending = token.text
+        elif token.kind not in ignored:
+            token.comment1 = pending
+            yield token
+            # A block comment is handed to a single token only
+            pending = ""
diff --git a/src/plcdoc/parsing/nodes.py b/src/plcdoc/parsing/nodes.py
new file mode 100644
index 0000000..04271ae
--- /dev/null
+++ b/src/plcdoc/parsing/nodes.py
@@ -0,0 +1,112 @@
+""" Parsed AST nodes.
+
+"""
+
+from typing import Optional, Any, Union
+from dataclasses import dataclass
+
+
+@dataclass
+class Function:
+    comment1: str
+    kind: str
+    name: str
+    variable_lists: list["VariableList"]
+
+
+# @dataclass
+# class FunctionBlock:
+#     name: str
+
+@dataclass
+class Property:
+    name: str
+    ty: "Type"
+    # init: Optional["Expression"]
+
+@dataclass
+class VariableList:
+    kind: str
+    variables: list["Variable"]
+
+
+@dataclass
+class Variable:
+    name: str
+    ty: "Type"
+    init: Optional["Expression"]
+
+
+@dataclass
+class TypeDef:
+    name: str
+    ty: "Type"
+
+
+class Type:
+    pass
+
+
+@dataclass
+class Struct(Type):
+    fields: list["StructField"]
+
+
+@dataclass
+class Union(Type):
+    fields: list["StructField"]
+
+
+StructField = Variable
+
+
+@dataclass
+class Enum(Type):
+    options: list["EnumOption"]
+
+
+@dataclass
+class EnumOption:
+    name: str
+    init: Optional["Expression"]  # initializer expression, None when absent
+
+
+@dataclass
+class LabeledArgument:
+    label: str
+    value: "Expression"
+
+class Expression:
+    pass
+
+
+@dataclass
+class Binop(Expression):
+    lhs: "Expression"
+    op: str
+    rhs: "Expression"
+
+
+@dataclass
+class Number(Expression):
+    value: str  # literal text as it appears in the source (not converted)
+
+
+@dataclass
+class NameRef(Expression):
+    name: str
+
+
+@dataclass
+class TypeRef:
+    name: str
+
+@dataclass
+class Array:
+    ranges: list["Range"]
+    element_type: "Type"
+
+@dataclass
+class Range:
+    begin: "Expression"
+    end: "Expression"
diff --git a/src/plcdoc/parsing/parser.py b/src/plcdoc/parsing/parser.py
new file mode 100644
index 0000000..5069366
--- /dev/null
+++ b/src/plcdoc/parsing/parser.py
@@ -0,0 +1,136 @@
+""" 
+Lark based PLC parser.
+"""
+
+
+from .lexer import MyLexer
+from .transform import MyTransformer
+from pprint import pprint
+import logging
+import lark
+
+logger = logging.getLogger(__name__)
+
+
+def parse_new(text: str):
+    """Parse a single PLC declaration.
+
+    Returns what the transformer made of the `start` rule.
+    """
+    # Trace through the module logger: library code must not write to stdout.
+    logger.debug("Parsing:\n%s", text)
+    tree = parser.parse(text)
+    logger.debug("PARSED: %r", tree)
+    return tree
+
+
+grammar = """
+start: declaration EOF
+
+declaration: function | property | type_def | variable_list
+
+function: (KW_PROGRAM | KW_FUNCTION_BLOCK | KW_FUNCTION | KW_METHOD | KW_INTERFACE) visibility ID (COLON type)? extends? SEMI? variable_lists
+property: KW_PROPERTY visibility ID COLON type
+extends: KW_EXTENDS ID
+visibility: (KW_ABSTRACT | KW_PUBLIC | KW_PRIVATE | KW_PROTECTED | KW_INTERNAL | KW_FINAL)?
+variable_lists: variable_list*
+variable_list: (KW_VAR_INPUT | KW_VAR_OUTPUT | KW_VAR_GLOBAL | KW_VAR) variable* KW_END_VAR
+variable: ID COLON type initializer? SEMI
+
+type_def: KW_TYPE ID extends? COLON (struct | union | enum) KW_END_TYPE
+struct: KW_STRUCT variable* KW_END_STRUCT
+union: KW_UNION variable* KW_END_UNION
+enum: PARENTHESIS_OPEN enum_values PARENTHESIS_CLOSE SEMI
+enum_values: enum_value
+           | enum_values COMMA enum_value
+enum_value: ID initializer?
+
+initializer: COLON_EQUALS expression
+labeled_arguments: labeled_argument
+                 | labeled_arguments COMMA labeled_argument
+labeled_argument: ID COLON_EQUALS expression
+
+expressions: expression
+           | expressions COMMA expression
+expression: term
+          | expression (PLUS | MINUS) term
+term: factor
+    | term (ASTERIX | SLASH) factor
+factor: atom
+atom: literal
+    | name_ref
+    | struct_literal
+    | range_literal
+    | PARENTHESIS_OPEN expression PARENTHESIS_CLOSE
+name_ref: ID
+struct_literal: PARENTHESIS_OPEN labeled_arguments PARENTHESIS_CLOSE
+range_literal: PARENTHESIS_OPEN expression DOTDOT expression PARENTHESIS_CLOSE
+literal: NUMBER
+       | REAL
+       | BIN_NUMBER
+       | OCT_NUMBER
+       | HEX_NUMBER
+       | STRING
+
+type: name_ref
+    | string_type
+    | array_type
+    | pointer_type
+    | reference_type
+string_type: KW_STRING
+           | KW_STRING PARENTHESIS_OPEN expression PARENTHESIS_CLOSE
+           | KW_STRING BRACKET_OPEN expression BRACKET_CLOSE
+array_type: KW_ARRAY BRACKET_OPEN subranges BRACKET_CLOSE KW_OF type
+pointer_type: KW_POINTER KW_TO ID
+reference_type: KW_REFERENCE KW_TO ID
+
+subranges: subrange
+         | subranges COMMA subrange
+subrange: ASTERIX
+        | expression DOTDOT expression
+
+%declare KW_ABSTRACT
+%declare KW_PROGRAM
+%declare KW_FUNCTION
+%declare KW_FUNCTION_BLOCK
+%declare KW_INTERFACE
+%declare KW_METHOD
+%declare KW_PROPERTY
+%declare KW_EXTENDS
+%declare KW_FINAL
+%declare KW_PUBLIC
+%declare KW_PRIVATE
+%declare KW_PROTECTED
+%declare KW_INTERNAL
+%declare KW_TYPE
+%declare KW_END_TYPE
+%declare KW_POINTER
+%declare KW_STRUCT
+%declare KW_END_STRUCT
+%declare KW_UNION
+%declare KW_END_UNION
+%declare KW_STRING
+%declare KW_ARRAY
+%declare KW_OF
+%declare KW_REFERENCE
+%declare KW_TO
+%declare KW_VAR_GLOBAL
+%declare KW_VAR_INPUT
+%declare KW_VAR_OUTPUT
+%declare KW_VAR
+%declare KW_END_VAR
+
+%declare ID
+%declare NUMBER REAL BIN_NUMBER OCT_NUMBER HEX_NUMBER
+%declare STRING
+%declare COLON_EQUALS
+%declare COLON SEMI COMMA DOT DOTDOT
+%declare PLUS MINUS ASTERIX SLASH
+%declare BRACE_OPEN BRACE_CLOSE
+%declare PARENTHESIS_OPEN PARENTHESIS_CLOSE
+%declare BRACKET_OPEN BRACKET_CLOSE
+%declare EOF
+
+"""
+
+parser = lark.Lark(grammar, parser="lalr", transformer=MyTransformer(), lexer=MyLexer)
diff --git a/src/plcdoc/parsing/transform.py b/src/plcdoc/parsing/transform.py
new file mode 100644
index 0000000..78a487a
--- /dev/null
+++ b/src/plcdoc/parsing/transform.py
@@ -0,0 +1,159 @@
+import lark
+from . import nodes as ast
+
+
+class MyTransformer(lark.Transformer):
+    """Convert the lark parse tree into the node classes of the `nodes` module."""
+
+    def start(self, rhs):
+        return rhs[0]
+
+    def declaration(self, rhs):
+        return rhs[0]
+
+    def visibility(self, rhs):
+        return 1
+
+    def function(self, rhs):
+        comment1 = rhs[0].value.comment1
+        kind = rhs[0].value.text.lower().replace("_", "")
+        index = 1
+        if isinstance(rhs[index], int):  # placeholder returned by `visibility`
+            index += 1
+        name = rhs[index].value.text
+        variable_lists = rhs[-1]
+        return ast.Function(comment1, kind, name, variable_lists)
+
+    def property(self, rhs):
+        name = rhs[2].value.text
+        ty = rhs[4]
+        return ast.Property(name, ty)
+
+    def type_def(self, rhs):
+        name = rhs[1].value.text
+        ty = rhs[-2]
+        return ast.TypeDef(name, ty)
+
+    def enum(self, rhs):
+        options = rhs[1]
+        return ast.Enum(options)
+
+    def enum_values(self, rhs):
+        return comma(rhs)
+
+    def enum_value(self, rhs):
+        name = rhs[0].value.text
+        init = rhs[1] if len(rhs) > 1 else None
+        return ast.EnumOption(name, init)
+
+    def struct(self, rhs):
+        fields = rhs[1:-1]
+        return ast.Struct(fields)
+
+    def union(self, rhs):
+        fields = rhs[1:-1]
+        return ast.Union(fields)
+
+    def variable_lists(self, rhs):
+        return rhs
+
+    def variable_list(self, rhs):
+        kind = rhs[0].value.text
+        variables = rhs[1:-1]
+        return ast.VariableList(kind, variables)
+
+    def variable(self, rhs):
+        name = rhs[0].value.text
+        ty = rhs[2]
+        init = rhs[3] if len(rhs) > 4 else None
+        return ast.Variable(name, ty, init)
+
+    def initializer(self, rhs):
+        return rhs[1]
+
+    def labeled_arguments(self, rhs):
+        return comma(rhs)
+
+    def labeled_argument(self, rhs):
+        label = rhs[0].value.text
+        value = rhs[2]
+        return ast.LabeledArgument(label, value)
+
+    def expressions(self, rhs):
+        return comma(rhs)
+
+    def expression(self, rhs):
+        return binop(rhs)
+
+    def term(self, rhs):
+        return binop(rhs)
+
+    def factor(self, rhs):
+        # Without this callback lark keeps a `Tree('factor', ...)` wrapper
+        # around every operand instead of passing the atom through.
+        return rhs[0]
+
+    def atom(self, rhs):
+        if len(rhs) == 1:
+            return rhs[0]
+        else:
+            assert len(rhs) == 3
+            return rhs[1]
+
+    def literal(self, rhs):
+        value = rhs[0].value.text
+        return ast.Number(value)
+
+    def struct_literal(self, rhs):
+        return rhs[1]
+
+    def range_literal(self, rhs):
+        begin = rhs[1]
+        end = rhs[3]
+        return ast.Range(begin, end)
+
+    def name_ref(self, rhs):
+        name = rhs[0].value.text
+        return ast.NameRef(name)
+
+    def type(self, rhs):
+        return rhs[0]
+
+    def string_type(self, rhs):
+        name = rhs[0].value.text
+        return ast.TypeRef(name)
+
+    def array_type(self, rhs):
+        ranges = rhs[2]
+        element_type = rhs[5]
+        return ast.Array(ranges, element_type)
+
+    def subranges(self, rhs):
+        return comma(rhs)
+
+    def subrange(self, rhs):
+        if len(rhs) == 1:
+            return None
+        else:
+            begin = rhs[0]
+            end = rhs[2]
+            return ast.Range(begin, end)
+
+
+def binop(rhs) -> ast.Binop:
+    """Pass a single child through, or combine `lhs, operator, rhs` children."""
+    if len(rhs) == 1:
+        return rhs[0]
+
+    assert len(rhs) == 3
+    # The operator is a lexer token; only its source text is kept in the node
+    left, op_token, right = rhs
+    return ast.Binop(left, op_token.value.text, right)
+
+
+def comma(rhs):
+    """Handle a rule with one or more items, separated by commas"""
+    if len(rhs) == 1:
+        return [rhs[0]]
+    else:
+        return rhs[0] + [rhs[2]]

From a40b858391527103992ef4e685183a09166e36c9 Mon Sep 17 00:00:00 2001
From: Windel Bouwman <windel.bouwman@demcon.com>
Date: Fri, 3 May 2024 06:48:12 +0200
Subject: [PATCH 2/4] Add data class for variable declarations.

---
 src/plcdoc/documenters.py |  35 ++--
 src/plcdoc/interpreter.py | 341 ++++++++++++++++++++++++--------------
 2 files changed, 232 insertions(+), 144 deletions(-)

diff --git a/src/plcdoc/documenters.py b/src/plcdoc/documenters.py
index 4f67467..ebb7824 100644
--- a/src/plcdoc/documenters.py
+++ b/src/plcdoc/documenters.py
@@ -13,7 +13,7 @@
 )
 from docutils.statemachine import StringList
 
-from .interpreter import PlcInterpreter, PlcDeclaration, TextXMetaClass
+from .interpreter import PlcInterpreter, PlcDeclaration, PlcVariableDeclaration
 
 logger = logging.getLogger(__name__)
 
@@ -172,7 +172,7 @@ def format_name(self) -> str:
     def format_args(self, **kwargs: Any) -> Optional[str]:
         """Format arguments for signature, based on auto-data."""
 
-        arg_strs = [f"{var.name}" for var in self.object.get_args()]
+        arg_strs = [f"{var.name}" for var in self.object.args]
 
         return "(" + ", ".join(arg_strs) + ")"
 
@@ -205,10 +205,10 @@ def add_content(self, more_content: Optional[StringList]) -> None:
 
         # Also add VARs from meta-model
         args_block = []
-        for var in self.object.get_args():
-            line_param = f":{var.kind} {var.type.name} {var.name}:"
-            if var.comment and var.comment.text:
-                line_param += " " + var.comment.text
+        for var in self.object.args:
+            line_param = f":{var.kind} {var.ty} {var.name}:"
+            if var.comment:
+                line_param += " " + var.comment
             args_block.append(line_param)
 
         if args_block:
@@ -230,7 +230,7 @@ def get_doc(self) -> Optional[List[List[str]]]:
         """Get docstring from the meta-model."""
 
         # Read main docblock
-        comment_str = self.object.get_comment()
+        comment_str = self.object.comment
         if not comment_str:
             return []
 
@@ -393,10 +393,9 @@ def document_members(self, all_members: bool = False) -> None:
         member_documenters = [
             PlcStructMemberDocumenter(
                 self.directive,
-                member.name,
-                self.indent,
-                parent=self.object,
                 member=member,
+                indent=self.indent,
+                parent=self.object,
             )
             for member in self.object.members
         ]
@@ -431,12 +430,11 @@ class PlcStructMemberDocumenter(PlcDataDocumenter):
     def __init__(
         self,
         directive,
-        name: str,
+        member: PlcVariableDeclaration,
         indent: str = "",
         parent: PlcDeclaration = None,
-        member: Optional[TextXMetaClass] = None,
     ) -> None:
-        super().__init__(directive, name, indent)
+        super().__init__(directive, member.name, indent)
 
         self.object = parent
         self.member = member
@@ -444,23 +442,22 @@ def __init__(
     @classmethod
     def can_document_member(
         cls,
-        member: Union[PlcDeclaration, Any],
+        member: PlcVariableDeclaration,
         membername: str,
         isattr: bool,
         parent: Any,
     ) -> bool:
-        return type(member).__name__ == "Variable"
-        # Note: a TextX variable class is passed, not a complete PlcDeclaration
+        return isinstance(member, PlcVariableDeclaration) and member.kind == "member"
 
     def import_object(self, raiseerror: bool = False) -> bool:
         return self.member is not None  # Expect member through constructor
 
     def get_doc(self) -> Optional[List[List[str]]]:
         # Read main docblock
-        if self.member is None or self.member.comment is None:
+        if self.member is None:
             return []
 
-        comment_str = self.member.comment.text
+        comment_str = self.member.comment
         if not comment_str:
             return []
 
@@ -471,7 +468,7 @@ def format_signature(self, **kwargs: Any) -> str:
             return ""
 
         # Insert the known variable type
-        return f" : {self.member.type.name}"
+        return f" : {self.member.ty}"
 
 
 class PlcFolderDocumenter(PlcDataDocumenter):
diff --git a/src/plcdoc/interpreter.py b/src/plcdoc/interpreter.py
index 8bf8788..560f519 100644
--- a/src/plcdoc/interpreter.py
+++ b/src/plcdoc/interpreter.py
@@ -2,9 +2,12 @@
 
 import os
 from typing import List, Dict, Optional, Any
+from dataclasses import dataclass
 from glob import glob
 import logging
+from .parsing import parse_new, nodes as ast
 import xml.etree.ElementTree as ET
+
 from textx import metamodel_from_file, TextXSyntaxError
 
 PACKAGE_DIR = os.path.dirname(__file__)
@@ -137,42 +140,45 @@ def _parse_file(self, filepath) -> bool:
             # Name is repeated inside the declaration, use it from there instead
             # name = item.attrib["Name"]
 
-            object_model = self._parse_declaration(item)
-            if object_model is None:
+            obj = self._parse_declaration(item, filepath)
+            if obj is None:
                 continue
 
-            obj = PlcDeclaration(object_model, filepath)
-
             # Methods are inside their own subtree with a `Declaration` - simply append
             # them to the object
             for node in item:
                 if node.tag in ["Declaration", "Implementation"]:
                     continue
-                method_model = self._parse_declaration(node)
-                if method_model is None:
+                method = self._parse_declaration(node, filepath)
+                if method is None:
                     continue
-                method = PlcDeclaration(method_model, filepath)
                 obj.add_child(method)
 
             self._add_model(obj)
 
         return True
 
-    def _parse_declaration(self, item) -> Optional["TextXMetaClass"]:
+    def _parse_declaration(self, item, filepath) -> Optional["PlcDeclaration"]:
         declaration_node = item.find("Declaration")
         if declaration_node is None:
             return None
-        try:
-            meta_model = self._meta_model.model_from_str(declaration_node.text)
-            return meta_model
-        except TextXSyntaxError as err:
-            name = item.attrib.get("Name", "<Unknown>")
-            logger.error(
-                "Error parsing node `%s` in file `%s`\n(%s)",
-                name,
-                self._active_file,
-                str(err),
-            )
+
+        use_textx = True  # hard-coded switch: the lark-based branch below is not taken
+        if use_textx:
+            try:
+                meta_model = self._meta_model.model_from_str(declaration_node.text)
+                return textx_model_to_declaration(meta_model, filepath)
+            except TextXSyntaxError as err:
+                name = item.attrib.get("Name", "<Unknown>")
+                logger.error(
+                    "Error parsing node `%s` in file `%s`\n(%s)",
+                    name,
+                    self._active_file,
+                    str(err),
+                )
+        else:
+            node = parse_new(declaration_node.text)
+            return ast_node_to_plc_declaration(node, filepath)
 
         return None
 
@@ -255,6 +261,174 @@ def get_objects_in_folder(self, folder: str) -> List["PlcDeclaration"]:
         raise KeyError(f"Found no models in the folder `{folder}`")
 
 
+def ast_node_to_plc_declaration(node, file) -> "PlcDeclaration":
+    """Convert a top-level node of the lark-based parser into a `PlcDeclaration`."""
+
+    if isinstance(node, ast.Function):
+        name = node.name
+        objtype = node.kind
+    elif isinstance(node, ast.TypeDef):
+        name = node.name
+        # Lower-cased class name of the type node; `str(node.ty)` is its full repr
+        objtype = type(node.ty).__name__.lower()
+    elif isinstance(node, ast.Property):
+        name = node.name
+        objtype = "property"
+    elif isinstance(node, ast.VariableList):
+        if file is None:
+            raise ValueError("Cannot parse GVL without file as no naming is present")
+        name = os.path.splitext(os.path.basename(file))[0]
+        objtype = "variable_list"
+    else:
+        raise ValueError(f"Unrecognized declaration in `{node}`")
+
+    # `file` must be passed by keyword: the third positional parameter is `comment`
+    return PlcDeclaration(objtype, name, file=file)
+
+
+def textx_model_to_declaration(
+    meta_model: TextXMetaClass, file=None
+) -> "PlcDeclaration":
+    """Build a :class:`PlcDeclaration` from the result of the TextX parser.
+
+    :param meta_model: Parsing result
+    :param file: Path to the file this model originates from
+    :raises ValueError: If the declaration cannot be categorized, or a GVL is
+        parsed without `file`
+    """
+    objtype = None
+    name = None
+    members = []
+
+    if meta_model.functions:
+        model = meta_model.functions[0]
+        objtype = model.function_type.lower().replace("_", "")
+
+    if meta_model.types:
+        model = meta_model.types[0]
+        type_str = type(model.type).__name__
+        if "Enum" in type_str:
+            objtype = "enum"
+        elif "Struct" in type_str:
+            objtype = "struct"
+        elif "Union" in type_str:
+            objtype = "union"
+        else:
+            raise ValueError(f"Could not categorize type `{type_str}`")
+        if objtype != "enum" and model.type:
+            members = [member_to_plc_declaration(m) for m in model.type.members]
+
+    if meta_model.properties:
+        model = meta_model.properties[0]
+        objtype = "property"
+
+    if meta_model.variable_lists:
+        if file is None:
+            raise ValueError("Cannot parse GVL without file as no naming is present")
+        # GVL are annoying because no naming is present in source - we need to
+        # extract it from the file name
+        name = os.path.splitext(os.path.basename(file))[0]
+        model = meta_model.variable_lists[0]
+        objtype = "variable_list"
+
+    if objtype is None:
+        raise ValueError(f"Unrecognized declaration in `{meta_model}`")
+
+    if name is None:
+        name = model.name
+    comment = get_comment(model)
+    args = get_args(model)
+    return PlcDeclaration(
+        objtype, name, comment=comment, args=args, members=members, file=file
+    )
+
+
+def member_to_plc_declaration(member) -> "PlcVariableDeclaration":
+    """Wrap a TextX struct/union member as a `PlcVariableDeclaration` of kind "member"."""
+    name = member.name
+    comment = member.comment.text if member.comment else ""
+    ty = member.type.name
+    return PlcVariableDeclaration(
+        kind="member",
+        name=name,
+        ty=ty,
+        comment=comment,
+    )
+
+
+def get_comment(_model) -> Optional[str]:
+    """Process main block comment from model into a neat list.
+
+    A list is created for each 'region' of comments. The first comment block above
+    a declaration is the most common one.
+    """
+    if hasattr(_model, "comment") and _model.comment is not None:
+        # Probably a comment line
+        big_block: str = _model.comment.text
+    elif hasattr(_model, "comments") and _model.comments:
+        # Probably a comment block (amongst multiple maybe)
+        block_comment = None
+        for comment in reversed(_model.comments):
+            # Find last block-comment
+            if type(comment).__name__ == "CommentBlock":
+                block_comment = comment
+                break
+
+        if block_comment is None:
+            return None
+
+        big_block: str = block_comment.text
+    else:
+        return None
+
+    big_block = big_block.strip()  # Get rid of whitespace
+
+    # Remove comment indicators (cannot get rid of them by TextX)
+    if big_block.startswith("(*"):
+        big_block = big_block[2:]
+    if big_block.endswith("*)"):
+        big_block = big_block[:-2]
+
+    # It looks like Windows line endings are already lost by now, but make sure
+    big_block = big_block.replace("\r\n", "\n")
+
+    return big_block
+
+
+def get_args(model) -> List:
+    """Return the interface variables of a TextX model.
+
+    Only `var_input`, `var_output` and `var_input_output` lists are considered;
+    all other variable lists (e.g. plain `VAR`) are skipped.
+
+    :param model: TextX model; its `lists` attribute is read when present
+    :retval: List of :class:`PlcVariableDeclaration`, empty if `model` has no
+        variable lists
+    """
+    if not hasattr(model, "lists"):
+        return []
+
+    # Kinds of variable lists that are reported as arguments
+    interface_kinds = ("var_input", "var_output", "var_input_output")
+    args = []
+
+    for var_list in model.lists:
+        var_kind = var_list.name.lower()
+        if var_kind not in interface_kinds:
+            continue  # Skip internal variables `VAR`
+
+        for var in var_list.variables:
+            args.append(textx_to_var(var_kind, var))
+
+    return args
+
+
+def textx_to_var(var_kind, var):
+    name = var.name
+    ty = var.type.name
+    comment = var.comment.text if var.comment else ""
+    return PlcVariableDeclaration(kind=var_kind, name=name, ty=ty, comment=comment)
+
+
 class PlcDeclaration:
     """Wrapper class for the result of the TextX parsing of a PLC source file.
 
@@ -265,52 +439,19 @@ class PlcDeclaration:
     The `objtype` is as they appear in :class:`StructuredTextDomain`.
     """
 
-    def __init__(self, meta_model: TextXMetaClass, file=None):
+    def __init__(
+        self, objtype: str, name: str, comment=None, args=(), members=(), file=None
+    ):
         """
 
         :param meta_model: Parsing result
         :param file: Path to the file this model originates from
         """
-        self._objtype = None
-        self._name = None
-
-        if meta_model.functions:
-            self._model = meta_model.functions[0]
-            self._objtype = self._model.function_type.lower().replace("_", "")
-
-        if meta_model.types:
-            self._model = meta_model.types[0]
-            type_str = type(self._model.type).__name__
-            if "Enum" in type_str:
-                self._objtype = "enum"
-            elif "Struct" in type_str:
-                self._objtype = "struct"
-            elif "Union" in type_str:
-                self._objtype = "union"
-            else:
-                raise ValueError(f"Could not categorize type `{type_str}`")
-
-        if meta_model.properties:
-            self._model = meta_model.properties[0]
-            self._objtype = "property"
-
-        if meta_model.variable_lists:
-            if file is None:
-                raise ValueError(
-                    "Cannot parse GVL without file as no naming is present"
-                )
-            self._name, _ = os.path.splitext(os.path.basename(file))
-            # GVL are annoying because no naming is present in source - we need to
-            # extract it from the file name
-
-            self._model = meta_model.variable_lists[0]
-            self._objtype = "variable_list"
-
-        if self._objtype is None:
-            raise ValueError(f"Unrecognized declaration in `{meta_model}`")
-
-        if self._name is None:
-            self._name = self._model.name
+        self._objtype = objtype
+        self._name = name
+        self._comment = comment
+        self._args = args
+        self._members = members
         self._file: Optional[str] = file
         self._children: Dict[str, "PlcDeclaration"] = {}
 
@@ -339,73 +480,23 @@ def children(self) -> Dict[str, "PlcDeclaration"]:
 
     @property
     def members(self) -> List[TextXMetaClass]:
-        if not self._model.type:
-            return []
-        return self._model.type.members
-
-    def get_comment(self) -> Optional[str]:
-        """Process main block comment from model into a neat list.
-
-        A list is created for each 'region' of comments. The first comment block above
-        a declaration is the most common one.
-        """
-        if hasattr(self._model, "comment") and self._model.comment is not None:
-            # Probably a comment line
-            big_block: str = self._model.comment.text
-        elif hasattr(self._model, "comments") and self._model.comments:
-            # Probably a comment block (amongst multiple maybe)
-            block_comment = None
-            for comment in reversed(self._model.comments):
-                # Find last block-comment
-                if type(comment).__name__ == "CommentBlock":
-                    block_comment = comment
-                    break
-
-            if block_comment is None:
-                return None
-
-            big_block: str = block_comment.text
-        else:
-            return None
-
-        big_block = big_block.strip()  # Get rid of whitespace
-
-        # Remove comment indicators (cannot get rid of them by TextX)
-        if big_block.startswith("(*"):
-            big_block = big_block[2:]
-        if big_block.endswith("*)"):
-            big_block = big_block[:-2]
-
-        # It looks like Windows line endings are already lost by now, but make sure
-        big_block = big_block.replace("\r\n", "\n")
-
-        return big_block
-
-    def get_args(self, skip_internal=True) -> List:
-        """Return arguments.
+        return self._members
 
-        :param skip_internal: If true, only return in, out and inout variables
-        :retval: Empty list if there are none or arguments are applicable to this type.
-        """
-        if not hasattr(self._model, "lists"):
-            return []
-
-        args = []
-
-        for var_list in self._model.lists:
-            var_kind = var_list.name.lower()
-            if skip_internal and var_kind not in [
-                "var_input",
-                "var_output",
-                "var_input_output",
-            ]:
-                continue  # Skip internal variables `VAR`
-
-            for var in var_list.variables:
-                var.kind = var_kind
-                args.append(var)
+    @property
+    def comment(self) -> Optional[str]:
+        return self._comment
 
-        return args
+    @property
+    def args(self) -> List:
+        return self._args
 
     def add_child(self, child: "PlcDeclaration"):
         self._children[child.name] = child
+
+
+@dataclass
+class PlcVariableDeclaration:
+    kind: str  # variable-list kind in lower case, or "member" for struct fields
+    name: str
+    ty: str  # type rendered as text (see ast.type_to_text)
+    comment: str  # comment text attached to the variable by the parser

From 2f84d00ac24ec6cb193d28be0303ff69c97cff62 Mon Sep 17 00:00:00 2001
From: Windel Bouwman <windel.bouwman@demcon.com>
Date: Sun, 5 May 2024 13:37:22 +0200
Subject: [PATCH 3/4] Test lark parser on test projects.

---
 src/plcdoc/documenters.py       |   2 +-
 src/plcdoc/interpreter.py       |  49 ++++++++++-
 src/plcdoc/parsing/lexer.py     | 112 ++++++++++++++++++++-----
 src/plcdoc/parsing/nodes.py     | 111 +++++++++++++++++++++++--
 src/plcdoc/parsing/parser.py    | 129 ++++++++++++++++-------------
 src/plcdoc/parsing/transform.py | 141 +++++++++++++++++++++++---------
 6 files changed, 419 insertions(+), 125 deletions(-)

diff --git a/src/plcdoc/documenters.py b/src/plcdoc/documenters.py
index ebb7824..1c45f29 100644
--- a/src/plcdoc/documenters.py
+++ b/src/plcdoc/documenters.py
@@ -2,7 +2,7 @@
 
 import os.path
 from abc import ABC
-from typing import Tuple, List, Dict, Optional, Any, Union
+from typing import Tuple, List, Dict, Optional, Any
 import re
 
 from sphinx.util import logging
diff --git a/src/plcdoc/interpreter.py b/src/plcdoc/interpreter.py
index 560f519..b78734a 100644
--- a/src/plcdoc/interpreter.py
+++ b/src/plcdoc/interpreter.py
@@ -10,6 +10,9 @@
 
 from textx import metamodel_from_file, TextXSyntaxError
 
+USE_TEXTX = False
+# USE_TEXTX = True
+
 PACKAGE_DIR = os.path.dirname(__file__)
 logger = logging.getLogger(__name__)
 
@@ -163,8 +166,7 @@ def _parse_declaration(self, item, filepath) -> Optional["TextXMetaClass"]:
         if declaration_node is None:
             return None
 
-        use_textx = True
-        if use_textx:
+        if USE_TEXTX:
             try:
                 meta_model = self._meta_model.model_from_str(declaration_node.text)
                 return textx_model_to_declaration(meta_model, filepath)
@@ -264,14 +266,39 @@ def get_objects_in_folder(self, folder: str) -> List["PlcDeclaration"]:
 def ast_node_to_plc_declaration(node, file) -> "PlcDeclaration":
     objtype = None
     name = None
+    args = []
+    members = []
+    comment = ""
 
     if isinstance(node, ast.Function):
         name = node.name
         objtype = node.kind
+        comment = process_comment(node.comment)
+        for vl in node.variable_lists:
+            for v in vl.variables:
+                arg = PlcVariableDeclaration(
+                    kind=vl.kind.lower(),
+                    name=v.name,
+                    ty=ast.type_to_text(v.ty),
+                    comment=v.comment,
+                )
+                args.append(arg)
+
     elif isinstance(node, ast.TypeDef):
         name = node.name
-        objtype = str(node.ty)
+        comment = process_comment(node.comment)
+        if isinstance(node.ty, ast.Struct):
+            objtype = "struct"
+            for f in node.ty.fields:
+                members.append(lark_field_to_var(f))
+        elif isinstance(node.ty, ast.Union):
+            objtype = "union"
+        elif isinstance(node.ty, ast.Enum):
+            objtype = "enum"
+        else:
+            raise ValueError(f"typedef not supported for type: {node.ty}")
     elif isinstance(node, ast.Property):
+        comment = process_comment(node.comment)
         objtype = "property"
         name = node.name
     elif isinstance(node, ast.VariableList):
@@ -283,7 +310,18 @@ def ast_node_to_plc_declaration(node, file) -> "PlcDeclaration":
         raise ValueError(f"Unrecognized declaration in `{node}`")
 
     assert name is not None
-    return PlcDeclaration(objtype, name, file)
+
+    return PlcDeclaration(
+        objtype, name=name, comment=comment, args=args, members=members, file=file
+    )
+
+
+def lark_field_to_var(field: ast.StructField) -> "PlcVariableDeclaration":
+    """Turn a parsed struct field into a declaration of kind ``member``."""
+    type_text = ast.type_to_text(field.ty)
+    return PlcVariableDeclaration(
+        kind="member", name=field.name, ty=type_text, comment=field.comment
+    )
 
 
 def textx_model_to_declaration(
@@ -381,7 +419,10 @@ def get_comment(_model) -> Optional[str]:
         return None
 
     big_block = big_block.strip()  # Get rid of whitespace
+    return process_comment(big_block)
+
 
+def process_comment(big_block):
     # Remove comment indicators (cannot get rid of them by TextX)
     if big_block.startswith("(*"):
         big_block = big_block[2:]
diff --git a/src/plcdoc/parsing/lexer.py b/src/plcdoc/parsing/lexer.py
index 9c347a6..d73a20e 100644
--- a/src/plcdoc/parsing/lexer.py
+++ b/src/plcdoc/parsing/lexer.py
@@ -15,9 +15,12 @@ def __init__(self, lexer_conf):
 
     def lex(self, source):
         # print(code)
-        for token in token_filter(tokenize(source)):
+        # token_filter2 runs first: it copies each `//` comment onto the token
+        # before it. token_filter then drops whitespace, comment and attribute
+        # tokens before the remaining ones are handed to lark.
+        for token in token_filter(token_filter2(tokenize(source))):
             type = token.kind
-            yield lark.lexer.Token(type, token)
+            yield lark.lexer.Token(type, token, line=token.row, column=token.column)
 
 
 @dataclass
@@ -27,6 +30,7 @@ class Token:
     row: int
     column: int
     comment1: str
+    comment2: str
 
 
 def tokenize(source: str):
@@ -35,17 +39,23 @@ def tokenize(source: str):
         ("COMMENT1", r"\(\*.*?\*\)"),
         ("COMMENT2", r"//.*?\n"),
         ("OP2", r"(:=)|(==)|(<=)|(!=)|(>=)|(\.\.)"),
-        ("OP", r"[<>=:;,\.\(\)\+\-\*\/]"),
+        ("OP", r"[<>=:;,\.\(\)\+\-\*\/\[\]]"),
         ("BIN_NUMBER", r"2#[0-1][0-1_]*"),
-        ("OCT_NUMBER", r"8#[0-7]+"),
+        ("OCT_NUMBER", r"8#[0-7][0-7_]*"),
+        ("DEC_NUMBER", r"10#[0-9][0-9_]*"),
         ("HEX_NUMBER", r"16#[0-9a-fA-F][0-9a-fA-F_]*"),
-        ("TIME", r"T#[0-9a-fA-F][0-9a-fA-F_]*"),
-        ("REAL", r"[0-9]+\.[0-9]+"),
-        ("NUMBER", r"[0-9]+"),
-        ("ID", r"[A-Za-z][A-Za-z_0-9]*"),
+        ("TIME", r"T#[0-9hHmMsS]+"),
+        ("ADDR", r"%[A-Za-z][A-Za-z0-9]*\*"),
+        ("REAL1", r"[0-9][0-9_]*[eE][-+]?[0-9]+"),  # example: 1E2
+        ("REAL3", r"[0-9][0-9_]*\.[0-9][0-9_]*[eE][-+]?[0-9]+"),  # example: 1.0E2
+        ("REAL2", r"[0-9][0-9_]*\.[0-9][0-9_]*"),  # example: 1.0 (must follow REAL3)
+        ("REAL5", r"\.[0-9][0-9_]*[eE][-+]?[0-9]+"),  # example: .1E3
+        ("REAL4", r"\.[0-9][0-9_]*"),  # example: .1 (must follow REAL5)
+        ("NUMBER", r"[0-9][0-9_]*"),
+        ("ID", r"[A-Za-z_][A-Za-z_0-9]*"),
         ("STRING", r"'[^']*'"),
         ("SPACE", r"[ \t]+"),
-        ("ATTRIBUTE", r"\{attribute.*?\}"),
+        ("ATTRIBUTE", r"\{.*?\}"),
         ("NEWLINE", r"\n"),
         ("OTHER", r"."),
     ]
@@ -80,55 +90,100 @@ def tokenize(source: str):
         elif kind == "ID":
             if value in KEYWORDS:
                 kind = "KW_" + value
-        elif kind == "NEWLINE":
+            elif value in VAR_KEYWORDS:
+                kind = "KW_VAR"
+            elif value in ACCESS_KEYWORDS:
+                kind = "KW_ACCESS"
+            elif value in INTEGER_DATA_TYPES:
+                kind = "INTTYPE"
+        elif kind == "NEWLINE" or kind == "COMMENT2":
             row += 1
+            column = 1
+        elif kind == "SPACE":
+            continue
+        elif kind.startswith("REAL"):
+            kind = "REAL"
+        elif kind.endswith("_NUMBER"):
+            kind = "NUMBER"
         elif kind == "OTHER":
             if value.isprintable():
                 c = value
             else:
                 c = str(value.encode(encoding="utf-8", errors="replace"))
-            raise ValueError(f"Unexpected character: {c}")
+            raise ValueError(f"Unexpected character: {c} at ({row=},{column=})")
 
-        yield Token(kind, value, row, column, "")
+        yield Token(kind, value, row, column, "", "")
 
-    yield Token("EOF", "EOF", row, column, "")
+    yield Token("EOF", "EOF", row, column, "", "")
 
 
 KEYWORDS = {
     "ABSTRACT",
     "ARRAY",
+    "AT",
+    "CONSTANT",
     "END_STRUCT",
     "END_TYPE",
+    "END_UNION",
     "END_VAR",
     "EXTENDS",
     "FINAL",
     "FUNCTION",
     "FUNCTION_BLOCK",
+    "IMPLEMENTS",
     "INTERFACE",
-    "INTERNAL",
     "METHOD",
     "OF",
+    "PERSISTENT",
     "POINTER",
     "PROGRAM",
     "PROPERTY",
-    "PRIVATE",
-    "PROTECTED",
-    "PUBLIC",
     "REFERENCE",
     "STRING",
     "STRUCT",
     "TO",
     "TYPE",
+    "UNION",
+    "WSTRING",
+}
+
+ACCESS_KEYWORDS = {
+    "PRIVATE",
+    "PROTECTED",
+    "PUBLIC",
+    "INTERNAL",
+}
+
+VAR_KEYWORDS = {
     "VAR",
     "VAR_GLOBAL",
+    "VAR_IN_OUT",
     "VAR_INPUT",
+    "VAR_INST",
     "VAR_OUTPUT",
+    "VAR_STAT",
+    "VAR_TEMP",
+}
+
+INTEGER_DATA_TYPES = {
+    "BYTE",
+    "WORD",
+    "DWORD",
+    "LWORD",
+    "SINT",
+    "USINT",
+    "INT",
+    "UINT",
+    "DINT",
+    "UDINT",
+    "LINT",
+    "ULINT",
 }
 
 
 def token_filter(tokens):
+    """Remove comment tokens, and add comment as attribute to the next token."""
     comment1 = ""
-    attr = ""
     for token in tokens:
         if token.kind == "SPACE" or token.kind == "NEWLINE":
             continue
@@ -138,8 +193,25 @@ def token_filter(tokens):
         elif token.kind == "COMMENT2":
             continue
         elif token.kind == "ATTRIBUTE":
-            attr = token.text
+            pass
         else:
-            token.comment1 = comment1
+            if comment1:
+                token.comment1 = comment1
             yield token
             comment1 = ""
+
+
+def token_filter2(tokens):
+    """Store each ``//`` comment on the token that directly precedes it.
+
+    Tokens are passed on one step late so a comment can still be attached.
+    """
+    held = None
+    for current in tokens:
+        if held:
+            if current.kind == "COMMENT2":
+                held.comment1 = current.text[2:].strip()
+            yield held
+        held = current
+    if held:
+        yield held
diff --git a/src/plcdoc/parsing/nodes.py b/src/plcdoc/parsing/nodes.py
index 04271ae..2483eb7 100644
--- a/src/plcdoc/parsing/nodes.py
+++ b/src/plcdoc/parsing/nodes.py
@@ -2,13 +2,13 @@
 
 """
 
-from typing import Optional, Any, Union
+from typing import Optional
 from dataclasses import dataclass
 
 
 @dataclass
 class Function:
-    comment1: str
+    comment: str
     kind: str
     name: str
     variable_lists: list["VariableList"]
@@ -18,27 +18,34 @@ class Function:
 # class FunctionBlock:
 #     name: str
 
+
 @dataclass
 class Property:
+    comment: str
     name: str
     ty: "Type"
     # init: Optional["Expression"]
 
+
 @dataclass
 class VariableList:
     kind: str
+    flags: list[str]
     variables: list["Variable"]
 
 
 @dataclass
 class Variable:
     name: str
+    address: Optional[str]
     ty: "Type"
     init: Optional["Expression"]
+    comment: str
 
 
 @dataclass
 class TypeDef:
+    comment: str
     name: str
     ty: "Type"
 
@@ -63,6 +70,7 @@ class Union(Type):
 @dataclass
 class Enum(Type):
     options: list["EnumOption"]
+    base: Optional["Type"]
 
 
 @dataclass
@@ -76,6 +84,7 @@ class LabeledArgument:
     label: str
     value: "Expression"
 
+
 class Expression:
     pass
 
@@ -87,25 +96,115 @@ class Binop(Expression):
     rhs: "Expression"
 
 
+@dataclass
+class Unop(Expression):
+    op: str
+    rhs: "Expression"
+
+
+@dataclass
+class Call(Expression):
+    callee: "Expression"
+    arguments: list["Expression"]
+
+
 @dataclass
 class Number(Expression):
     value: int
 
 
 @dataclass
-class NameRef(Expression):
-    name: str
+class FqNameRef(Expression):
+    names: list[str]  # name parts in order; joined with "." when rendered
+
+
+def expression_to_text(expr, parens=False) -> str:
+    """Render an expression node as source-like text.
+
+    With ``parens`` set, unary and binary operations are wrapped in brackets.
+    """
+    if isinstance(expr, Number):
+        return f"{expr.value}"
+    if isinstance(expr, FqNameRef):
+        return ".".join(expr.names)
+    if isinstance(expr, Call):
+        target = expression_to_text(expr.callee, parens=True)
+        arg_text = ",".join(expression_to_text(a) for a in expr.arguments)
+        return f"{target}({arg_text})"
+    if isinstance(expr, Unop):
+        operand = expression_to_text(expr.rhs, parens=True)
+        text = f"{expr.op}{operand}"
+    elif isinstance(expr, Binop):
+        left = expression_to_text(expr.lhs, parens=True)
+        right = expression_to_text(expr.rhs, parens=True)
+        text = f"{left} {expr.op} {right}"
+    else:
+        raise NotImplementedError(f"Not impl: {expr}")
+    # Only operator expressions reach this point and may need brackets
+    return f"({text})" if parens else text
+
+
+def type_to_text(ty) -> str:
+    """Render a type node as PLC source text.
+
+    Array bounds are written out per dimension, e.g. ``ARRAY [0..9,*] OF INT``.
+    Raises ValueError for node kinds that have no text form here.
+    """
+    if isinstance(ty, StringType):
+        if ty.size:
+            return f"STRING({expression_to_text(ty.size)})"
+        return "STRING"
+    elif isinstance(ty, IntegerType):
+        return ty.kind
+    elif isinstance(ty, FqNameRef):
+        return ".".join(ty.names)
+    elif isinstance(ty, ArrayType):
+        # A missing range stands for a variable-length ("*") dimension
+        d = ",".join(
+            f"{expression_to_text(r.begin)}..{expression_to_text(r.end)}" if r else "*"
+            for r in ty.ranges
+        )
+        return f"ARRAY [{d}] OF {type_to_text(ty.element_type)}"
+    elif isinstance(ty, PointerType):
+        return f"POINTER TO {type_to_text(ty.element_type)}"
+    elif isinstance(ty, ReferenceType):
+        return f"REFERENCE TO {type_to_text(ty.element_type)}"
+    else:
+        raise ValueError(f"Not impl: {type(ty)}")
 
 
 @dataclass
 class TypeRef:
     name: str
 
+
 @dataclass
-class Array:
-    ranges: list["Range"]
+class StringType(Type):
+    size: Optional["Expression"]
+
+
+@dataclass
+class IntegerType(Type):
+    kind: str
+    domain: Optional["Range"]
+
+
+@dataclass
+class ArrayType(Type):
+    ranges: list[Optional["Range"]]
+    element_type: "Type"
+
+
+@dataclass
+class PointerType(Type):
     element_type: "Type"
 
+
+@dataclass
+class ReferenceType(Type):
+    element_type: "Type"
+
+
 @dataclass
 class Range:
     begin: "Expression"
diff --git a/src/plcdoc/parsing/parser.py b/src/plcdoc/parsing/parser.py
index 5069366..3fe2888 100644
--- a/src/plcdoc/parsing/parser.py
+++ b/src/plcdoc/parsing/parser.py
@@ -1,11 +1,9 @@
-""" 
+"""
 Lark based PLC parser.
 """
 
-
 from .lexer import MyLexer
 from .transform import MyTransformer
-from pprint import pprint
 import logging
 import lark
 
@@ -13,116 +11,133 @@
 
 
 def parse_new(text: str):
-    print(text)
-    # for token in tokenize(text):
-    #     print(token)
-
+    """Parse ``text`` with the module-level ``parser`` and return its result."""
     tree = parser.parse(text)
-    # print("PARSED", tree)
-    print("PARSED:")
-    pprint(tree)
+    # Debug aid: pretty-print ``tree`` here, e.g. pprint.pprint(tree, width=150);
+    # note that pprint is no longer imported by this module.
     return tree
 
 
 grammar = """
-start: declaration EOF
+start: declaration+ EOF
 
 declaration: function | property | type_def | variable_list
 
-function: (KW_PROGRAM | KW_FUNCTION_BLOCK | KW_FUNCTION | KW_METHOD | KW_INTERFACE) visibility ID (COLON type)? extends? SEMI? variable_lists
+function: function_kind visibility ID (COLON type)? exim SEMI? variable_lists
+function_kind: KW_PROGRAM
+             | KW_FUNCTION_BLOCK
+             | KW_FUNCTION
+             | KW_METHOD
+             | KW_INTERFACE
 property: KW_PROPERTY visibility ID COLON type
-extends: KW_EXTENDS ID
-visibility: (KW_ABSTRACT | KW_PUBLIC | KW_PRIVATE | KW_PROTECTED | KW_INTERNAL | KW_FINAL)?
+exim: extends implements?
+extends: (KW_EXTENDS fq_name_ref)?
+implements: KW_IMPLEMENTS fq_name_ref
+visibility: (KW_ABSTRACT | KW_ACCESS | KW_FINAL)?
 variable_lists: variable_list*
-variable_list: (KW_VAR_INPUT | KW_VAR_OUTPUT | KW_VAR_GLOBAL | KW_VAR) variable* KW_END_VAR
-variable: ID COLON type initializer? SEMI
-
-type_def: KW_TYPE ID extends? COLON (struct | union | enum) KW_END_TYPE
-struct: KW_STRUCT variable* KW_END_STRUCT
-union: KW_UNION variable* KW_END_UNION
-enum: PARENTHESIS_OPEN enum_values PARENTHESIS_CLOSE SEMI
+variable_list: KW_VAR variable_list_flags variable* KW_END_VAR
+variable_list_flags: (KW_CONSTANT | KW_PERSISTENT)*
+variable: ids address COLON variable_type_init SEMI
+variable_type_init: type initializer
+                  | type PARENTHESIS_OPEN labeled_arguments PARENTHESIS_CLOSE
+                  | type PARENTHESIS_OPEN expressions PARENTHESIS_CLOSE
+                  | type PARENTHESIS_OPEN PARENTHESIS_CLOSE
+address: (KW_AT ADDR)?
+
+type_def: KW_TYPE ID extends COLON (struct_decl | union_decl | enum_decl) KW_END_TYPE
+struct_decl: KW_STRUCT variable* KW_END_STRUCT
+union_decl: KW_UNION variable* KW_END_UNION
+enum_decl: PARENTHESIS_OPEN enum_values PARENTHESIS_CLOSE integer_type? SEMI
 enum_values: enum_value
            | enum_values COMMA enum_value
-enum_value: ID initializer?
+enum_value: ID initializer
 
-initializer: COLON_EQUALS expression
+initializer: (COLON_EQUALS expression)?
 labeled_arguments: labeled_argument
                  | labeled_arguments COMMA labeled_argument
 labeled_argument: ID COLON_EQUALS expression
 
 expressions: expression
            | expressions COMMA expression
-expression: term
+
+expression: sum
+sum: term
           | expression (PLUS | MINUS) term
 term: factor
     | term (ASTERIX | SLASH) factor
 factor: atom
+      | MINUS factor
 atom: literal
-    | name_ref
+    | fq_name_ref
     | struct_literal
     | range_literal
     | PARENTHESIS_OPEN expression PARENTHESIS_CLOSE
-name_ref: ID
+    | atom PARENTHESIS_OPEN expressions PARENTHESIS_CLOSE
+
+ids: ID
+   | ids COMMA ID
+fq_name_ref: ID
+           | fq_name_ref DOT ID
+
 struct_literal: PARENTHESIS_OPEN labeled_arguments PARENTHESIS_CLOSE
 range_literal: PARENTHESIS_OPEN expression DOTDOT expression PARENTHESIS_CLOSE
 literal: NUMBER
        | REAL
-       | BIN_NUMBER
-       | OCT_NUMBER
-       | HEX_NUMBER
+       | TIME
        | STRING
 
-type: name_ref
+type: fq_name_ref
+    | integer_type
     | string_type
     | array_type
     | pointer_type
     | reference_type
-string_type: KW_STRING
-           | KW_STRING PARENTHESIS_OPEN expression PARENTHESIS_CLOSE
-           | KW_STRING BRACKET_OPEN expression BRACKET_CLOSE
+integer_type: INTTYPE range_literal?
+string_type: (KW_STRING | KW_WSTRING)
+           | (KW_STRING | KW_WSTRING) PARENTHESIS_OPEN expression PARENTHESIS_CLOSE
+           | (KW_STRING | KW_WSTRING) BRACKET_OPEN expression BRACKET_CLOSE
+pointer_type: KW_POINTER KW_TO type
+reference_type: KW_REFERENCE KW_TO type
 array_type: KW_ARRAY BRACKET_OPEN subranges BRACKET_CLOSE KW_OF type
-pointer_type: KW_POINTER KW_TO ID
-reference_type: KW_REFERENCE KW_TO ID
-
 subranges: subrange
          | subranges COMMA subrange
 subrange: ASTERIX
         | expression DOTDOT expression
 
 %declare KW_ABSTRACT
-%declare KW_PROGRAM
+%declare KW_ARRAY
+%declare KW_ACCESS
+%declare KW_AT
+%declare KW_CONSTANT
+%declare KW_END_STRUCT
+%declare KW_END_TYPE
+%declare KW_END_UNION
+%declare KW_END_VAR
+%declare KW_EXTENDS
+%declare KW_FINAL
 %declare KW_FUNCTION
 %declare KW_FUNCTION_BLOCK
+%declare KW_IMPLEMENTS
 %declare KW_INTERFACE
 %declare KW_METHOD
+%declare KW_OF
+%declare KW_PERSISTENT
 %declare KW_PROPERTY
-%declare KW_EXTENDS
-%declare KW_FINAL
-%declare KW_PUBLIC
-%declare KW_PRIVATE
-%declare KW_PROTECTED
-%declare KW_INTERNAL
-%declare KW_TYPE
-%declare KW_END_TYPE
+%declare KW_PROGRAM
 %declare KW_POINTER
 %declare KW_STRUCT
-%declare KW_END_STRUCT
-%declare KW_UNION
-%declare KW_END_UNION
-%declare KW_STRING
-%declare KW_ARRAY
-%declare KW_OF
 %declare KW_REFERENCE
+%declare KW_STRING
 %declare KW_TO
-%declare KW_VAR_GLOBAL
-%declare KW_VAR_INPUT
-%declare KW_VAR_OUTPUT
+%declare KW_TYPE
+%declare KW_UNION
 %declare KW_VAR
-%declare KW_END_VAR
+%declare KW_WSTRING
 
 %declare ID
-%declare NUMBER REAL BIN_NUMBER OCT_NUMBER HEX_NUMBER
-%declare STRING
+%declare NUMBER REAL
+%declare TIME ADDR
+%declare STRING INTTYPE
 %declare COLON_EQUALS
 %declare COLON SEMI COMMA DOT DOTDOT
 %declare PLUS MINUS ASTERIX SLASH
diff --git a/src/plcdoc/parsing/transform.py b/src/plcdoc/parsing/transform.py
index 78a487a..c91a87a 100644
--- a/src/plcdoc/parsing/transform.py
+++ b/src/plcdoc/parsing/transform.py
@@ -4,6 +4,8 @@
 
 class MyTransformer(lark.Transformer):
     def start(self, rhs):
+        # TODO: we can have multiple declarations
+        # For example VAR_GLOBAL ..  VAR_GLOBAL CONSTANT
         return rhs[0]
 
     def declaration(self, rhs):
@@ -13,46 +15,49 @@ def visibility(self, rhs):
         return 1
 
     def function(self, rhs):
-        comment1 = rhs[0].value.comment1
-        print("FUNC", rhs)
-        kind = rhs[0].value.text.lower().replace("_", "")
-        index = 1
-        if isinstance(rhs[index], int):
-            index += 1
-        name = rhs[index].value.text
+        # print("FUNC", rhs)
+        comment, kind = rhs[0]
+        name = rhs[2].value.text
         variable_lists = rhs[-1]
-        return ast.Function(comment1, kind, name, variable_lists)
+        return ast.Function(
+            comment=comment, kind=kind, name=name, variable_lists=variable_lists
+        )
+
+    def function_kind(self, rhs):
+        """Return ``(comment, kind)`` taken from the declaration keyword token."""
+        token = rhs[0].value
+        return token.comment1, token.text.lower().replace("_", "")
 
     def property(self, rhs):
+        comment = rhs[0].value.comment1
         name = rhs[2].value.text
         ty = rhs[4]
-        return ast.Property(name, ty)
+        return ast.Property(comment, name, ty)
 
     def type_def(self, rhs):
+        comment = rhs[0].value.comment1
         name = rhs[1].value.text
         ty = rhs[-2]
-        return ast.TypeDef(name, ty)
+        return ast.TypeDef(comment=comment, name=name, ty=ty)
 
-    def enum(self, rhs):
+    def enum_decl(self, rhs):
         options = rhs[1]
-        return ast.Enum(options)
+        base = rhs[-2] if len(rhs) == 5 else None
+        return ast.Enum(options, base)
 
     def enum_values(self, rhs):
         return comma(rhs)
 
     def enum_value(self, rhs):
         name = rhs[0].value.text
-        if len(rhs) > 1:
-            init = rhs[1]
-        else:
-            init = None
+        init = rhs[1]
         return ast.EnumOption(name, init)
 
-    def struct(self, rhs):
+    def struct_decl(self, rhs):
         fields = rhs[1:-1]
         return ast.Struct(fields)
 
-    def union(self, rhs):
+    def union_decl(self, rhs):
         fields = rhs[1:-1]
         return ast.Union(fields)
 
@@ -61,20 +66,40 @@ def variable_lists(self, rhs):
 
     def variable_list(self, rhs):
         kind = rhs[0].value.text
-        variables = rhs[1:-1]
-        return ast.VariableList(kind, variables)
+        flags = rhs[1]
+        variables = rhs[2:-1]
+        return ast.VariableList(kind, flags, variables)
+
+    def variable_list_flags(self, rhs):
+        return [r.value.text for r in rhs]
 
     def variable(self, rhs):
-        name = rhs[0].value.text
-        ty = rhs[2]
-        if len(rhs) > 4:
-            init = rhs[3]
-        else:
+        # print("VAR", rhs)
+        names = rhs[0]
+        name = names[0]
+        # TODO: support more than 1 name?
+        address = rhs[1]
+        ty, init = rhs[3]
+        comment = rhs[-1].value.comment1
+        return ast.Variable(name, address, ty, init, comment)
+
+    def variable_type_init(self, rhs):
+        ty = rhs[0]
+        if len(rhs) == 2:
+            init = rhs[1]
+        elif len(rhs) == 3:
             init = None
-        return ast.Variable(name, ty, init)
+        else:
+            init = rhs[2]
+        return (ty, init)
+
+    def address(self, rhs):
+        if len(rhs) == 2:
+            return rhs[1].value.text
 
     def initializer(self, rhs):
-        return rhs[1]
+        if len(rhs) == 2:
+            return rhs[1]
 
     def labeled_arguments(self, rhs):
         return comma(rhs)
@@ -88,45 +113,87 @@ def expressions(self, rhs):
         return comma(rhs)
 
     def expression(self, rhs):
+        return rhs[0]
+
+    def sum(self, rhs):
         return binop(rhs)
 
     def term(self, rhs):
         return binop(rhs)
 
-    def atom(self, rhs):
+    def factor(self, rhs):
         if len(rhs) == 1:
             return rhs[0]
         else:
-            assert len(rhs) == 3
+            op = rhs[0].value.text
+            return ast.Unop(op, rhs[1])
+
+    def atom(self, rhs):
+        if len(rhs) == 1:
+            return rhs[0]
+        elif len(rhs) == 3:
             return rhs[1]
+        else:
+            assert len(rhs) == 4
+            callee = rhs[0]
+            args = rhs[2]
+            return ast.Call(callee, args)
 
     def literal(self, rhs):
         value = rhs[0].value.text
         return ast.Number(value)
-    
+
     def struct_literal(self, rhs):
         return rhs[1]
-    
+
     def range_literal(self, rhs):
         begin = rhs[1]
         end = rhs[3]
         return ast.Range(begin, end)
 
-    def name_ref(self, rhs):
-        name = rhs[0].value.text
-        return ast.NameRef(name)
+    def ids(self, rhs):
+        """Collect a comma-separated identifier list into a list of names."""
+        if len(rhs) == 1:
+            # Base case: a single identifier token
+            return [rhs[0].value.text]
+        # Recursive case: rhs[0] holds the names gathered so far, rhs[1] is
+        # the comma and rhs[2] the newly added identifier token
+        return rhs[0] + [rhs[2].value.text]
+
+    def fq_name_ref(self, rhs):
+        """Build a dotted name reference from one or more identifiers."""
+        if len(rhs) == 1:
+            parts = [rhs[0].value.text]
+        else:
+            # rhs[0] is the FqNameRef built so far, rhs[2] the next identifier
+            parts = rhs[0].names + [rhs[2].value.text]
+        return ast.FqNameRef(parts)
 
     def type(self, rhs):
+        # TODO: handle range indicator for integer types.
         return rhs[0]
 
+    def integer_type(self, rhs):
+        ty = rhs[0].value.text
+        domain = rhs[1] if len(rhs) > 1 else None
+        return ast.IntegerType(ty, domain)
+
     def string_type(self, rhs):
-        name = rhs[0].value.text
-        return ast.TypeRef(name)
+        size = rhs[2] if len(rhs) == 4 else None
+        return ast.StringType(size)
+
+    def pointer_type(self, rhs):
+        ty = rhs[-1]
+        return ast.PointerType(ty)
+
+    def reference_type(self, rhs):
+        ty = rhs[-1]
+        return ast.ReferenceType(ty)
 
     def array_type(self, rhs):
         ranges = rhs[2]
         element_type = rhs[5]
-        return ast.Array(ranges, element_type)
+        return ast.ArrayType(ranges, element_type)
 
     def subranges(self, rhs):
         return comma(rhs)

From fc61bef10a338cae1a9031b9ac3d32f4875c30d5 Mon Sep 17 00:00:00 2001
From: Windel Bouwman <windel.bouwman@demcon.com>
Date: Sun, 5 May 2024 13:40:06 +0200
Subject: [PATCH 4/4] Add lark as dependency.

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2eb304a..21d57e1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ classifiers = [
     "Operating System :: OS Independent",
 ]
 dependencies = [
+    "lark>=1",
     "sphinx>=5.0,<7.0",
     "textX>=3.0",
 ]