Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 69 additions & 16 deletions docs/source/reference/Features/OPTIMIZE.rst
Original file line number Diff line number Diff line change
@@ -1,32 +1,85 @@
<OPTIMIZE> Directives
=====================

reStructuredPython allows you to apply runtime optimizations using special compiler directives and decorators.
reStructuredPython allows you to apply runtime optimizations using special
compiler directives and decorators. These directives can be applied to both
loops and functions to enhance performance, enable diagnostics,
and optionally parallelize execution.

Loop Optimization
-----------------

Use ``<OPTIMIZE ...>`` before a ``for`` or ``while`` loop to apply runtime enhancements:

.. code-block:: python

<OPTIMIZE gct=True, parallel=True, profile=True>
for i in range(10_000_000) {
temp = str(i) * 10
}
<OPTIMIZE gct=True, parallel=True, profile=True, cache=True>
for i in range(10_000_000) {
temp = str(i) * 10
}

.. versionadded::
Added the cache option in 2.6.0

.. note::
Optimizations currently support loops only. The function implementation currently **does not** have actual optimizations, only profiling and tracing.
.. versionchanged::
Changed the parallel functionality in 2.6.0

Arguments for <OPTIMIZE ...> on loops include:

- ``gct=True``: Enable garbage collection tracking.
- ``profile=True``: Enable execution time logging.
- ``parallel=True``: Enables multiprocessing pool
- ``gct=False``: Trigger garbage collection before loop execution.

- ``profile=False``: Log execution time of the loop.

- ``parallel=False``: Attempts **multithreading** using ``concurrent.futures.ThreadPoolExecutor``

Arguments for <OPTIMIZE ...> on functions include:
- ``cache=False``: Enable memoization for loop-returning functions.

- ``profile=False`` Enable execution time logging
- ``trace=False`` Enable event tracing
- ``unroll=N``: Unrolls loops to preserve performance

This will generate a python file that imports the optimization decorators from this ( the ``restructuredpython`` package ), so you will need to have this package installed via pip on systems running your compiled, optimized program.
.. warning::
**<OPTIMIZE parallel=True> on loops uses multithreading only.**
For true multiprocessing, use python's multiprocessing module with
top-level functions and ensure your script includes freeze_support
due to limitations of python's multiprocessing setup.

However, as of 2.5.0, you could technically open the generated python file, remove the imports from ``restructuredpython``, and instead use ``include 'subinterpreter.optimize'``. However, this is explicitly NOT recommended as it will break in future versions of reStructuredPython and will include an annoying copyright header in the generated file.
.. code-block:: python
if __name__ == "__main__":
from multiprocessing import freeze_support
freeze_support()
main()


Function Optimization
---------------------

You can also apply ``<OPTIMIZE ...>`` before a function definition
to enable diagnostics and performance enhancements:

.. code-block:: python

We recommend running this with ``repycl`` the restructuredpython interpreter & launcher.
<OPTIMIZE profile=True, trace=True, cache=True>
def compute(x) {
return x ** 2
}

.. versionadded::

Note: Function optimization now includes caching
as well as profiling and tracing, starting from version 2.6.0

Arguments for <OPTIMIZE ...> on functions include:

- ``cache=False``: Uses LRU caching for function memoization
- ``profile=False``: Log execution time of the function.
- ``trace=True``: Trace events for the function

This will generate a python file that imports the
optimization decorators from this ( the ``restructuredpython`` package ),
so you will need to have this package installed via pip on systems
running your compiled, optimized program.

.. note::
    However, as of 2.6.0, you could technically open the generated python file, remove the imports from ``restructuredpython``, and instead use ``include 'subinterpreter.optimize'``. However, this is explicitly NOT recommended as it will break in future versions of reStructuredPython and will include an annoying copyright header in the generated file.

.. note::
    We recommend running this with ``repycl``, the reStructuredPython interpreter & launcher.
134 changes: 111 additions & 23 deletions restructuredpython/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@
from .check_syntax import check_syntax
import re


def wrap_loops_for_optimization(code):
"""
Rewrites for/while loops with <OPTIMIZE ...> annotations into runtime functions
with decorators.
with decorators. Supports loop unrolling via `unroll=N` and parallel execution.
"""
lines = code.splitlines()
modified_lines = []
Expand All @@ -30,31 +29,120 @@ def wrap_loops_for_optimization(code):
line = lines[i].strip()
if line.startswith("@optimize_loop("):
decorator_line = lines[i]

unroll_match = re.search(r'unroll\s*=\s*(\d+)', decorator_line)
unroll_factor = int(unroll_match.group(1)) if unroll_match else 1
parallel = "parallel=True" in decorator_line.replace(" ", "")

loop_line = lines[i + 1].strip()
loop_indent = len(lines[i + 1]) - len(loop_line)
func_name = f"_repy_optimized_loop_{loop_counter}"
loop_counter += 1

modified_lines.append(decorator_line)
modified_lines.append(" " * loop_indent + f"def {func_name}():")
modified_lines.append(" " * (loop_indent + 4) + loop_line)
i += 2
if parallel and loop_line.startswith("for "):
loop_match = re.match(r'for\s+(.*?)\s+in\s+(.*):?', loop_line)
loop_vars = loop_match.group(1).strip() # type: ignore
iter_expr = loop_match.group(2).strip().rstrip(':') # type: ignore
is_tuple_unpack = ',' in loop_vars

body_func_name = f"_repy_loop_body_{loop_counter}"
i += 2
loop_body = []
while i < len(lines):
body_line = lines[i]
if body_line.strip() == "":
loop_body.append(body_line)
i += 1
continue
body_indent = len(body_line) - len(body_line.lstrip())
if body_indent <= loop_indent:
break
relative_indent = body_indent - loop_indent
new_indent = loop_indent + 4 + relative_indent
loop_body.append(" " * new_indent + body_line.lstrip())
i += 1

body_func_lines = [f"def {body_func_name}({loop_vars}):"]
body_func_lines.extend(loop_body)

func_name = f"_repy_optimized_loop_{loop_counter}"
loop_counter += 1

modified_lines.append(" " * loop_indent + decorator_line)
modified_lines.append(" " * loop_indent + f"def {body_func_name}({loop_vars}):")
modified_lines.extend(loop_body)

# Emit executor function
modified_lines.append(" " * loop_indent + f"def {func_name}():")
executor_type = "ThreadPoolExecutor"
modified_lines.append(" " * (loop_indent + 4) + f"from concurrent.futures import {executor_type}")

if is_tuple_unpack:
modified_lines.append(" " * (loop_indent + 4) + "def starmap_pool(fn, iterable):")
modified_lines.append(" " * (loop_indent + 8) + f"with {executor_type}() as pool:")
modified_lines.append(" " * (loop_indent + 12) + "futures = [pool.submit(fn, *args) for args in iterable]")
modified_lines.append(" " * (loop_indent + 12) + "return [f.result() for f in futures]")
modified_lines.append(" " * (loop_indent + 4) + f"starmap_pool({body_func_name}, {iter_expr})")
else:
modified_lines.append(" " * (loop_indent + 4) + f"with {executor_type}() as pool:")
modified_lines.append(" " * (loop_indent + 8) + f"list(pool.map({body_func_name}, {iter_expr}))")

# Call the executor function
modified_lines.append(" " * loop_indent + f"{func_name}()")

# Copy indented body lines until dedent or EOF
while i < len(lines):
body_line = lines[i]
if body_line.strip() == "":
modified_lines.append(body_line)
else:
# Non-parallel loop: wrap as usual
modified_lines.append(" " * loop_indent + decorator_line)
modified_lines.append(" " * loop_indent + f"def {func_name}():")
modified_lines.append(" " * (loop_indent + 4) + loop_line)
i += 2
loop_body = []
while i < len(lines):
body_line = lines[i]
if body_line.strip() == "":
loop_body.append(body_line)
i += 1
continue
body_indent = len(body_line) - len(body_line.lstrip())
if body_indent <= loop_indent:
break
relative_indent = body_indent - loop_indent
new_indent = loop_indent + 4 + relative_indent
loop_body.append(" " * new_indent + body_line.lstrip())
i += 1
continue
body_indent = len(body_line) - len(body_line.lstrip())
if body_indent <= loop_indent:
break
modified_lines.append(
" " * (loop_indent + 8) + body_line.strip())
i += 1

modified_lines.append(" " * loop_indent + f"{func_name}()")

if unroll_factor > 1 and loop_line.startswith("for ") and "range(" in loop_line:
range_match = re.search(r'range\(([^)]+)\)', loop_line)
if range_match:
range_args = range_match.group(1).split(',')
if len(range_args) == 1:
start, end, step = "0", range_args[0].strip(), str(unroll_factor)
elif len(range_args) == 2:
start, end = range_args[0].strip(), range_args[1].strip()
step = str(unroll_factor)
elif len(range_args) == 3:
start, end, step = [arg.strip() for arg in range_args]
step = f"({step}) * {unroll_factor}"

var_match = re.match(r'for\s+(\w+)\s+in\s+range', loop_line)
loop_var = var_match.group(1) if var_match else "i"

new_loop_line = f"for {loop_var} in range({start}, {end}, {step}):"
modified_lines[-1] = " " * (loop_indent + 4) + new_loop_line

for offset in range(unroll_factor):
for body in loop_body:
if offset == 0:
unrolled_line = body
else:
unrolled_line = re.sub(rf'\b{loop_var}\b', f"{loop_var}+{offset}", body)
modified_lines.append(unrolled_line)
else:
modified_lines.extend(loop_body)
else:
modified_lines.extend(loop_body)

modified_lines.append(" " * loop_indent + f"{func_name}()")
else:
modified_lines.append(lines[i])
i += 1
Expand Down Expand Up @@ -98,7 +186,6 @@ def nest(parts):
for i, line in enumerate(lines):
stripped = line.strip()

# Detect <OPTIMIZE ...> directive
if stripped.startswith("<OPTIMIZE") and stripped.endswith(">"):
match = re.match(r'<OPTIMIZE\s+(.+?)>', stripped)
if match:
Expand Down Expand Up @@ -126,14 +213,15 @@ def nest(parts):
modified_code.append(f"# {processed_line[:-2].strip()}")
continue

# Apply decorator before next block
if pending_optimize:
loop_indent = len(lines[i]) - len(lines[i].lstrip())
if re.match(r'^\s*(for|while)\s+.*\{', processed_line):
modified_code.append(f"@optimize_loop({pending_optimize})")

required_imports.add("optimize_loop")
pending_optimize = None
elif re.match(r'^\s*def\s+.*\{', processed_line):
modified_code.append(f"@optimize_function({pending_optimize})")
modified_code.append(" " * loop_indent + f"@optimize_function({pending_optimize})")
required_imports.add("optimize_function")
pending_optimize = None

Expand Down
62 changes: 35 additions & 27 deletions restructuredpython/predefined/subinterpreter/optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,63 +16,71 @@
import time
import sys
import functools
import types
import dis
import multiprocessing
import warnings


def optimize_loop(profile=False, gct=False, parallel=False, unroll=0):
def optimize_loop(profile=False, gct=False, multithreading=False, parallel=False, cache=False, unroll=0):
"""
Decorator to optimize loop-like functions.
Supports profiling, garbage collection, parallel execution, caching, and JIT compilation.
"""
def decorator(fn):
@functools.wraps(fn)
original_fn = fn

# Caching
if cache:
fn = functools.lru_cache(maxsize=None)(fn)

@functools.wraps(original_fn)
def wrapper(*args, **kwargs):
# Garbage Collection
if gct:
gc.collect()

# Profiling
start = time.perf_counter() if profile else None

# Execute loop
if parallel:
# Naive parallelism: assuming fn yields or returns a list
results = fn(*args, **kwargs)
with multiprocessing.Pool() as pool:
pool.map(lambda x: x, results)
else:
fn(*args, **kwargs)
fn(*args, **kwargs)

if profile:
duration = time.perf_counter() - start
duration = time.perf_counter() - start # type: ignore
print(f"[PROFILE] Loop took {duration:.4f}s")

return wrapper
return decorator


def optimize_function(profile=False, trace=False):
def optimize_function(profile=False, trace=False, cache=False, parallel=False):
"""
Decorator to optimize general functions.
Supports profiling, tracing, caching
"""
def decorator(fn):
original_fn = fn

if cache:
fn = functools.lru_cache(maxsize=None)(fn)

if profile:
@functools.wraps(fn)
@functools.wraps(original_fn)
def profiled(*args, **kwargs):
start = time.perf_counter()
result = fn(*args, **kwargs)
print(
f"[PROFILE] {
fn.__name__} took {
time.perf_counter() -
start:.4f}s")
result = fn(*args, **kwargs) # type: ignore
print(f"[PROFILE] {original_fn.__name__} took {time.perf_counter() - start:.4f}s")
return result
return profiled

if trace:
def tracer(frame, event, arg):
print(f"[TRACE] {event} in {frame.f_code.co_name}")
return tracer

def wrapped(*args, **kwargs):
@functools.wraps(original_fn)
def traced(*args, **kwargs):
sys.settrace(tracer)
result = fn(*args, **kwargs)
result = fn(*args, **kwargs) # type: ignore
sys.settrace(None)
return result
return wrapped
return traced

return fn

return decorator
Loading