arklexai · alexjyc · Aug 4, 2025
diff --git a/README.md b/README.md
@@ -20,7 +20,8 @@ Launching Streamlit: `streamlit run src/w2_app.py`
 
 Import zipped directory of the PDFs
 
-### Python
+## Document Extraction with OCR Process
+#### For working with the OCR pipeline and optimizing prompts:
 ```bash
 python src/w2_extract.py --fieldpath <fieldpath> --filepath <filepath> [options]
 ```
@@ -38,9 +39,31 @@ python src/w2_extract.py --fieldpath <fieldpath> --filepath <filepath> [options]
 - `--spatial_ocr`: Enable spatial OCR with coordinate information
 - `--prompt_opt`: Enable prompt optimization with evaluation
 - `--label_file FILE`: Path to label file for evaluation (required when using --prompt_opt)
+- `--train_file FILE`: Path to the training file for prompt optimization (required when using --prompt_opt)
+- `--train_label FILE`: Path to the training label file for prompt optimization (required when using --prompt_opt)
 
 Should be able to handle PDF, directories, and zipped directory paths
 
+## OCR-based Extraction and Prompt Optimization
+#### For working with pre-extracted OCR data and optimizing prompts:
+```bash
+python src/extract_ocr.py --fieldpath <fieldpath> --test_file <test_file> [options]
+```
+
+##### Required Arguments:
+- `--fieldpath`: Path to the field definitions file (.json, .yaml, or .yml)
+- `--test_file`: Path to the test OCR data file (.json)
+
+##### Optional Arguments:
+- `--test_label`: Path to the test labels file (.csv) for evaluation
+- `--training_file`: Path to the training OCR data file (.json) for prompt optimization
+- `--training_label`: Path to the training label file (.csv) for prompt optimization
+- `--file_out`: Path to save the output CSV or Excel file
+- `--model_type {gpt-4o-mini,gpt-4.1,gpt-o3}`: Model type for extraction (default: gpt-4o-mini)
+- `--prompt_opt`: Enable prompt optimization (requires training_file and training_label)
+- `--opt_iterations`: Number of optimization iterations (default: 3)
+- `--max_workers`: Number of worker threads for parallel processing (default: 4)
+
 ## Evaluation
 
 ### Standalone Evaluation

diff --git a/src/example.py b/src/example.py
@@ -0,0 +1,143 @@
+class Example:
+    """Dict-backed record of extraction inputs and an optional output.
+
+    Values live in an internal ``_store`` dict and can be read or written
+    either as attributes (``ex.fields``) or as items (``ex["fields"]``).
+    Keys listed in ``_input_keys`` (``fields``, ``context`` and ``mode`` by
+    default) are treated as inputs; every other key is treated as a label.
+    """
+
+    def __init__(self, base=None, fields=None, context=None, mode=None, output=None):
+        """Build an example from an optional base plus explicit overrides.
+
+        Args:
+            base: An instance of this class or a plain dict whose contents
+                seed the store (shallow copy). Falsy values and any other
+                type are ignored.
+            fields: Stored under ``"fields"`` when not ``None``.
+            context: Stored under ``"context"`` when not ``None``.
+            mode: Stored under ``"mode"`` when not ``None``.
+            output: Stored under ``"output"`` when not ``None``.
+        """
+        # Internal storage
+        self._store = {}
+        self._demos = []
+        self._input_keys = {'fields', 'context', 'mode'}
+
+        # Initialize from a base Example if provided
+        if base and isinstance(base, type(self)):
+            self._store = base._store.copy()
+            self._input_keys = base._input_keys.copy()
+
+        # Initialize from a dict if provided
+        elif base and isinstance(base, dict):
+            self._store = base.copy()
+
+        # Explicit keyword values win over whatever the base supplied.
+        overrides = (
+            ('fields', fields),
+            ('context', context),
+            ('mode', mode),
+            ('output', output),
+        )
+        for key, value in overrides:
+            if value is not None:
+                self._store[key] = value
+
+    def __getattr__(self, key):
+        """Resolve an attribute that normal lookup missed from the store.
+
+        Raises:
+            AttributeError: For dunder names and for keys not in the store.
+        """
+        message = f"'{type(self).__name__}' object has no attribute '{key}'"
+        if key.startswith("__") and key.endswith("__"):
+            raise AttributeError(message)
+        # Reach ``_store`` through ``__dict__``: when it does not exist yet
+        # (e.g. an instance made with ``__new__``), ``self._store`` would
+        # re-enter this method and recurse until RecursionError.
+        store = self.__dict__.get("_store")
+        if store is not None and key in store:
+            return store[key]
+        raise AttributeError(message)
+
+    def __setattr__(self, key, value):
+        """Set ``key`` on the instance or in the store.
+
+        Names starting with ``_`` or already defined on the class become real
+        attributes; any other name becomes a store entry.
+        """
+        if key.startswith("_") or key in dir(self.__class__):
+            super().__setattr__(key, value)
+        else:
+            self._store[key] = value
+
+    def __repr__(self):
+        """Return ``ClassName({...})`` showing the stored items."""
+        store = self.__dict__.get("_store", {})
+        return f"{type(self).__name__}({store!r})"
+
+    def __getitem__(self, key):
+        """Return the stored value for ``key``; ``KeyError`` if absent."""
+        return self._store[key]
+
+    def __setitem__(self, key, value):
+        """Store ``value`` under ``key``."""
+        self._store[key] = value
+
+    def __delitem__(self, key):
+        """Remove ``key`` from the store; ``KeyError`` if absent."""
+        del self._store[key]
+
+    def __contains__(self, key):
+        """Return whether ``key`` is in the store."""
+        return key in self._store
+
+    def keys(self):
+        """Return a view of the stored keys."""
+        return self._store.keys()
+
+    def values(self):
+        """Return a view of the stored values."""
+        return self._store.values()
+
+    def get(self, key, default=None):
+        """Return the stored value for ``key``, or ``default`` if absent."""
+        return self._store.get(key, default)
+
+    def inputs(self):
+        """Return a new example holding only the input keys.
+
+        Raises:
+            ValueError: If ``_input_keys`` has been set to ``None``.
+        """
+        if self._input_keys is None:
+            raise ValueError("Inputs have not been set for this example.")
+
+        d = {key: value for key, value in self._store.items() if key in self._input_keys}
+        new_instance = type(self)(base=d)
+        # Copy so later edits to one example's input keys cannot leak into
+        # the other (matches how __init__ copies them from a base example).
+        new_instance._input_keys = self._input_keys.copy()
+        return new_instance
+
+    def labels(self):
+        """Return a new example holding every non-input key."""
+        input_keys = self.inputs().keys()
+        d = {key: value for key, value in self._store.items() if key not in input_keys}
+        return type(self)(d)
+
+    def copy(self, **kwargs):
+        """Return a shallow copy; ``kwargs`` are forwarded to ``__init__``."""
+        return type(self)(base=self, **kwargs)
+
+    def without(self, *keys):
+        """Return a copy with the given keys removed; missing keys are skipped."""
+        copied = self.copy()
+        for key in keys:
+            copied._store.pop(key, None)
+        return copied
+
+    def to_dict(self):
+        """Return a shallow copy of the store as a plain dict."""
+        return self._store.copy()