diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5147e8f1..50bf3fb9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -79,6 +79,7 @@ repos: hooks: - id: codespell additional_dependencies: [ 'tomli' ] + exclude: tests/.*\.py - repo: https://github.com/python-jsonschema/check-jsonschema rev: 0.31.1 hooks: diff --git a/CHANGES.rst b/CHANGES.rst index 7e67fff1..344aa650 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -22,7 +22,8 @@ New features and enhancements Breaking changes ^^^^^^^^^^^^^^^^ -* ``cdm_reader_mapper.cdm_mapper``: rename `map_and_covnert` to helper function `_map_and_convert` (:pull:`343`) +* ``cdm_reader_mapper.cdm_mapper``: rename `map_and_convert` to helper function `_map_and_convert` (:pull:`343`) +* replace `logging.error` with `raise` error statements (:pull:`345`) Internal changes ^^^^^^^^^^^^^^^^ @@ -34,6 +35,22 @@ Internal changes * ``cdm_reader_mapper.cdm_mapper``: introduce some helper functions (:pull:`324`) * add more unit tests (:issue:`311`, :pull:`324`) * ``cdm_reader_mapper.cdm_mapper``: split `map_and_convert` into multiple helper functions (:issue:`333`, :pull:`343`) +* exclude tests/*.py from `pre-commit` codespell hook (:pull:`345`) +* replace many `os` functions with `pathlib.Path` (:pull:`345`) +* re-work `mdf_reader` (:issue:`334`, :pull:`345`) + + * remove `reader.MDFFileReader` class + * remove `utils.configurator` module + * remove both `utils.decoder` and `mdf_reader.utils.converter` modules + * introduce `utils.parser` module: bunch of functions to parse input data into MDF data + * introduce `utils.convert_and_decode`: make converter and decoder functions more modular + * make `utils.validator` module more modular + * `utils.filereader.FileReader` uses `utils.parser` function for parsing + * move many helper functions to `utils.utilities` + * serialize `schemas.schemas` module + +* add type hints and docstrings to `mdf_reader` (:pull:`345`) +* add unit tests for `mdf_reader` module to 
testing suite (:pull:`345`) Bug fixes ^^^^^^^^^ diff --git a/cdm_reader_mapper/common/inspect.py b/cdm_reader_mapper/common/inspect.py index 6ee3fbcf..267caafa 100755 --- a/cdm_reader_mapper/common/inspect.py +++ b/cdm_reader_mapper/common/inspect.py @@ -12,7 +12,7 @@ import pandas as pd -from cdm_reader_mapper.common import pandas_TextParser_hdlr +from . import pandas_TextParser_hdlr def _count_by_cat(series) -> dict: diff --git a/cdm_reader_mapper/common/replace.py b/cdm_reader_mapper/common/replace.py index 6e07fb48..15426ec0 100755 --- a/cdm_reader_mapper/common/replace.py +++ b/cdm_reader_mapper/common/replace.py @@ -22,7 +22,7 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr +from . import logging_hdlr def replace_columns( diff --git a/cdm_reader_mapper/mdf_reader/codes/codes.py b/cdm_reader_mapper/mdf_reader/codes/codes.py index 6ba65924..66b8a679 100755 --- a/cdm_reader_mapper/mdf_reader/codes/codes.py +++ b/cdm_reader_mapper/mdf_reader/codes/codes.py @@ -9,8 +9,6 @@ from __future__ import annotations -import logging -import os from pathlib import Path from cdm_reader_mapper.common.json_dict import ( @@ -23,53 +21,60 @@ def read_table( - code_table_name, - imodel=None, - ext_table_path=None, + code_table_name: str, + imodel: str | None = None, + ext_table_path: str | None = None, ) -> dict: """ - Read a data model code table file to a dictionary. + Load a data model code table into a Python dictionary. - It completes the code table to the full complexity - the data reader expects, by appending information - on secondary keys and expanding range keys. + The code table may define secondary keys, range expansions, or other + structures required by the data reader. This function resolves the + file location either from an external path or an internal data model. - Parameter - --------- - code_table_name: str - The external code table file. - imodel: str, optional - Name of internally available input data model. - e.g. 
icoads_r300_d704 - ext_table_path: str, optional - The path to the external code table file. - One of ``imodel`` and ``ext_table_path`` must be set. + Parameters + ---------- + code_table_name : str + The name of the code table (without file extension). + e.g., `"ICOADS.C0.IM"` + imodel : str, optional + Internal data model name, e.g., `"icoads_r300_d704"`. Required if + `ext_table_path` is not provided. + ext_table_path : str, optional + External path containing the code table file. If set, this path + takes precedence over `imodel`. Returns ------- - dict - Code table + Dict + The fully combined code table dictionary. + + Raises + ------ + FileNotFoundError + If the specified table file cannot be found. + ValueError + If neither `imodel` nor `ext_table_path` is provided. """ - # 1. Validate input if ext_table_path: - table_path = os.path.abspath(ext_table_path) - table_files = os.path.join(table_path, code_table_name + ".json") - if not os.path.isfile(table_files): - logging.error(f"Can't find input code table file {table_files}") - return - table_files = Path(table_files) - else: - imodel = imodel.split("_") + table_path = Path(ext_table_path).resolve() + table_file = table_path / f"{code_table_name}.json" + if not table_file.is_file(): + raise FileNotFoundError(f"Can't find input code table file {table_file}") + table_files = [table_file] + elif imodel: + parts = imodel.split("_") table_files = collect_json_files( - *imodel, + *parts, base=f"{properties._base}.codes", name=code_table_name, ) - if isinstance(table_files, Path): - table_files = [table_files] - # 2. Get tables + if isinstance(table_files, Path): + table_files = [table_files] + else: + raise ValueError("One of 'imodel' or 'ext_table_path' must be set") + tables = [open_json_file(ifile) for ifile in table_files] - # 3. 
Combine tables return combine_dicts(tables) diff --git a/cdm_reader_mapper/mdf_reader/reader.py b/cdm_reader_mapper/mdf_reader/reader.py index 1e5e9a5f..c921c031 100755 --- a/cdm_reader_mapper/mdf_reader/reader.py +++ b/cdm_reader_mapper/mdf_reader/reader.py @@ -2,342 +2,87 @@ from __future__ import annotations -import ast -import csv -import logging -import os from io import StringIO as StringIO +from pathlib import Path -import pandas as pd +from cdm_reader_mapper import DataBundle -from cdm_reader_mapper.common.json_dict import open_json_file -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy -from cdm_reader_mapper.core.databundle import DataBundle +from ..common.json_dict import open_json_file -from . import properties from .utils.filereader import FileReader -from .utils.utilities import adjust_dtype, remove_boolean_values, validate_arg -from .utils.validators import validate - +from .utils.utilities import validate_arg + +from .utils.utilities import as_list, as_path, read_csv + + +def validate_read_mdf_args( + *, + source: str | Path, + imodel: str | None = None, + ext_schema_path: str | Path | None = None, + ext_schema_file: str | Path | None = None, + year_init: int | None = None, + year_end: int | None = None, + chunksize: int | None = None, + skiprows: int | None = None, +): + """ + Validate arguments for reading an MDF file. -class MDFFileReader(FileReader): - """Class to represent reader output. + This function performs validation on file paths and numeric arguments + required for reading an MDF dataset. 
- Attributes - ---------- - data : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data - mask : pd.DataFrame or pd.io.parsers.TextFileReader - a pandas.DataFrame or pandas.io.parsers.TextFileReader - with the output data validation mask - attrs : dict - a dictionary with the output data elements attributes + Raises + ------ + FileNotFoundError + If the source file does not exist. + ValueError + If required arguments are missing or numeric constraints are violated. """ + source = as_path(source, "source") - def __init__(self, *args, **kwargs): - FileReader.__init__(self, *args, **kwargs) - - def _convert_and_decode( - self, - df, - converter_dict, - converter_kwargs, - decoder_dict, - ) -> pd.DataFrame: - for section in converter_dict.keys(): - if section not in df.columns: - continue - if section in decoder_dict.keys(): - decoded = decoder_dict[section](df[section]) - decoded.index = df[section].index - df[section] = decoded - - converted = converter_dict[section]( - df[section], **converter_kwargs[section] - ) - converted.index = df[section].index - df[section] = converted - return df - - def _validate(self, df) -> pd.DataFrame: - return validate( - data=df, - imodel=self.imodel, - ext_table_path=self.ext_table_path, - schema=self.schema, - disables=self.disable_reads, - ) + if not source.exists(): + raise FileNotFoundError(f"Source file not found: {source}") - def convert_and_decode_entries( - self, - data, - convert=True, - decode=True, - converter_dict=None, - converter_kwargs=None, - decoder_dict=None, - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """Convert and decode data entries by using a pre-defined data model. - - Overwrite attribute `data` with converted and/or decoded data. - - Parameters - ---------- - data: pd.DataFrame or pd.io.parsers.TextFileReader - Data to convert and decode. - convert: bool, default: True - If True convert entries by using a pre-defined data model. 
- decode: bool, default: True - If True decode entries by using a pre-defined data model. - converter_dict: dict of {Hashable: func}, optional - Functions for converting values in specific columns. - If None use information from a pre-defined data model. - converter_kwargs: dict of {Hashable: kwargs}, optional - Key-word arguments for converting values in specific columns. - If None use information from a pre-defined data model. - decoder_dict: dict, optional - Functions for decoding values in specific columns. - If None use information from a pre-defined data model. - """ - if converter_dict is None: - converter_dict = self.configurations["convert_decode"]["converter_dict"] - if converter_kwargs is None: - converter_kwargs = self.configurations["convert_decode"]["converter_kwargs"] - if decoder_dict is None: - decoder_dict = self.configurations["convert_decode"]["decoder_dict"] - if not (convert and decode): - self.dtypes = "object" - return data - if convert is not True: - converter_dict = {} - converter_kwargs = {} - if decode is not True: - decoder_dict = {} - - if isinstance(data, pd.DataFrame): - data = self._convert_and_decode( - data, - converter_dict, - converter_kwargs, - decoder_dict, - ) - else: - data_buffer = StringIO() - TextParser = make_copy(data) - for i, df_ in enumerate(TextParser): - df = self._convert_and_decode( - df_, - converter_dict, - converter_kwargs, - decoder_dict, - ) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data - - def validate_entries( - self, data, validate - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """Validate data entries by using a pre-defined 
data model. - - Fill attribute `valid` with boolean mask. - """ - if validate is not True: - mask = pd.DataFrame(dtype="boolean") - elif isinstance(data, pd.DataFrame): - mask = self._validate(data) - else: - data_buffer = StringIO() - TextParser_ = make_copy(data) - for i, df_ in enumerate(TextParser_): - mask_ = self._validate(df_) - mask_.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - ) - data_buffer.seek(0) - mask = pd.read_csv( - data_buffer, - names=df_.columns, - chunksize=self.chunksize, - dtype="boolean", - ) - return mask - - def remove_boolean_values( - self, data - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """DOCUMENTATION""" - if isinstance(data, pd.DataFrame): - data = data.map(remove_boolean_values) - dtype = adjust_dtype(self.dtypes, data) - return data.astype(dtype) - else: - data_buffer = StringIO() - TextParser = make_copy(data) - for i, df_ in enumerate(TextParser): - df = df_.map(remove_boolean_values) - dtype = adjust_dtype(self.dtypes, df) - date_columns = [] - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=self.encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - date_columns = [] - for i, element in enumerate(list(dtype)): - if dtype.get(element) == "datetime": - date_columns.append(i) - dtype = adjust_dtype(dtype, df) - data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=dtype, - parse_dates=date_columns, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data - - def read( - self, - chunksize=None, - sections=None, - skiprows=0, - convert=True, - decode=True, - converter_dict=None, - converter_kwargs=None, - validate=True, - encoding: str | None = None, - **kwargs, - ) -> DataBundle: - """Read data from disk. 
- - Parameters - ---------- - chunksize : int, optional - Number of reports per chunk. - sections : list, optional - List with subset of data model sections to output, optional - If None read pre-defined data model sections. - skiprows : int - Number of initial rows to skip from file, default: 0 - convert: bool, default: True - If True convert entries by using a pre-defined data model. - decode: bool, default: True - If True decode entries by using a pre-defined data model. - converter_dict: dict of {Hashable: func}, optional - Functions for converting values in specific columns. - If None use information from a pre-defined data model. - converter_kwargs: dict of {Hashable: kwargs}, optional - Key-word arguments for converting values in specific columns. - If None use information from a pre-defined data model. - validate: bool, default: True - Validate data entries by using a pre-defined data model. - encoding: str, optional - Encoding of the input file, overrides the value in the imodel schema - """ - # 0. VALIDATE INPUT - if not validate_arg("sections", sections, list): - return - if not validate_arg("chunksize", chunksize, int): - return - if not validate_arg("skiprows", skiprows, int): - return - - self.chunksize = chunksize - self.skiprows = skiprows - - # 2. READ AND VALIDATE DATA - logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}") - # 2.1. 
Subset data model sections to requested sections - parsing_order = self.schema["header"].get("parsing_order") - sections_ = [x.get(y) for x in parsing_order for y in x] - read_sections_list = [y for x in sections_ for y in x] - if sections is None: - sections = read_sections_list - - # 2.2 Homogenize input data to an iterable with dataframes: - # a list with a single dataframe or a pd.io.parsers.TextFileReader - logging.info("Getting data string from source...") - self.configurations = self.get_configurations(read_sections_list, sections) - self.encoding = encoding or self.encoding - data = self.open_data( - read_sections_list, - sections, - # INFO: Set default as "pandas" to account for custom schema - open_with=properties.open_file.get(self.imodel, "pandas"), - encoding=self.encoding, - chunksize=chunksize, + if not imodel and not (ext_schema_path or ext_schema_file): + raise ValueError( + "One of imodel or ext_schema_path/ext_schema_file must be provided" ) - # 2.3. Extract, read and validate data in same loop - logging.info("Extracting and reading sections") - data = self.convert_and_decode_entries( - data, - convert=convert, - decode=decode, - ) - mask = self.validate_entries(data, validate) - - # 3. 
Create output DataBundle object - logging.info("Create an output DataBundle object") - data = self.remove_boolean_values(data) - return DataBundle( - data=data, - columns=self.columns, - dtypes=self.dtypes, - parse_dates=self.parse_dates, - encoding=self.encoding, - mask=mask, - imodel=self.imodel, - ) + validate_arg("chunksize", chunksize, int) + if chunksize is not None and chunksize <= 0: + raise ValueError("chunksize must be a positive integer") + + validate_arg("skiprows", skiprows, int) + if skiprows is not None and skiprows < 0: + raise ValueError("skiprows must be >= 0") + + if year_init is not None and year_end is not None: + if year_init > year_end: + raise ValueError("year_init must be <= year_end") def read_mdf( source, - imodel=None, - ext_schema_path=None, - ext_schema_file=None, - ext_table_path=None, - year_init=None, - year_end=None, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, + ext_table_path: str | None = None, + year_init: int | None = None, + year_end: int | None = None, encoding: str | None = None, - **kwargs, + chunksize: int | None = None, + skiprows: int = None, + convert_flag: bool = True, + converter_dict: dict | None = None, + converter_kwargs: dict | None = None, + decode_flag: bool = True, + decoder_dict: dict | None = None, + validate_flag: bool = True, + sections: str | list | None = None, + excludes: str | list | None = None, + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, ) -> DataBundle: """Read data files compliant with a user specific data model. @@ -362,14 +107,38 @@ def read_mdf( ext_schema_file: str, optional The external input data model schema file. One of ``imodel`` and ``ext_schema_path`` or ``ext_schema_file`` must be set. - ext_table_path: str, optional - The path to the external input data model code tables. year_init: str or int, optional Left border of time axis. year_end: str or int, optional Right border of time axis. 
encoding : str, optional The encoding of the input file. Overrides the value in the imodel schema file. + chunksize : int, optional + Number of reports per chunk. + skiprows : int, optional + Number of initial rows to skip from file, default: 0 + convert_flag: bool, default: True + If True convert entries by using a pre-defined data model. + converter_dict: dict of {Hashable: func}, optional + Functions for converting values in specific columns. + If None use information from a pre-defined data model. + converter_kwargs: dict of {Hashable: kwargs}, optional + Key-word arguments for converting values in specific columns. + If None use information from a pre-defined data model. + decode_flag: bool, default: True + If True decode entries by using a pre-defined data model. + decoder_dict: dict of {Hashable: func}, optional + Functions for decoding values in specific columns. + If None use information from a pre-defined data model. + validate_flag: bool, default: True + Validate data entries by using a pre-defined data model. + sections : list, optional + List with subset of data model sections to output, optional + If None read pre-defined data model sections. + pd_kwargs: dict, optional + Additional pandas arguments + xr_kwargs: dict, optional + Additional xarray arguments Returns ------- @@ -384,28 +153,70 @@ def read_mdf( write_data : Write MDF data and validation mask to disk. write_tables : Write CDM tables to disk. 
""" - - def get_list_element(lst, idx): - try: - return lst[idx] - except IndexError: - return None - - logging.basicConfig( - format="%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s", - level=logging.INFO, - datefmt="%Y%m%d %H:%M:%S", - filename=None, - ) - return MDFFileReader( + if skiprows is None: + skiprows = 0 + validate_read_mdf_args( source=source, imodel=imodel, ext_schema_path=ext_schema_path, ext_schema_file=ext_schema_file, + year_init=year_init, + year_end=year_end, + chunksize=chunksize, + skiprows=skiprows, + ) + + pd_kwargs = pd_kwargs or {} + pd_kwargs.setdefault("encoding", encoding) + pd_kwargs.setdefault("chunksize", chunksize) + pd_kwargs.setdefault("skiprows", skiprows) + + xr_kwargs = xr_kwargs or {} + + convert_kwargs = dict( + convert_flag=convert_flag, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + ) + + decode_kwargs = dict( + decode_flag=decode_flag, + decoder_dict=decoder_dict, + ) + + validate_kwargs = dict( + validate_flag=validate_flag, ext_table_path=ext_table_path, + ) + + sections = as_list(sections) + excludes = as_list(excludes) + + validate_arg("sections", sections, list) + validate_arg("excludes", excludes, list) + + select_kwargs = dict( + sections=sections, + excludes=excludes, year_init=year_init, year_end=year_end, - ).read(encoding=encoding, **kwargs) + ) + + filereader = FileReader( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + return filereader.read( + source=source, + pd_kwargs=pd_kwargs, + xr_kwargs=xr_kwargs, + convert_kwargs=convert_kwargs, + decode_kwargs=decode_kwargs, + validate_kwargs=validate_kwargs, + select_kwargs=select_kwargs, + ) def read_data( @@ -456,52 +267,25 @@ def read_data( write_data : Write MDF data and validation mask to disk. write_tables : Write CDM tables to disk. 
""" - - def _update_column_labels(columns): - new_cols = [] - for col in columns: - try: - col_ = ast.literal_eval(col) - except SyntaxError: - col_ = tuple(col.split(":")) - except ValueError: - col_ = col - new_cols.append(col_) - - if all(isinstance(c, tuple) for c in new_cols): - return pd.MultiIndex.from_tuples(new_cols) - - return pd.Index(new_cols) - - def _read_csv(ifile, col_subset=None, **kwargs): - if ifile is None or not os.path.isfile(ifile): - return pd.DataFrame() - - df = pd.read_csv(ifile, delimiter=",", **kwargs) - df.columns = _update_column_labels(df.columns) - if col_subset is not None: - df = df[col_subset] - - return df - - if info is None: - info_dict = {} - else: - info_dict = open_json_file(info) - + info_dict = open_json_file(info) if info else {} dtype = info_dict.get("dtypes", "object") parse_dates = info_dict.get("parse_dates", False) - if encoding is None: - encoding = info_dict.get("encoding", None) + encoding = encoding or info_dict.get("encoding", None) - data = _read_csv( + pd_kwargs = kwargs.copy() + pd_kwargs.setdefault("dtype", dtype) + pd_kwargs.setdefault("parse_dates", parse_dates) + pd_kwargs.setdefault("encoding", encoding) + + data = read_csv( source, col_subset=col_subset, - dtype=dtype, - parse_dates=parse_dates, - encoding=encoding, + **pd_kwargs, ) - mask = _read_csv(mask, col_subset=col_subset, dtype="boolean") + mask = read_csv(mask, col_subset=col_subset, dtype="boolean") + if not mask.empty: + mask = mask.reindex(columns=data.columns) + return DataBundle( data=data, columns=data.columns, diff --git a/cdm_reader_mapper/mdf_reader/schemas/schemas.py b/cdm_reader_mapper/mdf_reader/schemas/schemas.py index cd34f7e2..96ff7718 100755 --- a/cdm_reader_mapper/mdf_reader/schemas/schemas.py +++ b/cdm_reader_mapper/mdf_reader/schemas/schemas.py @@ -9,115 +9,151 @@ from __future__ import annotations -import logging -import os from pathlib import Path +from typing import TypedDict from cdm_reader_mapper.common.json_dict import 
collect_json_files, combine_dicts from .. import properties -def convert_dtype_to_default(dtype, section, element) -> str: - """Convert data type to defaults (int, float).""" - if dtype is None: - return - elif dtype == "float": - return dtype - elif dtype == "int": - return properties.pandas_int - elif "float" in dtype.lower(): - logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to float." - ) - return "float" - elif "int" in dtype.lower(): - logging.warning( - f"Set column type of ({section}, {element}) from deprecated {dtype} to int." - ) - return properties.pandas_int - return dtype - - -def _read_schema(schema) -> dict: - """DOCUMENTATION.""" - if not schema["header"]: - if not schema["sections"]: - logging.error( - f"'sections' block needs to be defined in a schema with no header. Error in data model schema file {schema['name']}" - ) - return - schema["header"] = dict() - - if schema["header"].get("multiple_reports_per_line"): - logging.error("Multiple reports per line data model: not yet supported") - return - - # 3.2. Make no section formats be internally treated as 1 section format - if not schema.get("sections"): - if not schema.get("elements"): - logging.error( - f"Data elements not defined in data model schema file {schema['name']} under key 'elements' " - ) - return - schema["sections"] = { - properties.dummy_level: { - "header": {}, - "elements": schema.get("elements"), - } +class SectionDict(TypedDict, total=False): + """ + Schema definition for a single section within a report. + + Attributes + ---------- + header : dict, optional + Metadata or configuration for the section header. + elements : dict, optional + Dictionary of elements/fields contained within the section. + """ + + header: dict + elements: dict + + +class SchemaHeaderDict(TypedDict, total=False): + """ + Schema definition for the report header. 
+ + Attributes + ---------- + parsing_order : list[dict], optional + List of dictionaries defining the order in which header fields are parsed. + delimiter : str, optional + Delimiter used to separate fields in the header. + field_layout : str, optional + Layout or format of the fields (e.g., fixed width, CSV). + format : str, optional + General format type of the header. + encoding : str, optional + Text encoding for the header, e.g., 'utf-8'. + multiple_reports_per_line : bool, optional + Whether multiple reports may appear on a single line. + """ + + parsing_order: list[dict] + delimiter: str + field_layout: str + format: str + encoding: str + multiple_reports_per_line: bool + + +class SchemaDict(TypedDict, total=False): + """ + Complete schema definition for a report. + + Attributes + ---------- + header : SchemaHeaderDict, optional + Configuration for the report header. + sections : dict[str, SectionDict], optional + Mapping of section names to section schemas. + elements : dict, optional + Mapping of element names to their attributes. + name : list[Path], optional + List of Path objects representing schema files or sources. + imodel : str | None, optional + Name of the internal data model, if applicable. 
+ """ + + header: SchemaHeaderDict + sections: dict[str, SectionDict] + elements: dict + name: list[Path] + imodel: str | None + + +def _resolve_schema_files( + *, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, +) -> list[Path]: + """Determine which schema file(s) to use based on the input parameters.""" + if ext_schema_file: + path = Path(ext_schema_file) + if not path.is_file(): + raise FileNotFoundError(f"Can't find input schema file {ext_schema_file}") + return [path] + + if ext_schema_path: + schema_path = Path(ext_schema_path).resolve() + path = schema_path / f"{schema_path.name}.json" + if not path.is_file(): + raise FileNotFoundError(f"Can't find input schema path {ext_schema_path}") + return [path] + + if imodel: + parts = imodel.split("_") + model = parts[0] + if model not in properties.supported_data_models: + raise ValueError(f"Input data model {model} not supported") + + return collect_json_files(*parts, base=f"{properties._base}.schemas") + + raise ValueError( + "One of 'imodel', 'ext_schema_path', or 'ext_schema_file' must be set" + ) + + +def _normalize_schema(schema: SchemaDict) -> SchemaDict: + """Normalize a schema dictionary by ensuring it has sections and a parsing order.""" + header = schema.get("header", {}) + sections = schema.get("sections") + elements = schema.get("elements") + + if not sections: + if not elements: + raise KeyError("Schema has no sections and no elements") + level = properties.dummy_level + dummy_header = { + k: header[k] for k in ("delimiter", "field_layout", "format") if k in header } - schema["header"]["parsing_order"] = [{"s": [properties.dummy_level]}] - schema.pop("elements", None) - schema["sections"][properties.dummy_level]["header"]["delimiter"] = schema[ - "header" - ].get("delimiter") - schema["header"].pop("delimiter", None) - schema["sections"][properties.dummy_level]["header"]["field_layout"] = schema[ - "header" - ].get("field_layout") - 
schema["header"].pop("field_layout", None) - schema["sections"][properties.dummy_level]["header"]["format"] = schema[ - "header" - ].get("format") - schema["header"].pop("format", None) - - # 3.3. Make parsing order explicit - if not schema["header"].get("parsing_order"): # assume sequential - schema["header"]["parsing_order"] = [{"s": list(schema["sections"].keys())}] - - # 3.4. Make disable_read and field_layout explicit: this is ruled by delimiter being set, - # unless explicitly set - for section in schema["sections"].keys(): - if schema["sections"][section]["header"].get("disable_read"): - continue - else: - schema["sections"][section]["header"]["disable_read"] = False - if not schema["sections"][section]["header"].get("field_layout"): - delimiter = schema["sections"][section]["header"].get("delimiter") - schema["sections"][section]["header"]["field_layout"] = ( - "delimited" if delimiter else "fixed_width" - ) - for element in schema["sections"][section]["elements"].keys(): - column_type = schema["sections"][section]["elements"][element].get( - "column_type" - ) - schema["sections"][section]["elements"][element]["column_type"] = ( - convert_dtype_to_default( - column_type, - section, - element, - ) - ) - return schema - - -def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) -> dict: + sections = {level: {"header": dummy_header, "elements": elements}} + schema = {k: v for k, v in schema.items() if k != "elements"} + + header = { + **header, + "parsing_order": header.get("parsing_order") or [{"s": list(sections.keys())}], + } + + return {**schema, "header": header, "sections": sections} + + +def read_schema( + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, +) -> SchemaDict: """ - Read a data model schema file. + Load and normalize a data model schema. 
- Read a data model schema file to a dictionary and - completes it by adding explicitly information the - reader tool needs + Reads a data model schema file into a dictionary and + normalizes it by adding the information required by + the parser. Parameters ---------- @@ -134,99 +170,20 @@ def read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) -> dict Returns ------- - dict + SchemaDict Data model schema """ - # 1. Validate input - if ext_schema_file: - if not os.path.isfile(ext_schema_file): - logging.error(f"Can't find input schema file {ext_schema_file}") - return - schema_files = Path(ext_schema_file) - elif ext_schema_path: - schema_path = os.path.abspath(ext_schema_path) - schema_name = os.path.basename(schema_path) - schema_files = os.path.join(schema_path, schema_name + ".json") - if not os.path.isfile(schema_files): - logging.error(f"Can't find input schema file {schema_files}") - return - schema_files = Path(schema_files) - else: - imodel = imodel.split("_") - if imodel[0] not in properties.supported_data_models: - logging.error("Input data model " f"{imodel[0]}" " not supported") - return - schema_files = collect_json_files(*imodel, base=f"{properties._base}.schemas") - - if isinstance(schema_files, Path): - schema_files = [schema_files] - - # 2. Get schema - schema = combine_dicts(schema_files, base=f"{properties._base}.schemas") - schema["name"] = schema_files - - # 3. Expand schema - # Fill in the initial schema to "full complexity": to homogenize schema, - # explicitly add info that is implicit to given situations/data models - - # One report per record: make sure later changes are reflected in MULTIPLE - # REPORTS PER RECORD case below if we ever use it! - # Currently only supported case: one report per record (line) - # 3.1. First check for no header case: sequential sections - return _read_schema(schema) - - -def df_schema(df_columns, schema) -> dict: - """ - Create simple data model schema dictionary. 
- - Create a simple attribute dictionary for the elements - in a dataframe from its data model schema - - Parameters - ---------- - df_columns : list - The columns in the data frame (data elements from - the data model) - schema : dict - The data model schema + schema_files = _resolve_schema_files( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + raw_schema = combine_dicts(schema_files, base=f"{properties._base}.schemas") - Returns - ------- - dict - Data elements attributes - - """ + enriched = { + **raw_schema, + "name": schema_files, + } - def clean_schema(columns, schema): - # Could optionally add cleaning of element descriptors that only apply - # to the initial reading of the data model: field_length, etc.... - for element in list(schema): - if element not in columns: - schema.pop(element) - - def get_index(idx, lst, section): - if len(lst) == 1: - return idx - return (section, idx) - - flat_schema = dict() - for section in schema.get("sections"): - if schema["sections"].get(section).get("header").get("disable_read"): - flat_schema.update({section: {"column_type": "object"}}) - else: - flat_schema.update( - { - get_index(x, list(schema.get("sections")), section): schema[ - "sections" - ] - .get(section) - .get("elements") - .get(x) - for x in schema["sections"].get(section).get("elements") - } - ) - - clean_schema(df_columns, flat_schema) - return flat_schema + return _normalize_schema(enriched) diff --git a/cdm_reader_mapper/mdf_reader/utils/__init__.py b/cdm_reader_mapper/mdf_reader/utils/__init__.py index 015b78b8..338bd945 100755 --- a/cdm_reader_mapper/mdf_reader/utils/__init__.py +++ b/cdm_reader_mapper/mdf_reader/utils/__init__.py @@ -1,6 +1,3 @@ """Common Data Model (CDM) reader utilities.""" from __future__ import annotations - -from .converters import converters # noqa -from .decoders import decoders # noqa diff --git a/cdm_reader_mapper/mdf_reader/utils/configurator.py 
b/cdm_reader_mapper/mdf_reader/utils/configurator.py deleted file mode 100755 index 43b1358f..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/configurator.py +++ /dev/null @@ -1,267 +0,0 @@ -"""Auxiliary functions and class for reading, converting, decoding and validating MDF files.""" - -from __future__ import annotations - -import ast -import csv -import logging - -import numpy as np -import pandas as pd - -from itertools import zip_longest - -from .. import properties -from . import converters, decoders -from .utilities import convert_dtypes - - -class Configurator: - """Class for configuring MDF reader information.""" - - def __init__( - self, - df=pd.DataFrame(), - schema=None, - order=None, - valid=None, - ): - self.df = df - self.orders = order or [] - self.valid = valid or [] - self.schema = schema or {} - - def _validate_sentinel(self, i, line, sentinel) -> bool: - slen = len(sentinel) - str_start = line[i : i + slen] - return str_start == sentinel - - def _get_index(self, section, order) -> dict | tuple[str, dict]: - if len(self.orders) == 1: - return section - else: - return (order, section) - - def _get_ignore(self, section_dict) -> bool: - ignore = section_dict.get("ignore") - if isinstance(ignore, str): - ignore = ast.literal_eval(ignore) - return ignore - - def _get_dtype(self) -> str: - return properties.pandas_dtypes.get(self.sections_dict.get("column_type")) - - def _get_converter(self) -> callable: - return converters.get(self.sections_dict.get("column_type")) - - def _get_conv_kwargs(self) -> dict: - column_type = self.sections_dict.get("column_type") - if column_type is None: - return - return { - converter_arg: self.sections_dict.get(converter_arg) - for converter_arg in properties.data_type_conversion_args.get(column_type) - } - - def _get_decoder(self) -> callable | None: - encoding = self.sections_dict.get("encoding") - if encoding is None: - return - column_type = self.sections_dict.get("column_type") - if column_type is None: - return - 
return decoders.get(encoding).get(column_type) - - def _update_dtypes(self, dtypes, index) -> dict: - dtype = self._get_dtype() - if dtype: - dtypes[index] = dtype - return dtypes - - def _update_converters(self, converters, index) -> dict: - converter = self._get_converter() - if converter: - converters[index] = converter - return converters - - def _update_kwargs(self, kwargs, index) -> dict: - conv_kwargs = self._get_conv_kwargs() - if conv_kwargs: - kwargs[index] = conv_kwargs - return kwargs - - def _update_decoders(self, decoders, index) -> dict: - decoder = self._get_decoder() - if decoder: - decoders[index] = decoder - return decoders - - def get_configuration(self) -> dict: - """Get ICOADS data model specific information.""" - disable_reads = [] - dtypes = {} - converters = {} - kwargs = {} - decoders = {} - for order in self.orders: - self.order = order - header = self.schema["sections"][order]["header"] - disable_read = header.get("disable_read") - if disable_read is True: - disable_reads.append(order) - continue - sections = self.schema["sections"][order]["elements"] - for section in sections.keys(): - self.sections_dict = sections[section] - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore( - self.sections_dict - ) - if ignore is True: - continue - dtypes = self._update_dtypes(dtypes, index) - converters = self._update_converters(converters, index) - kwargs = self._update_kwargs(kwargs, index) - decoders = self._update_decoders(decoders, index) - - dtypes, parse_dates = convert_dtypes(dtypes) - return { - "convert_decode": { - "converter_dict": converters, - "converter_kwargs": kwargs, - "decoder_dict": decoders, - }, - "self": { - "dtypes": dtypes, - "disable_reads": disable_reads, - "parse_dates": parse_dates, - "encoding": self.schema["header"].get("encoding", "utf-8"), - }, - } - - def open_pandas(self) -> pd.DataFrame: - """Open TextParser to pd.DataSeries.""" - return self.df.apply(lambda x: 
self._read_line(x[0]), axis=1) - - def _process_section( - self, line: str, i: int, order: str, header: dict, data_dict: dict - ) -> int: - sections = self.schema["sections"][order]["elements"] - section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH) - delimiter = header.get("delimiter") - field_layout = header.get("field_layout") - sentinel = header.get("sentinel") - bad_sentinel = sentinel is not None and not self._validate_sentinel( - i, line, sentinel - ) - k = i + section_length - - if delimiter and header.get("format") == "delimited": - fields = list(csv.reader([line[i:]], delimiter=delimiter))[0] - for field_name, field in zip_longest( - sections.keys(), fields, fillvalue=None - ): - index = self._get_index(field_name, order) - data_dict[index] = field.strip() if field is not None else None - if field is not None: - i += len(field) - return i - - if delimiter and field_layout != "fixed_width": - logging.error( - f"Delimiter for {order} is set to {delimiter}. " - f"Please specify either format or field_layout in your header schema {header}." 
- ) - return i - - for section, section_dict in sections.items(): - missing = True - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore(section_dict) - na_value = section_dict.get("missing_value") - field_length = section_dict.get( - "field_length", properties.MAX_FULL_REPORT_WIDTH - ) - - j = i if bad_sentinel else i + field_length - if j > k: - missing = False - j = k - - if not ignore: - value = line[i:j] - if not value.strip() or value == na_value: - value = True - if i == j and missing: - value = False - data_dict[index] = value - - if delimiter and line[j : j + len(delimiter)] == delimiter: - j += len(delimiter) - - i = j - - return i - - def _read_line(self, line: str) -> pd.Series: - i = 0 - data_dict = {} - - for order in self.orders: - header = self.schema["sections"][order]["header"] - - if header.get("disable_read") is True: - data_dict[order] = line[i : properties.MAX_FULL_REPORT_WIDTH] - continue - - i = self._process_section(line, i, order, header, data_dict) - - return pd.Series(data_dict) - - def open_netcdf(self) -> pd.DataFrame: - """Open netCDF to pd.Series.""" - - def replace_empty_strings(series): - if series.dtype == "object": - series = series.str.decode("utf-8") - series = series.str.strip() - series = series.map(lambda x: True if x == "" else x) - return series - - missing_values = [] - attrs = {} - renames = {} - disables = [] - for order in self.orders: - self.order = order - header = self.schema["sections"][order]["header"] - disable_read = header.get("disable_read") - if disable_read is True: - disables.append(order) - continue - sections = self.schema["sections"][order]["elements"] - for section in sections.keys(): - self.sections_dict = sections[section] - index = self._get_index(section, order) - ignore = (order not in self.valid) or self._get_ignore( - self.sections_dict - ) - if ignore is True: - continue - if section in self.df.data_vars: - renames[section] = index - elif section in 
self.df.dims: - renames[section] = index - elif section in self.df.attrs: - attrs[index] = self.df.attrs[section] - else: - missing_values.append(index) - - df = self.df[renames.keys()].to_dataframe().reset_index() - attrs = {k: v.replace("\n", "; ") for k, v in attrs.items()} - df = df.rename(columns=renames) - df = df.assign(**attrs) - df[disables] = np.nan - df = df.apply(lambda x: replace_empty_strings(x)) - df[missing_values] = False - return df diff --git a/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py b/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py new file mode 100755 index 00000000..121eced5 --- /dev/null +++ b/cdm_reader_mapper/mdf_reader/utils/convert_and_decode.py @@ -0,0 +1,370 @@ +"""pandas converting operators.""" + +from __future__ import annotations + +from decimal import Decimal, InvalidOperation +from typing import Callable, Any + +import pandas as pd + +from .. import properties +from .utilities import convert_str_boolean + + +def max_decimal_places(*decimals: Decimal) -> int: + """ + Return the maximum number of decimal places among Decimal values. + + Parameters + ---------- + decimals : Decimal + One or more Decimal values. + + Returns + ------- + int + Maximum number of decimal places. + """ + return max( + (-d.as_tuple().exponent if d.as_tuple().exponent < 0 else 0) for d in decimals + ) + + +def to_numeric(x: Any, scale: Decimal, offset: Decimal) -> Decimal | bool: + """ + Convert a value to a scaled Decimal with offset applied. + + Rules + ----- + - Boolean values are returned unchanged + - Empty or invalid values return False + - Strings are stripped and spaces replaced with zeros + - Result is quantized to the maximum decimal precision + of input, scale, or offset + + Parameters + ---------- + x : Any + Input value to convert. + scale : Decimal + Scale factor. + offset : Decimal + Offset value. + + Returns + ------- + Decimal | bool + Converted Decimal value, boolean, or False if invalid. 
+ """ + x = convert_str_boolean(x) + + if isinstance(x, bool): + return x + + if isinstance(x, str): + x = x.strip() + x = x.replace(" ", "0") + + try: + x_dec = Decimal(str(x)) + decimal_places = max_decimal_places(offset, scale, x_dec) + result = offset + x_dec * scale + + if decimal_places == 0: + return result + + return result.quantize(Decimal("1." + "0" * decimal_places)) + + except (InvalidOperation, TypeError, ValueError): + return False + + +class Decoders: + """ + Registry-based decoder dispatcher for column-wise decoding. + + Currently supports Base36 decoding for numeric-like fields. + """ + + def __init__(self, dtype: str, encoding: str = "base36") -> None: + """ + Initialization. + + Parameters + ---------- + dtype : str + Target data type name (e.g. numeric field type) + encoding : str, default "base36" + Encoding scheme to use + """ + self.dtype = dtype + self.encoding = encoding + + self._registry = {"key": self.base36} + + for numeric_type in properties.numeric_types: + self._registry[numeric_type] = self.base36 + + def decoder(self) -> Callable[[pd.Series], pd.Series] | None: + """ + Return the decoder function for the configured dtype and encoding. + + Returns + ------- + callable or None + Decoder function accepting a pandas Series, or None if encoding + is unsupported. + + Raises + ------ + KeyError + If no decoder is registered for the given dtype. + """ + if self.encoding != "base36": + return None + + try: + return self._registry[self.dtype] + except KeyError as exc: + raise KeyError(f"No converter registered for '{self.dtype}'") from exc + + def base36(self, data: pd.Series) -> pd.Series: + """ + Decode a pandas Series from Base36 to stringified base-10 integers. + + Boolean values are preserved. + Invalid values raise ValueError via `int(..., 36)`. 
+ + Parameters + ---------- + data : pd.Series + Input Series containing base36-encoded values + + Returns + ------- + pd.Series + Decoded Series with stringified integers or booleans + """ + + def _base36(x): + x = convert_str_boolean(x) + if isinstance(x, bool): + return x + return str(int(str(x), 36)) + + return data.apply(_base36) + + +class Converters: + """ + Registry-based converter for pandas Series. + + Converts object-typed Series into numeric, datetime, or cleaned object + representations based on the configured dtype. + """ + + def __init__(self, dtype: str) -> None: + """ + Initialization. + + Parameters + ---------- + dtype : str + Target output dtype identifier + """ + self.dtype = dtype + self.numeric_scale = 1.0 if self.dtype == "float" else 1 + self.numeric_offset = 0.0 if self.dtype == "float" else 0 + + self.preprocessing_functions = { + "PPPP": lambda x: ( + str(10000 + int(x)) if isinstance(x, str) and x.startswith("0") else x + ) + } + + self._registry = { + "datetime": self.object_to_datetime, + "str": self.object_to_object, + "object": self.object_to_object, + "key": self.object_to_object, + } + + for numeric_type in properties.numeric_types: + self._registry[numeric_type] = self.object_to_numeric + + def converter(self) -> Callable[..., pd.Series]: + """ + Return the converter function registered for the configured dtype. + + Returns + ------- + callable + Converter function + + Raises + ------ + KeyError + If no converter is registered for the dtype + """ + try: + return self._registry[self.dtype] + except KeyError as exc: + raise KeyError(f"No converter registered for '{self.dtype}'") from exc + + def object_to_numeric( + self, + data: pd.Series, + scale: float | int | None = None, + offset: float | int | None = None, + ) -> pd.Series: + """ + Convert object Series to numeric using Decimal arithmetic. 
+ + - Right spaces are treated as zeros + - Optional scale and offset may be applied + - Boolean values are preserved + - Invalid conversions return False + + Parameters + ---------- + data : pd.Series + Object-typed Series + scale : numeric, optional + Scale factor + offset : numeric, optional + Offset value + + Returns + ------- + pd.Series + Converted Series + """ + if data.dtype != "object": + return data + + scale = scale if scale else self.numeric_scale + offset = offset if offset else self.numeric_offset + + scale = Decimal(str(scale)) + offset = Decimal(str(offset)) + + column_name = data.name + if column_name in self.preprocessing_functions: + data = data.apply(self.preprocessing_functions[column_name]) + + return data.apply(lambda x: to_numeric(x, scale, offset)) + + def object_to_object( + self, + data: pd.Series, + disable_white_strip: bool | str = False, + ) -> pd.Series: + """ + Clean object Series by stripping whitespace and nullifying empty strings. + + Parameters + ---------- + data : pd.Series + Object-typed Series + disable_white_strip : bool or {"l", "r"}, default False + Control whitespace stripping behavior + + Returns + ------- + pd.Series + Cleaned Series + """ + if data.dtype != "object": + return data + + if not disable_white_strip: + data = data.str.strip() + elif disable_white_strip == "l": + data = data.str.rstrip() + elif disable_white_strip == "r": + data = data.str.lstrip() + + return data.apply( + lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x + ) + + def object_to_datetime( + self, + data: pd.Series, + datetime_format: str = "%Y%m%d", + ) -> pd.Series: + """ + Convert object Series to pandas datetime. + + Invalid values are coerced to NaT. 
+
+        Parameters
+        ----------
+        data : pd.Series
+            Object-typed Series
+        datetime_format : str, default "%Y%m%d"
+            Datetime parsing format
+
+        Returns
+        -------
+        pd.Series
+            Datetime Series
+        """
+        if data.dtype != "object":
+            return data
+
+        return pd.to_datetime(data, format=datetime_format, errors="coerce")
+
+
+def convert_and_decode(
+    data: pd.DataFrame,
+    convert_flag: bool = True,
+    decode_flag: bool = True,
+    converter_dict: dict[str, Callable[[pd.Series], pd.Series]] | None = None,
+    converter_kwargs: dict[str, dict] | None = None,
+    decoder_dict: dict[str, Callable[[pd.Series], pd.Series]] | None = None,
+) -> pd.DataFrame:
+    """Convert and decode data entries by using a pre-defined data model.
+
+    Converted and decoded columns are assigned back into `data` in place.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Data to convert and decode.
+    convert_flag : bool, default True
+        If True, apply converters to the columns defined in `converter_dict`.
+    decode_flag : bool, default True
+        If True, apply decoders to the columns defined in `decoder_dict`.
+    converter_dict : dict[str, callable], optional
+        Column-specific converter functions. If None, defaults to empty dict.
+    converter_kwargs : dict[str, dict], optional
+        Keyword arguments for each converter function.
+    decoder_dict : dict[str, callable], optional
+        Column-specific decoder functions. If None, defaults to empty dict.
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with converted and decoded columns.
+ """ + converter_dict = converter_dict or {} + converter_kwargs = converter_kwargs or {} + decoder_dict = decoder_dict or {} + + if decode_flag: + for column, dec_func in decoder_dict.items(): + if column in data.columns: + decoded = dec_func(data[column]) + decoded.index = data[column].index + data[column] = decoded + + if convert_flag: + for column, conv_func in converter_dict.items(): + if column in data.columns: + kwargs = converter_kwargs.get(column, {}) + converted = conv_func(data[column], **kwargs) + converted.index = data[column].index + data[column] = converted + + return data diff --git a/cdm_reader_mapper/mdf_reader/utils/converters.py b/cdm_reader_mapper/mdf_reader/utils/converters.py deleted file mode 100755 index 398be5f6..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/converters.py +++ /dev/null @@ -1,126 +0,0 @@ -"""pandas converting operators.""" - -from __future__ import annotations - -from decimal import Decimal - -import pandas as pd - -from .. import properties -from .utilities import convert_str_boolean - - -def max_decimal_places(*decimals): - """Get maximum number of decimal places for each Decimal number.""" - decimal_places = [ - -d.as_tuple().exponent if d.as_tuple().exponent < 0 else 0 for d in decimals - ] - return max(decimal_places) - - -class df_converters: - """Class for converting pandas DataFrame.""" - - def __init__(self, dtype): - self.dtype = dtype - self.numeric_scale = 1.0 if self.dtype == "float" else 1 - self.numeric_offset = 0.0 if self.dtype == "float" else 0 - self.preprocessing_functions = { - "PPPP": lambda x: ( - str(10000 + int(x)) if isinstance(x, str) and x.startswith("0") else x - ) - } - - def to_numeric(self, data, offset, scale) -> pd.Series: - """Convert object type elements of a pandas series to numeric type.""" - - def _to_numeric(x): - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - if isinstance(x, str): - x = x.strip() - x.replace(" ", "0") - try: - x = Decimal(str(x)) - 
decimal_places = max_decimal_places(offset, scale, x) - result = offset + x * scale - return result.quantize(Decimal("1." + "0" * decimal_places)) - except ValueError: - return False - - offset = Decimal(str(offset)) - scale = Decimal(str(scale)) - - # Apply preprocessing if a function exists for this column - column_name = data.name - if column_name in self.preprocessing_functions: - data = data.apply(self.preprocessing_functions[column_name]) - - return data.apply(lambda x: _to_numeric(x)) - - def object_to_numeric(self, data, scale=None, offset=None) -> pd.Series: - """ - Convert the object type elements of a pandas series to numeric type. - - Right spaces are treated as zeros. Scale and offset can optionally be applied. - The final data type according to the class dtype. - - Parameters - ---------- - self : dtype, numeric_scale and numeric_offset - Pandas dataframe with a column per report sections. - The sections in the columns as a block strings. - data : pandas.Series - Series with data to convert. 
Data must be object type - - Keyword Arguments - ----------------- - scale : numeric, optional - Scale to apply after conversion to numeric - offset : numeric, optional - Offset to apply after conversion to numeric - column_name : str, optional - Name of the column being processed - - Returns - ------- - data : pandas.Series - Data series of type self.dtype - - """ - scale = scale if scale else self.numeric_scale - offset = offset if offset else self.numeric_offset - if data.dtype == "object": - data = self.to_numeric(data, offset, scale) - return data - - def object_to_object(self, data, disable_white_strip=False) -> pd.Series: - """DOCUMENTATION.""" - if data.dtype != "object": - return data - - if not disable_white_strip: - data = data.str.strip() - elif disable_white_strip == "l": - data = data.str.rstrip() - elif disable_white_strip == "r": - data = data.str.lstrip() - return data.apply( - lambda x: None if isinstance(x, str) and (x.isspace() or not x) else x - ) - - def object_to_datetime(self, data, datetime_format="%Y%m%d") -> pd.DateTimeIndex: - """DOCUMENTATION.""" - if data.dtype != "object": - return data - return pd.to_datetime(data, format=datetime_format, errors="coerce") - - -converters = dict() -for dtype in properties.numeric_types: - converters[dtype] = df_converters(dtype).object_to_numeric -converters["datetime"] = df_converters("datetime").object_to_datetime -converters["str"] = df_converters("str").object_to_object -converters["object"] = df_converters("object").object_to_object -converters["key"] = df_converters("key").object_to_object diff --git a/cdm_reader_mapper/mdf_reader/utils/decoders.py b/cdm_reader_mapper/mdf_reader/utils/decoders.py deleted file mode 100755 index 53b42205..00000000 --- a/cdm_reader_mapper/mdf_reader/utils/decoders.py +++ /dev/null @@ -1,33 +0,0 @@ -"""pandas decoding operators.""" - -from __future__ import annotations - -from .. 
import properties -from .utilities import convert_str_boolean - -import pandas as pd - - -class df_decoders: - """DOCUMENTATION.""" - - def __init__(self, dtype): - # Return as object, conversion to actual type in converters only! - self.dtype = "object" - - def base36(self, data) -> pd.Series: - """DOCUMENTATION.""" - - def _base36(x): - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - return str(int(str(x), 36)) - - return data.apply(lambda x: _base36(x)) - - -decoders = {"base36": {}} -for dtype in properties.numeric_types: - decoders["base36"][dtype] = df_decoders(dtype).base36 -decoders["base36"]["key"] = df_decoders("key").base36 diff --git a/cdm_reader_mapper/mdf_reader/utils/filereader.py b/cdm_reader_mapper/mdf_reader/utils/filereader.py index 3f62fe0f..9b556cf6 100755 --- a/cdm_reader_mapper/mdf_reader/utils/filereader.py +++ b/cdm_reader_mapper/mdf_reader/utils/filereader.py @@ -2,206 +2,395 @@ from __future__ import annotations -import csv import logging -import os -from copy import deepcopy -from io import StringIO + +from typing import Callable, Any, Sequence, Mapping import pandas as pd import xarray as xr +from dataclasses import replace +from pandas.io.parsers import TextFileReader + from .. 
import properties -from ..schemas import schemas -from .configurator import Configurator -from .utilities import validate_path +from .utilities import ( + process_textfilereader, + remove_boolean_values, +) + +from .convert_and_decode import convert_and_decode +from .validators import validate +from .parser import ( + update_xr_config, + update_pd_config, + parse_pandas, + parse_netcdf, + build_parser_config, + ParserConfig, +) + +from cdm_reader_mapper.core.databundle import DataBundle + + +def _apply_or_chunk( + data: pd.DataFrame | TextFileReader, + func: Callable[..., Any], + func_args: Sequence[Any] | None = None, + func_kwargs: Mapping[str, Any] | None = None, + **kwargs: Mapping[str, Any], +): + """Apply a function directly or chunk-wise depending on input type.""" + func_args = func_args or [] + func_kwargs = func_kwargs or {} + if not isinstance(data, TextFileReader): + return func(data, *func_args, **func_kwargs) + return process_textfilereader( + data, + func, + func_args, + func_kwargs, + **kwargs, + ) + + +def _merge_kwargs(*dicts: Mapping[str, Any]) -> dict[str, Any]: + """Merge multiple keyword-argument dictionaries.""" + merged = {} + for d in dicts: + for k in d: + if k in merged: + raise ValueError(f"Duplicate kwarg '{k}' in open_data()") + merged[k] = d[k] + return merged + + +def _apply_multiindex(df: pd.DataFrame) -> pd.DataFrame: + """Convert tuple-based columns to a pandas MultiIndex.""" + if not df.columns.map(lambda x: isinstance(x, tuple)).all(): + return df + + df.columns = pd.MultiIndex.from_tuples( + [col if isinstance(col, tuple) else (None, col) for col in df.columns], + ) + return df + + +def _select_years( + df: pd.DataFrame, + selection: tuple[int | None, int | None], + year_col, +) -> pd.DataFrame: + """Filter rows of a DataFrame by a year range.""" + year_init, year_end = selection + if year_init is None and year_end is None: + return df + + years = pd.to_numeric(df[year_col], errors="coerce") + + mask = pd.Series(True, 
index=df.index) + + if year_init is not None: + mask &= years >= year_init + + if year_end is not None: + mask &= years <= year_end + + mask &= years.notna() + + return df.loc[mask].reset_index(drop=True) class FileReader: - """Class to read marine-meteorological data.""" + """ + Class to read marine-meteorological data. + + Provides a high-level interface to read, parse, filter, convert, + decode, and validate data from multiple sources (FWF, CSV, NetCDF). + """ def __init__( self, - source, - imodel=None, - ext_schema_path=None, - ext_schema_file=None, - ext_table_path=None, - year_init=None, - year_end=None, + imodel: str | None = None, + ext_schema_path: str | None = None, + ext_schema_file: str | None = None, ): - # 0. VALIDATE INPUT - if not imodel and not ext_schema_path: - logging.error( - "A valid input data model name or path to data model must be provided" - ) - return - if not os.path.isfile(source): - logging.error(f"Can't find input data file {source}") - return - if not validate_path("ext_schema_path", ext_schema_path): - return - - self.source = source - self.imodel = imodel - self.year_init = year_init - self.year_end = year_end - self.ext_table_path = ext_table_path - - # 1. GET DATA MODEL - # Schema reader will return empty if cannot read schema or is not valid - # and will log the corresponding error - # multiple_reports_per_line error also while reading schema - logging.info("READING DATA MODEL SCHEMA FILE...") - if ext_schema_path or ext_schema_file: - self.schema = schemas.read_schema( - ext_schema_path=ext_schema_path, ext_schema_file=ext_schema_file - ) + """ + Initialize FileReader with a data model and parser configuration. + + Parameters + ---------- + imodel : str + Name of the data model (e.g., 'ICOADS'). + args, kwargs + Arguments passed to ``build_parser_config``. 
+ """ + self.imodel: str = imodel + self.config: ParserConfig = build_parser_config( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + def _process_data( + self, + data: pd.DataFrame | TextFileReader, + convert_flag: bool = False, + decode_flag: bool = False, + converter_dict: dict | None = None, + converter_kwargs: dict | None = None, + decoder_dict: dict | None = None, + validate_flag: bool = False, + ext_table_path: str | None = None, + sections: Sequence[str] | None = None, + excludes: Sequence[str] | None = None, + year_init: int | None = None, + year_end: int | None = None, + config: ParserConfig | None = None, + parse_mode: str = "pandas", + ) -> tuple[pd.DataFrame, pd.DataFrame, ParserConfig]: + """ + Core processing of raw data: parse, filter, convert, decode, validate. + + Parameters + ---------- + data : pandas.DataFrame or TextFileReader + Input data. + convert_flag : bool + Whether to apply converters. + decode_flag : bool + Whether to apply decoders. + converter_dict : dict, optional + Mapping of columns to converter functions. + converter_kwargs : dict, optional + Keyword arguments for converters. + decoder_dict : dict, optional + Mapping of columns to decoder functions. + validate_flag : bool + Whether to apply validation. + ext_table_path : str, optional + Path to external validation tables. + sections : sequence of str, optional + Sections to include. + excludes : sequence of str, optional + Sections to exclude. + year_init : int, optional + Initial year for filtering. + year_end : int, optional + End year for filtering. + config : ParserConfig, optional + Parser configuration. + parse_mode : str + Parsing backend ('pandas' or 'netcdf'). 
+ + Returns + ------- + tuple of (data, mask, config) + - data : pandas.DataFrame with parsed, filtered, converted data + - mask : pandas.DataFrame with boolean mask for validation + - config : ParserConfig updated with final columns + """ + config = config or self.config + + if parse_mode == "pandas": + data = parse_pandas(data, config.order_specs, sections, excludes) + elif parse_mode == "netcdf": + data = parse_netcdf(data, config.order_specs, sections, excludes) else: - self.schema = schemas.read_schema(imodel=imodel) - - def _adjust_schema(self, ds, dtypes) -> dict: - sections = deepcopy(self.schema["sections"]) - for section in sections.keys(): - elements = sections[section]["elements"] - for data_var in elements.keys(): - not_in_data_vars = data_var not in ds.data_vars - not_in_glb_attrs = data_var not in ds.attrs - not_in_data_dims = data_var not in ds.dims - if not_in_data_vars and not_in_glb_attrs and not_in_data_dims: - del self.schema["sections"][section]["elements"][data_var] - continue - for attr, value in elements[data_var].items(): - if value != "__from_file__": - continue - if attr in ds[data_var].attrs: - self.schema["sections"][section]["elements"][data_var][attr] = ( - ds[data_var].attrs[attr] - ) - else: - del self.schema["sections"][section]["elements"][data_var][attr] - - def _select_years(self, df) -> pd.DataFrame: - def get_years_from_datetime(date): - try: - return date.year - except AttributeError: - return date - - if self.year_init is None and self.year_end is None: - return df + raise ValueError("parse_mode must be 'pandas' or 'netcdf'") + + data = _apply_multiindex(data) data_model = self.imodel.split("_")[0] - dates = df[properties.year_column[data_model]] - years = dates.apply(lambda x: get_years_from_datetime(x)) - years = years.astype(int) - - mask = pd.Series([True] * len(years)) - if self.year_init: - mask[years < self.year_init] = False - if self.year_end: - mask[years > self.year_end] = False - - index = mask[mask].index - 
return df.iloc[index].reset_index(drop=True) - - def _read_pandas(self, **kwargs) -> pd.DataFrame | pd.io.parsers.TextFileReader: - if (enc := kwargs.get("encoding")) is not None: - logging.info(f"Reading with encoding = {enc}") - return pd.read_fwf( - self.source, - header=None, - quotechar="\0", - escapechar="\0", - dtype=object, - skip_blank_lines=False, - **kwargs, - ) + year_col = properties.year_column[data_model] - def _read_netcdf(self, **kwargs) -> xr.Dataset: - ds = xr.open_mfdataset(self.source, **kwargs) - self._adjust_schema(ds, ds.dtypes) - return ds.squeeze() + data = _select_years(data, (year_init, year_end), year_col) - def _read_sections( - self, - TextParser, - order, - valid, - open_with, - ) -> pd.DataFrame: - if open_with == "pandas": - df = Configurator( - df=TextParser, schema=self.schema, order=order, valid=valid - ).open_pandas() - elif open_with == "netcdf": - df = Configurator( - df=TextParser, schema=self.schema, order=order, valid=valid - ).open_netcdf() + converter_dict = converter_dict or config.convert_decode["converter_dict"] + converter_kwargs = converter_kwargs or config.convert_decode["converter_kwargs"] + decoder_dict = decoder_dict or config.convert_decode["decoder_dict"] + + data = convert_and_decode( + data, + convert_flag=convert_flag, + decode_flag=decode_flag, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + if validate_flag: + mask = validate( + data, + imodel=self.imodel, + ext_table_path=ext_table_path, + attributes=config.validation, + disables=config.disable_reads, + ) else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") + mask = pd.DataFrame(True, index=data.index, columns=data.columns) - self.columns = df.columns - return self._select_years(df) + data = remove_boolean_values(data, config.dtypes) + config = replace(config, columns=data.columns) - def get_configurations(self, order, valid) -> dict: - """DOCUMENTATION.""" - config_dict = 
Configurator( - schema=self.schema, order=order, valid=valid - ).get_configuration() - for attr, val in config_dict["self"].items(): - setattr(self, attr, val) - del config_dict["self"] - return config_dict + return data, mask, config def open_data( self, - order, - valid, - chunksize, - open_with="pandas", - encoding: str | None = None, - ) -> pd.DataFrame | pd.io.parsers.TextFileReader: - """DOCUMENTATION.""" - encoding = encoding or self.schema["header"].get("encoding") + source: str, + open_with: str = "pandas", + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, + convert_kwargs: dict | None = None, + decode_kwargs: dict | None = None, + validate_kwargs: dict | None = None, + select_kwargs: dict | None = None, + ) -> ( + tuple[pd.DataFrame, pd.DataFrame, ParserConfig] + | tuple[TextFileReader, TextFileReader, ParserConfig] + ): + """ + Open and parse source data according to parser configuration. + + Parameters + ---------- + source : str + Path or pattern for input file(s). + open_with : str + Parser backend: 'pandas' or 'netcdf'. + pd_kwargs: dict, optional + Additional key-word arguments for parsing pandas-readable data. + xr_kwargs: dict, optional + Additional key-word arguments for parsing xarray-readable data. + convert_kwargs: dict, optional + Additional key-word arguments for data conversion. + decode_kwargs: dict, optional + Additional key-word arguments for data decoding. + validate_kwargs: dict, optional + Additional key-word arguments for data validation. + select_kwargs : dict, optional + Additional key-word arguments for selecting/filtering data. + + Returns + ------- + tuple + (data, mask, config) or chunked equivalents if using TextFileReader. 
+ """ + pd_kwargs = dict(pd_kwargs or {}) + xr_kwargs = dict(xr_kwargs or {}) + convert_kwargs = convert_kwargs or {} + decode_kwargs = decode_kwargs or {} + validate_kwargs = validate_kwargs or {} + select_kwargs = select_kwargs or {} + + func_kwargs = _merge_kwargs( + convert_kwargs, + decode_kwargs, + validate_kwargs, + select_kwargs, + ) + func_kwargs["parse_mode"] = open_with + if open_with == "netcdf": - TextParser = self._read_netcdf() + to_parse = xr.open_mfdataset(source, **xr_kwargs).squeeze() + config = update_xr_config(to_parse, self.config) + write_kwargs, read_kwargs = {}, {} elif open_with == "pandas": - TextParser = self._read_pandas( - encoding=encoding, - widths=[properties.MAX_FULL_REPORT_WIDTH], - skiprows=self.skiprows, - chunksize=chunksize, + config = update_pd_config(pd_kwargs, self.config) + pd_kwargs["encoding"] = config.encoding + pd_kwargs.setdefault("widths", [properties.MAX_FULL_REPORT_WIDTH]) + pd_kwargs.setdefault("header", None) + pd_kwargs.setdefault("quotechar", "\0") + pd_kwargs.setdefault("escapechar", "\0") + pd_kwargs.setdefault("dtype", object) + pd_kwargs.setdefault("skip_blank_lines", False) + + write_kwargs = {"encoding": pd_kwargs["encoding"]} + chunksize = pd_kwargs.get("chunksize") + read_kwargs = ( + {"chunksize": chunksize, "dtype": config.dtypes}, + {"chunksize": chunksize, "dtype": "boolean"}, ) - else: - raise ValueError("open_with has to be one of ['pandas', 'netcdf']") - if isinstance(TextParser, pd.DataFrame) or isinstance(TextParser, xr.Dataset): - return self._read_sections(TextParser, order, valid, open_with=open_with) + to_parse = pd.read_fwf(source, **pd_kwargs) else: - data_buffer = StringIO() - for i, df_ in enumerate(TextParser): - df = self._read_sections(df_, order, valid, open_with=open_with) - df.to_csv( - data_buffer, - header=False, - mode="a", - encoding=encoding, - index=False, - quoting=csv.QUOTE_NONE, - sep=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - 
data_buffer.seek(0) - data = pd.read_csv( - data_buffer, - names=df.columns, - chunksize=self.chunksize, - dtype=object, - parse_dates=self.parse_dates, - delimiter=properties.internal_delimiter, - quotechar="\0", - escapechar="\0", - ) - return data + raise ValueError("open_with must be 'pandas' or 'netcdf'") + + func_kwargs["config"] = config + + return _apply_or_chunk( + to_parse, + self._process_data, + func_kwargs=func_kwargs, + makecopy=False, + write_kwargs=write_kwargs, + read_kwargs=read_kwargs, + ) + + def read( + self, + source: str, + pd_kwargs: dict | None = None, + xr_kwargs: dict | None = None, + convert_kwargs: dict | None = None, + decode_kwargs: dict | None = None, + validate_kwargs: dict | None = None, + select_kwargs: dict | None = None, + ) -> DataBundle: + """ + Read and process data from the given source. + + Parameters + ---------- + source : str + Path to input file(s). + pd_kwargs: dict, optional + Additional key-word arguments for parsing pandas-readable data. + xr_kwargs: dict, optional + Additional key-word arguments for parsing xarray-readable data. + convert_kwargs: dict, optional + Additional key-word arguments for data conversion. + decode_kwargs: dict, optional + Additional key-word arguments for data decoding. + validate_kwargs: dict, optional + Additional key-word arguments for data validation. + select_kwargs : dict, optional + Additional key-word arguments for selecting/filtering data. + + Notes + ----- + All kwargs are forwarded to ``open_data`` to customize the + parsing, conversion, decoding, validation, and selection steps. + + Returns + ------- + DataBundle + Container with processed data, mask, columns, dtypes, and metadata. 
+ """ + logging.info(f"EXTRACTING DATA FROM MODEL: {self.imodel}") + logging.info("Reading and parsing source data...") + + result = self.open_data( + source, + open_with=properties.open_file.get(self.imodel, "pandas"), + pd_kwargs=pd_kwargs, + xr_kwargs=xr_kwargs, + convert_kwargs=convert_kwargs, + decode_kwargs=decode_kwargs, + validate_kwargs=validate_kwargs, + select_kwargs=select_kwargs, + ) + + if not isinstance(result, tuple) or len(result) != 3: + raise RuntimeError("open_data() must return (data, mask, config)") + + data, mask, config = result + + return DataBundle( + data=data, + columns=config.columns, + dtypes=config.dtypes, + parse_dates=config.parse_dates, + encoding=config.encoding, + mask=mask, + imodel=self.imodel, + ) diff --git a/cdm_reader_mapper/mdf_reader/utils/parser.py b/cdm_reader_mapper/mdf_reader/utils/parser.py new file mode 100755 index 00000000..3ba2e9ae --- /dev/null +++ b/cdm_reader_mapper/mdf_reader/utils/parser.py @@ -0,0 +1,645 @@ +"""Auxiliary functions and class for reading, converting, decoding and validating MDF files.""" + +from __future__ import annotations + +import csv +import logging + +from dataclasses import dataclass, replace +from copy import deepcopy +from itertools import zip_longest +from typing import TypedDict, Any, Iterable + +import numpy as np +import pandas as pd +import xarray as xr + +from .. import properties +from ..schemas.schemas import read_schema, SchemaDict +from .utilities import convert_dtypes + +from .convert_and_decode import Converters, Decoders + + +class OrderSpec(TypedDict): + """ + Parsing specification for a single section. + + Defines the header configuration, element layout, and parsing mode + (fixed-width or delimited) for a section. + """ + + header: dict[str, Any] + elements: dict[str, dict[str, Any]] + is_delimited: bool + + +@dataclass(frozen=True) +class ParserConfig: + """ + Configuration for dataset parsing. 
+ + Parameters + ---------- + order_specs : dict + Column ordering specifications. + disable_reads : list[str] + Columns or sources to skip during parsing. + dtypes : dict + Column data type mappings. + parse_dates : list[str] + Columns to parse as datetimes. + convert_decode : dict + Value conversion or decoding rules. + validation : dict + Validation rules for parsed data. + encoding : str + Text encoding used when reading input data. + columns : pd.Index or pd.MultiIndex or None, optional + Explicit column index to apply. If None, inferred from input. + """ + + order_specs: OrderSpec + disable_reads: list[str] + dtypes: dict + parse_dates: list[str] + convert_decode: dict + validation: dict + encoding: str + columns: pd.Index | pd.MultiIndex | None = None + + +def _get_index(section: str, order: str, length: int) -> str | tuple[str, str]: + """Build an index key based on section count.""" + return section if length == 1 else (order, section) + + +def _get_ignore(section_dict: dict[str, Any]) -> bool: + """Determine whether a section should be ignored.""" + ignore = section_dict.get("ignore", False) + if isinstance(ignore, str): + ignore = ignore.lower() in {"true", "1", "yes"} + return bool(ignore) + + +def _convert_dtype_to_default(dtype: str | None) -> str | None: + """Normalize deprecated or aliased dtype strings.""" + if dtype is None: + return None + elif dtype == "float": + return dtype + elif dtype == "int": + return properties.pandas_int + elif "float" in dtype.lower(): + logging.warning(f"Set column type from deprecated {dtype} to float.") + return "float" + elif "int" in dtype.lower(): + logging.warning(f"Set column type from deprecated {dtype} to int.") + return properties.pandas_int + return dtype + + +def _parse_fixed_width( + line: str, + i: int, + header: dict[str, Any], + elements: dict[str, dict[str, Any]], + sections: set | None, + excludes: set, + out: dict[Any, Any], +) -> int: + """Parse a fixed-width section of a line into an output 
dictionary.""" + section_length = header.get("length", properties.MAX_FULL_REPORT_WIDTH) + delimiter = header.get("delimiter") + sentinel = header.get("sentinel") + + section_end = i + section_length + bad_sentinel = sentinel is not None and not line.startswith(sentinel, i) + line_len = len(line) + delim_len = len(delimiter) if delimiter else 0 + + for spec in elements.values(): + field_length = spec.get("field_length", 0) + index = spec.get("index") + ignore = spec.get("ignore", False) + missing_value = spec.get("missing_value") + + missing = True + j = i if bad_sentinel else i + field_length + if j > section_end: + missing = False + j = section_end + + if not ignore: + key = index[0] if isinstance(index, tuple) else index + if (sections is None or key in sections) and key not in excludes: + if i < j: + value = line[i:j] + if not value.strip() or value == missing_value: + value = True + else: + value = False if missing else True + + out[index] = value + + if ( + delimiter + and j + delim_len <= line_len + and line[j : j + delim_len] == delimiter + ): + j += delim_len + + i = j + + return i + + +def _parse_delimited( + line: str, + i: int, + header: dict[str, Any], + elements: dict[str, dict[str, Any]], + sections: set | None, + excludes: set, + out: dict[Any, Any], +) -> int: + """Parse a delimiter-separated section of a line into an output dictionary.""" + delimiter = header["delimiter"] + fields = next(csv.reader([line[i:]], delimiter=delimiter)) + + for element, value in zip_longest(elements.keys(), fields): + index = elements[element]["index"] + key = index[0] if isinstance(index, tuple) else index + + if (sections is None or key in sections) and key not in excludes: + out[index] = value.strip() if value is not None else None + + return len(line) + + +def _parse_line( + line: str, + order_specs: dict[str, OrderSpec], + sections: set | None, + excludes: set, +) -> dict[str, dict[Any, Any]]: + """Parse a line using the provided parser configuration.""" + i = 0 + 
out = {} + max_width = properties.MAX_FULL_REPORT_WIDTH + + for order, spec in order_specs.items(): + header = spec["header"] + elements = spec["elements"] + + if header.get("disable_read"): + if order not in excludes: + out[order] = line[i : i + max_width] + i += header.get("length", max_width) + continue + + if spec["is_delimited"]: + i = _parse_delimited(line, i, header, elements, sections, excludes, out) + else: + i = _parse_fixed_width(line, i, header, elements, sections, excludes, out) + + return out + + +def parse_pandas( + df: pd.DataFrame, + order_specs: dict[str, OrderSpec], + sections: Iterable[str] | None = None, + excludes: Iterable[str] | None = None, +) -> pd.DataFrame: + """ + Parse a pandas DataFrame containing raw record lines. + + Each row of the input DataFrame is expected to contain a single + fixed-width or delimiter-separated record, which is parsed according + to the provided order specifications. + + Parameters + ---------- + df : pandas.DataFrame + Input DataFrame with exactly one column (column index ``0``), + where each row contains a raw record string. + order_specs : dict[str, OrderSpec] + Mapping of section names to parsing specifications. Each specification + defines the header configuration, element layout, and parsing mode + for a section. + sections : iterable of str or None + Section names to include. If None, all sections are parsed. + excludes : iterable of str or None + Section names to exclude from parsing. + + Returns + ------- + pandas.DataFrame + DataFrame constructed from parsed records. Columns are derived + from element indices and may be strings or tuples. 
+ + Examples + -------- + Example ``order_specs`` structure:: + + order_specs = { + "core": { + "header": { + "sentinel": None, + "length": 108, + }, + "elements": { + "YR": { + "index": ("core", "YR"), + "field_length": 4, + "ignore": False, + "column_type": "Int64", + "missing_value": None, + }, + "MO": { + "index": ("core", "MO"), + "field_length": 2, + "ignore": False, + "column_type": "Int64", + "missing_value": None, + }, + }, + "is_delimited": False, + } + } + + Notes + ----- + - Ignored elements (``ignore=True``) are skipped. + - Disabled sections (``disable_read=True``) are included as raw strings in the output. + - Missing elements are filled with ``False``. + - Object-type columns are stripped, decoded from UTF-8 if necessary, and empty + strings are replaced with ``True``. + - No type conversion is performed at this stage. + """ + col = df.columns[0] + + sections = set(sections) if sections is not None else None + excludes = set(excludes) if excludes else set() + + records = df[col].map( + lambda line: _parse_line(line, order_specs, sections, excludes) + ) + return pd.DataFrame.from_records(records.to_list()) + + +def parse_netcdf( + ds: xr.Dataset, + order_specs: dict[str, OrderSpec], + sections: Iterable[str] | None = None, + excludes: Iterable[str] | None = None, +) -> pd.DataFrame: + """ + Parse an xarray Dataset into a pandas DataFrame based on order specifications. + + This function converts an xarray Dataset into a tabular pandas DataFrame + according to parsing rules defined in `order_specs`. Data variables, dimensions, + and global attributes are mapped to columns as specified, with ignored or missing + elements handled automatically. + + Parameters + ---------- + ds : xarray.Dataset + Input Dataset containing data variables, dimensions, and attributes. + order_specs : dict[str, OrderSpec] + Mapping of section names to parsing specifications. 
Each specification + defines the header configuration, element layout, and parsing mode + for a section. + sections : iterable of str or None + Section names to include. If None, all sections are parsed. + excludes : iterable of str or None + Section names to exclude from parsing. + + Returns + ------- + pandas.DataFrame + DataFrame constructed from the Dataset according to the parsing specification. + Columns are derived from element indices. Missing fields are filled with + False, disabled sections with NaN, and empty strings are converted to True. + + Examples + -------- + Example ``order_specs`` structure:: + + order_specs = { + "global_attributes": { + "header": { + "disable_read": True, + }, + "elements": { + "title": { + "index": ("global_attributes", "title"), + "ignore": False, + "column_type": "str", + "missing_value": None, + }, + "institution": { + "index": ("global_attributes", "institution"), + "ignore": False, + "column_type": "str", + "missing_value": None, + }, + }, + "is_delimited": False, + } + } + + Notes + ----- + - Variables, dimensions, and global attributes in `ds` are mapped to columns + according to the element `index`. + - Ignored elements (`ignore=True`) are skipped. + - Disabled sections (`disable_read=True`) are added as columns filled with NaN. + - Missing elements are added as columns filled with False. + - Object-type columns are decoded from UTF-8, stripped, and empty strings + replaced with True. 
+    """
+    sections = set(sections) if sections is not None else None
+    excludes = set(excludes) if excludes else set()
+
+    missing_values = []
+    attrs = {}
+    renames = {}
+    disables = []
+
+    data_vars = ds.data_vars
+    dims = ds.dims
+    ds_attrs = ds.attrs
+
+    for order, ospec in order_specs.items():
+        if sections is not None and order not in sections:
+            continue
+        if order in excludes:
+            continue
+
+        header = ospec.get("header", {})
+        if header.get("disable_read") is True:
+            disables.append(order)
+            continue
+
+        for element, espec in ospec.get("elements", {}).items():
+            if espec.get("ignore"):
+                continue
+
+            index = espec["index"]
+
+            if element in data_vars or element in dims:
+                renames[element] = index
+            elif element in ds_attrs:
+                attrs[index] = ds_attrs[element]
+            else:
+                missing_values.append(index)
+
+    df = ds[list(renames)].to_dataframe().reset_index()
+    df = df[list(renames)].rename(columns=renames)
+
+    if attrs:
+        df = df.assign(**{k: v.replace("\n", "; ") for k, v in attrs.items()})
+
+    if disables:
+        df[disables] = np.nan
+
+    obj_cols = df.select_dtypes(include="object").columns
+    for col in obj_cols:
+        # Decode byte strings, strip whitespace, and map empty
+        # strings to True (i.e. treat them as missing markers).
+        s = df[col].str.decode("utf-8").str.strip()
+        df[col] = s.map(lambda x: True if x == "" else x)
+
+    if missing_values:
+        df[missing_values] = False
+
+    return df
+
+
+def build_parser_config(
+    imodel: str | None = None,
+    ext_schema_path: str | None = None,
+    ext_schema_file: str | None = None,
+) -> ParserConfig:
+    """
+    Build a ParserConfig from a normalized schema definition.
+
+    This function reads a schema definition and constructs a fully populated
+    :py:class:`ParserConfig` instance. The resulting configuration contains
+    parsing order specifications, data types, converters, decoders, validation
+    rules, and encoding information required to parse raw input records.
+
+    Parameters
+    ----------
+    imodel : str or None, optional
+        Internal model identifier used to locate the schema.
+ ext_schema_path : str or None, optional + Path to an external schema directory. + ext_schema_file : str or None, optional + Filename of an external schema definition. + + Returns + ------- + ParserConfig + Fully initialized parser configuration derived from the schema. + + Notes + ----- + - Section parsing order is derived from ``schema["header"]["parsing_order"]``. + - Sections marked with ``disable_read=True`` are recorded in + ``ParserConfig.disable_reads``. + - Elements marked as ignored or disabled are excluded from dtype, + conversion, and validation setup. + - Column indices may be strings or tuples depending on the number of + sections in the schema. + - Deprecated or aliased column types are normalized via + ``_convert_dtype_to_default``. + - Converter and decoder functions are resolved dynamically based on + column type and encoding. + - Validation rules may include value ranges and code tables, as defined + in the schema. + """ + schema: SchemaDict = read_schema( + imodel=imodel, + ext_schema_path=ext_schema_path, + ext_schema_file=ext_schema_file, + ) + + orders = [ + order + for group in schema["header"]["parsing_order"] + for section_list in group.values() + for order in section_list + ] + olength = len(orders) + + dtypes: dict[Any, Any] = {} + validation: dict[Any, dict[str, Any]] = {} + order_specs: dict[str, OrderSpec] = {} + disable_reads: list[str] = [] + converters: dict[Any, Any] = {} + converter_kwargs: dict[Any, dict[str, Any]] = {} + decoders: dict[Any, Any] = {} + + for order in orders: + section = schema["sections"][order] + header = section["header"] + elements = section.get("elements", {}) + + if header.get("disable_read"): + disable_reads.append(order) + + element_specs: dict[str, dict[str, Any]] = {} + for name, meta in elements.items(): + index = _get_index(name, order, olength) + ignore = _get_ignore(meta) + + element_specs[name] = { + "index": index, + "ignore": ignore, + "missing_value": meta.get("missing_value"), + 
"field_length": meta.get( + "field_length", properties.MAX_FULL_REPORT_WIDTH + ), + } + + if ignore or meta.get("disable_read", False): + continue + + ctype = _convert_dtype_to_default(meta.get("column_type")) + dtype = properties.pandas_dtypes.get(ctype) + if dtype is not None: + dtypes[index] = dtype + + conv_func = Converters(ctype).converter() + if conv_func: + converters[index] = conv_func + + conv_args = { + k: meta.get(k) + for k in properties.data_type_conversion_args.get(ctype, []) + } + if conv_args: + converter_kwargs[index] = conv_args + + encoding = meta.get("encoding") + if encoding: + dec_func = Decoders(ctype, encoding).decoder() + if dec_func: + decoders[index] = dec_func + + validation[index] = {} + if ctype: + validation[index]["column_type"] = ctype + for k in ("valid_min", "valid_max", "codetable"): + if meta.get(k) is not None: + validation[index][k] = meta[k] + + order_specs[order] = OrderSpec( + header=header, + elements=element_specs, + is_delimited=header.get("format") == "delimited", + ) + + dtypes, parse_dates = convert_dtypes(dtypes) + + return ParserConfig( + order_specs=order_specs, + disable_reads=disable_reads, + dtypes=dtypes, + parse_dates=parse_dates, + convert_decode={ + "converter_dict": converters, + "converter_kwargs": converter_kwargs, + "decoder_dict": decoders, + }, + validation=validation, + encoding=schema["header"].get("encoding", "utf-8"), + ) + + +def update_xr_config(ds: xr.Dataset, config: ParserConfig) -> ParserConfig: + """ + Update a ParserConfig instance using metadata from an xarray Dataset. + + This function adjusts the parser configuration based on the contents of + the provided Dataset. Elements not present in the Dataset are marked as + ignored, and validation rules marked as ``"__from_file__"`` are populated + from Dataset variable attributes when available. + + Parameters + ---------- + ds : xarray.Dataset + Input Dataset containing data variables, dimensions, and attributes. 
+ config : ParserConfig + Existing parser configuration. + + Returns + ------- + ParserConfig + Updated parser configuration with modified order specifications and + validation rules derived from the Dataset. + """ + new_order_specs = deepcopy(config.order_specs) + new_validation = deepcopy(config.validation) + + for order, ospecs in new_order_specs.items(): + elements = ospecs["elements"] + + for element, especs in elements.items(): + if ( + element not in ds.data_vars + and element not in ds.attrs + and element not in ds.dims + ): + especs["ignore"] = True + continue + + index = especs.get("index") + if index not in new_validation: + continue + + for attr in list(new_validation[index].keys()): + if new_validation[index][attr] != "__from_file__": + continue + + ds_attrs = ds[element].attrs + if attr in ds_attrs: + new_validation[index][attr] = ds_attrs[attr] + else: + new_validation[index].pop(attr, None) + + return replace( + config, + order_specs=new_order_specs, + validation=new_validation, + ) + + +def update_pd_config(pd_kwargs: dict[str, Any], config: ParserConfig) -> ParserConfig: + """ + Update a ParserConfig instance using pandas keyword arguments. + + Currently, only the ``encoding`` option is supported. If an encoding + is provided in ``pd_kwargs``, a new ParserConfig instance is returned + with the updated encoding. Otherwise, the original configuration is + returned unchanged. + + Parameters + ---------- + pd_kwargs : dict[str, Any] + Keyword arguments intended for pandas I/O functions. + config : ParserConfig + Existing parser configuration. + + Returns + ------- + ParserConfig + Updated parser configuration if applicable, otherwise the original + configuration. 
+ """ + if "encoding" in pd_kwargs and pd_kwargs["encoding"]: + return replace(config, encoding=pd_kwargs["encoding"]) + return config diff --git a/cdm_reader_mapper/mdf_reader/utils/utilities.py b/cdm_reader_mapper/mdf_reader/utils/utilities.py index 67f4930b..5b47ef2c 100755 --- a/cdm_reader_mapper/mdf_reader/utils/utilities.py +++ b/cdm_reader_mapper/mdf_reader/utils/utilities.py @@ -2,12 +2,229 @@ from __future__ import annotations +import ast +import csv import logging import os +from io import StringIO +from pathlib import Path +from typing import Any, Iterable, Callable + +import pandas as pd + +from .. import properties + +from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy + + +def as_list(x: str | Iterable[Any] | None) -> list[Any] | None: + """ + Ensure the input is a list; keep None as None. + + Parameters + ---------- + x : str, iterable, or None + Input value to convert. Strings become single-element lists. + Other iterables are converted to a list preserving iteration order. + If None is passed, None is returned. + + Returns + ------- + list or None + Converted list or None if input was None. + + Notes + ----- + Sets are inherently unordered; the resulting list may not have a predictable order. + """ + if x is None: + return None + if isinstance(x, str): + return [x] + return list(x) + + +def as_path(value: str | os.PathLike, name: str) -> Path: + """ + Ensure the input is a Path-like object. + + Parameters + ---------- + value : str or os.PathLike + The value to convert to a Path. + name : str + Name of the parameter, used in error messages. + + Returns + ------- + pathlib.Path + Path object representing `value`. + + Raises + ------ + TypeError + If `value` is not a string or Path-like object. + """ + if isinstance(value, (str, os.PathLike)): + return Path(value) + raise TypeError(f"{name} must be str or Path-like") + + +def join(col: Any | Iterable[Any]) -> str: + """ + Join multi-level columns as a colon-separated string. 
+ + Parameters + ---------- + col : any or iterable of any + A column name, which may be a single value or a list/tuple of values. + + Returns + ------- + str + Colon-separated string if input is iterable, or string of the single value. + """ + if isinstance(col, (list, tuple)): + return ":".join(str(c) for c in col) + return str(col) + + +def update_dtypes(dtypes: dict[str, Any], columns: Iterable[str]) -> dict[str, Any]: + """ + Filter dtypes dictionary to only include columns present in 'columns'. + + Parameters + ---------- + dtypes : dict + Dictionary mapping column names to their data types. + columns : iterable of str + List of columns to keep. + + Returns + ------- + dict + Filtered dictionary containing only keys present in 'columns'. + """ + if isinstance(dtypes, dict): + dtypes = {k: v for k, v in dtypes.items() if k in columns} + return dtypes + + +def update_column_names( + dtypes: dict[str, Any] | str, col_o: str, col_n: str +) -> dict[str, Any] | str: + """ + Rename a column in a dtypes dictionary if it exists. + + Parameters + ---------- + dtypes : dict or str + Dictionary mapping column names to data types, or a string. + col_o : str + Original column name to rename. + col_n : str + New column name. + + Returns + ------- + dict or str + Updated dictionary with column renamed, or string unchanged. + """ + if isinstance(dtypes, str): + return dtypes + if col_o != col_n and col_o in dtypes.keys(): + dtypes[col_n] = dtypes[col_o] + del dtypes[col_o] + return dtypes + + +def update_column_labels(columns: Iterable[str | tuple]) -> pd.Index | pd.MultiIndex: + """ + Convert string column labels to tuples if needed, producing a pandas Index or MultiIndex. + + This function attempts to parse each column label: + - If the label is a string representation of a tuple (e.g., "('A','B')"), it will be converted to a tuple. + - If the label is a string containing a colon (e.g., "A:B"), it will be split into a tuple ("A", "B"). 
+ - Otherwise, the label is left unchanged. + + If all resulting labels are tuples, a pandas MultiIndex is returned. + Otherwise, a regular pandas Index is returned. + + Parameters + ---------- + columns : iterable of str or tuple + Column labels to convert. + + Returns + ------- + pd.Index or pd.MultiIndex + Converted column labels as a pandas Index or MultiIndex. + """ + new_cols = [] + all_tuples = True + + for col in columns: + try: + col_ = ast.literal_eval(col) + except Exception: + if isinstance(col, str) and ":" in col: + col_ = tuple(col.split(":")) + else: + col_ = col + all_tuples &= isinstance(col_, tuple) + new_cols.append(col_) + + if all_tuples: + return pd.MultiIndex.from_tuples(new_cols) + return pd.Index(new_cols) + + +def read_csv(filepath, col_subset=None, **kwargs) -> pd.DataFrame: + """ + Safe CSV reader that handles missing files and column subsets. + + Parameters + ---------- + filepath : str or Path or None + Path to the CSV file. + col_subset : list of str, optional + Subset of columns to read from the CSV. + kwargs : any + Additional keyword arguments passed to pandas.read_csv. + + Returns + ------- + pd.DataFrame + The CSV as a DataFrame. Empty if file does not exist. + """ + if filepath is None or not Path(filepath).is_file(): + logging.warning(f"File not found: {filepath}") + return pd.DataFrame() + + df = pd.read_csv(filepath, delimiter=",", **kwargs) + df.columns = update_column_labels(df.columns) + if col_subset is not None: + df = df[col_subset] + + return df + def convert_dtypes(dtypes) -> tuple[str]: - """Convert datetime to object.""" + """ + Convert datetime columns to object dtype and return columns to parse as dates. + + Parameters + ---------- + dtypes : dict[str, str] + Dictionary mapping column names to pandas dtypes. + + Returns + ------- + tuple + - Updated dtypes dictionary (datetime converted to object). + - List of columns originally marked as datetime. 
+ """ parse_dates = [] for key, value in dtypes.items(): if value == "datetime": @@ -17,60 +234,57 @@ def convert_dtypes(dtypes) -> tuple[str]: def validate_arg(arg_name, arg_value, arg_type) -> bool: - """Validate input argument is as expected type. + """ + Validate that the input argument is of the expected type. Parameters ---------- arg_name : str - Name of the argument - arg_value : arg_type - Value of the argument + Name of the argument. + arg_value : Any + Value of the argument. arg_type : type - Type of the argument + Expected type of the argument. Returns ------- - boolean: - Returns True if type of `arg_value` equals `arg_type` + bool + True if `arg_value` is of type `arg_type` or None. + + Raises + ------ + ValueError + If `arg_value` is not of type `arg_type` and not None. """ if arg_value and not isinstance(arg_value, arg_type): - logging.error( - f"Argument {arg_name} must be {arg_type}, input type is {type(arg_value)}" + raise ValueError( + f"Argument {arg_name} must be {arg_type} or None, not {type(arg_value)}" ) - return False - return True - - -def validate_path(arg_name, arg_value) -> bool: - """Validate input argument is an existing directory. - - Parameters - ---------- - arg_name : str - Name of the argument - arg_value : str - Value of the argument - Returns - ------- - boolean - Returns True if `arg_name` is an existing directory. - """ - if arg_value and not os.path.isdir(arg_value): - logging.error(f"{arg_name} could not find path {arg_value}") - return False return True -def adjust_dtype(dtype, df) -> dict: - """Adjust dtypes to DataFrame.""" +def _adjust_dtype(dtype, df) -> dict: + """Filter dtype dictionary to only include columns present in the DataFrame.""" if not isinstance(dtype, dict): return dtype return {k: v for k, v in dtype.items() if k in df.columns} def convert_str_boolean(x) -> str | bool: - """Convert str boolean value to boolean value.""" + """ + Convert string boolean values 'True'/'False' to Python booleans. 
+ + Parameters + ---------- + x : Any + Input value. + + Returns + ------- + bool or original value + True if 'True', False if 'False', else original value. + """ if x == "True": x = True if x == "False": @@ -78,11 +292,135 @@ def convert_str_boolean(x) -> str | bool: return x -def remove_boolean_values(x) -> str | None: - """Remove boolean values.""" +def _remove_boolean_values(x) -> str | None: + """Remove boolean values or string representations of boolean.""" x = convert_str_boolean(x) - if x is True: - return - if x is False: - return + if x is True or x is False: + return None return x + + +def remove_boolean_values(data, dtypes) -> pd.DataFrame: + """ + Remove boolean values from a DataFrame and adjust dtypes. + + Parameters + ---------- + data : pd.DataFrame + Input data. + dtypes : dict + Dictionary mapping column names to desired dtypes. + + Returns + ------- + pd.DataFrame + DataFrame with booleans removed and dtype adjusted. + """ + data = data.map(_remove_boolean_values) + dtype = _adjust_dtype(dtypes, data) + return data.astype(dtype) + + +def process_textfilereader( + reader: Iterable[pd.DataFrame], + func: Callable, + func_args: tuple = (), + func_kwargs: dict[str, Any] | None = None, + read_kwargs: dict[str, Any] | tuple[dict[str, Any], ...] | None = None, + write_kwargs: dict[str, Any] | None = None, + makecopy: bool = True, +) -> tuple[pd.DataFrame, ...]: + """ + Process a stream of DataFrames using a function and return processed results. + + Each DataFrame from `reader` is passed to `func`, which can return one or more + DataFrames or other outputs. DataFrame outputs are concatenated in memory and + returned as a tuple along with any additional non-DataFrame outputs. + + Parameters + ---------- + reader : Iterable[pd.DataFrame] + An iterable of DataFrames (e.g., a CSV reader returning chunks). + func : Callable + Function to apply to each DataFrame. + func_args : tuple, optional + Positional arguments passed to `func`. 
+ func_kwargs : dict, optional + Keyword arguments passed to `func`. + read_kwargs : dict or tuple of dict, optional + Arguments to pass to `pd.read_csv` when reconstructing output DataFrames. + write_kwargs : dict, optional + Arguments to pass to `DataFrame.to_csv` when buffering output. + makecopy : bool, default True + If True, makes a copy of each input DataFrame before processing. + + Returns + ------- + tuple + A tuple containing: + - One or more processed DataFrames (in the same order as returned by `func`) + - Any additional outputs from `func` that are not DataFrames + """ + if func_kwargs is None: + func_kwargs = {} + if read_kwargs is None: + read_kwargs = {} + if write_kwargs is None: + write_kwargs = {} + + buffers = [] + columns = [] + + if makecopy is True: + reader = make_copy(reader) + + output_add = [] + + for df in reader: + outputs = func(df, *func_args, **func_kwargs) + if not isinstance(outputs, tuple): + outputs = (outputs,) + + output_dfs = [] + first_chunk = not buffers + + for out in outputs: + if isinstance(out, pd.DataFrame): + output_dfs.append(out) + elif first_chunk: + output_add.append(out) + + if not buffers: + buffers = [StringIO() for _ in output_dfs] + columns = [out.columns for out in output_dfs] + + for buffer, out_df in zip(buffers, output_dfs): + out_df.to_csv( + buffer, + header=False, + mode="a", + index=False, + quoting=csv.QUOTE_NONE, + sep=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + **write_kwargs, + ) + + if isinstance(read_kwargs, dict): + read_kwargs = tuple(read_kwargs for _ in range(len(buffers))) + + result_dfs = [] + for buffer, cols, rk in zip(buffers, columns, read_kwargs): + buffer.seek(0) + result_dfs.append( + pd.read_csv( + buffer, + names=cols, + delimiter=properties.internal_delimiter, + quotechar="\0", + escapechar="\0", + **rk, + ) + ) + return tuple(result_dfs + output_add) diff --git a/cdm_reader_mapper/mdf_reader/utils/validators.py 
b/cdm_reader_mapper/mdf_reader/utils/validators.py index 8a4d2738..d4d84057 100755 --- a/cdm_reader_mapper/mdf_reader/utils/validators.py +++ b/cdm_reader_mapper/mdf_reader/utils/validators.py @@ -1,233 +1,222 @@ -"""Validate entries.""" +"""Data validation module.""" from __future__ import annotations import logging - import numpy as np import pandas as pd +from typing import Any, Iterable + from .. import properties from ..codes import codes -from ..schemas import schemas from .utilities import convert_str_boolean -def validate_datetime(elements, data) -> pd.DataFrame: - """DOCUMENTATION.""" - - def is_date_object(object): - if hasattr(object, "year"): - return True - - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - mask[elements] = ( - data[elements].apply(np.vectorize(is_date_object)) | data[elements].isna() - ) - return mask - - -def validate_numeric(elements, data, schema) -> pd.DataFrame: - """DOCUMENTATION.""" - - # Find thresholds in schema. Flag if not available -> warn - def _to_numeric(x): - if x is None: - return np.nan - x = convert_str_boolean(x) - if isinstance(x, bool): - return x - try: - return float(x) - except ValueError: - return False - - data[elements] = data[elements].map(_to_numeric) - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - lower = {x: schema.get(x).get("valid_min", -np.inf) for x in elements} - upper = {x: schema.get(x).get("valid_max", np.inf) for x in elements} - - set_elements = [ - x for x in lower.keys() if lower.get(x) != -np.inf and upper.get(x) != np.inf - ] - - if len([x for x in elements if x not in set_elements]) > 0: - logging.warning( - "Data numeric elements with missing upper or lower threshold: {}".format( - ",".join([str(x) for x in elements if x not in set_elements]) - ) - ) - logging.warning( - "Corresponding upper and/or lower bounds set to +/-inf for validation" - ) - mask[elements] = ( - (data[elements] >= [lower.get(x) for x in elements]) - & (data[elements] <= 
[upper.get(x) for x in elements]) - ) | data[elements].isna() - return mask - - -def validate_str(elements, data) -> pd.DataFrame: - """DOCUMENTATION.""" - return pd.DataFrame(index=data.index, data=True, columns=elements) - - -def validate_codes(elements, data, schema, imodel, ext_table_path) -> pd.DataFrame: - """DOCUMENTATION.""" - mask = pd.DataFrame(index=data.index, data=False, columns=elements) - for element in elements: - code_table_name = schema.get(element).get("codetable") - if not code_table_name: - logging.error(f"Code table not defined for element {element}") - logging.warning("Element mask set to False") - continue +def _is_false(x: Any) -> bool: + """Check if a value is exactly False.""" + return x is False - table = codes.read_table( - code_table_name, - imodel=imodel, - ext_table_path=ext_table_path, - ) - if not table: - continue - dtype = properties.pandas_dtypes.get(schema.get(element).get("column_type")) +def _is_true(x: Any) -> bool: + """Check if a value is exactly True.""" + return x is True + + +def validate_datetime(series: pd.Series) -> pd.Series: + """ + Validate that entries in a pandas Series can be converted to datetime. + + Missing values are treated as valid. + + Parameters + ---------- + series : pd.Series + Series of object values to validate + + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + dates = pd.to_datetime(series, errors="coerce") + return dates.notna() | series.isna() + + +def validate_numeric( + series: pd.Series, valid_min: float, valid_max: float +) -> pd.Series: + """ + Validate that entries in a pandas Series are numeric and within a range. + + - Converts boolean-like strings to bools. + - Invalid values are marked as False; missing values (NaN) are treated as valid.
+ + Parameters + ---------- + series : pd.Series + Series of object values to validate + valid_min : float + Minimum valid value + valid_max : float + Maximum valid value + + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + converted = series.apply(convert_str_boolean) + numeric = pd.to_numeric(converted, errors="coerce") + valid_range = numeric.between(valid_min, valid_max) + return valid_range | series.isna() + - table_keys = list(table.keys()) - validation_df = data[element] - value = validation_df.astype(dtype).astype("str") - valid = validation_df.notna() - mask_ = value.isin(table_keys) - mask[element] = mask_.where(valid, True) | validation_df.isna() +def validate_str(series: pd.Series) -> pd.Series: + """ + Validate that entries in a pandas Series are strings. - return mask + Currently all values are treated as valid. + Parameters + ---------- + series : pd.Series + Series of object values to validate + + Returns + ------- + pd.Series + Boolean Series with all True + """ + return pd.Series(True, index=series.index, dtype="boolean") -def _get_elements(elements, element_atts, key) -> list[str]: - def _condition(x): - column_types = element_atts.get(x).get("column_type") - if key == "numeric_types": - return column_types in properties.numeric_types - return column_types == key - return [x for x in elements if _condition(x)] +def validate_codes( + series: pd.Series, code_table: Iterable[Any], column_type: str +) -> pd.Series: + """ + Validate that entries in a pandas Series exist in a provided code table. + Missing values are treated as valid. 
-def _element_tuples(numeric_elements, datetime_elements, coded_elements) -> bool: - ele_tpl = [ - isinstance(x, tuple) - for x in numeric_elements + datetime_elements + coded_elements - ] - return any(ele_tpl) + Parameters + ---------- + series : pd.Series + Series of object values to validate + code_table : Iterable + Allowed codes for validation + column_type : str + Column type for dtype lookup (via properties.pandas_dtypes) + Returns + ------- + pd.Series + Boolean Series indicating valid entries + """ + if not code_table: + logging.error(f"Code table not found for element {series.name}") + return pd.Series(False, index=series.index) -def _mask_boolean(x, boolean) -> bool: - x = convert_str_boolean(x) - if x is boolean: - return True - return False + keys = set(code_table) + dtype = properties.pandas_dtypes.get(column_type, object) + converted = series.astype(dtype) + as_str = converted.astype(str) + return converted.isna() | as_str.isin(keys) def validate( - data, - imodel, - ext_table_path, - schema, - disables=None, + data: pd.DataFrame, + imodel: str, + ext_table_path: str, + attributes: dict[str, dict[str, Any]], + disables: list[str] | None = None, ) -> pd.DataFrame: - """Validate data. + """ + Validate a pandas DataFrame according to a data model and code tables. + + Each column is validated based on its `column_type` attribute. Supports: + - Numeric types: checked against valid_min and valid_max + - Keys: checked against a code table + - Datetime and string: validated using simple validators + - Explicit boolean literals ("True"/"False") override column validation Parameters ---------- - data: pd.DataFrame - DataFrame for validation. - imodel: str - Name of internally available input data model. - e.g. icoads_r300_d704 - ext_table_path: str - Path to the code tables for an external data model - schema: dict - Data model schema. - disables: list, optional - List of column names to be ignored. + data : pd.DataFrame + Input data to validate. 
+ imodel : str + Name of the internal data model, e.g., 'icoads_r300_d704'. + ext_table_path : str + Path to external code tables for validation. + attributes : dict[str, dict] + Dictionary of column attributes (e.g., type, valid ranges, codetable). + disables : list[str], optional + Columns to skip during validation. Returns ------- pd.DataFrame - Validated boolean mask. + Boolean mask of the same shape as `data`. True indicates a valid entry. """ - logging.basicConfig( - format="%(levelname)s\t[%(asctime)s](%(filename)s)\t%(message)s", - level=logging.INFO, - datefmt="%Y%m%d %H:%M:%S", - filename=None, - ) - # Check input if not isinstance(data, pd.DataFrame): logging.error("input data must be a pandas DataFrame.") - return + return None - mask = pd.DataFrame(index=data.index, columns=data.columns, dtype="boolean") + mask = pd.DataFrame(pd.NA, index=data.index, columns=data.columns, dtype="boolean") if data.empty: return mask - # Get the data elements from the input data: might be just a subset of - # data model and flatten the schema to get a simple and sequential list - # of elements included in the input data - elements = [x for x in data if x not in disables] - element_atts = schemas.df_schema(elements, schema) - - # See what elements we need to validate - numeric_elements = _get_elements(elements, element_atts, "numeric_types") - datetime_elements = _get_elements(elements, element_atts, "datetime") - coded_elements = _get_elements(elements, element_atts, "key") - str_elements = _get_elements(elements, element_atts, "str") - - if _element_tuples(numeric_elements, datetime_elements, coded_elements): - validated_columns = pd.MultiIndex.from_tuples( - list(set(numeric_elements + coded_elements + datetime_elements)) - ) - else: - validated_columns = list( - set(numeric_elements + coded_elements + datetime_elements) - ) - - mask[numeric_elements] = validate_numeric(numeric_elements, data, element_atts) - - # 2. 
Table coded elements - # See following: in multiple keys code tables, the non parameter element, - # won't have a code_table attribute in the element_atts: - # So we need to check the code_table.keys files in addition to the element_atts - # Additionally, a YEAR key can fail in one table, but be compliant with anbother, then, how would we mask this? - # also, a YEAR defined as an integer, will undergo its own check..... - # So I think we need to check nested keys as a whole, and mask only the actual parameterized element: - # Get the full list of keys combinations (tuples, triplets...) and check the column combination against that: if it fails, mark the element! - # Need to see how to grab the YEAR part of a datetime when YEAR comes from a datetime element - # pd.DatetimeIndex(df['_datetime']).year - if len(coded_elements) > 0: - mask[coded_elements] = validate_codes( - coded_elements, - data, - element_atts, - imodel, - ext_table_path, - ) - - # 3. Datetime elements - mask[datetime_elements] = validate_datetime(datetime_elements, data) - - # 4. str elements - mask[str_elements] = validate_str(str_elements, data) - - # 5. 
Set False values - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=False), - False, - ) - - mask[validated_columns] = mask[validated_columns].mask( - data[validated_columns].map(_mask_boolean, boolean=True), - True, - ) - - mask[disables] = np.nan + disables = disables or [] + elements = [col for col in data.columns if col not in disables] + element_atts = { + element: attributes[element] for element in elements if element in attributes + } + + validated_columns = [] + validated_dtypes = set(properties.numeric_types) | {"datetime", "key"} + + basic_functions = { + "datetime": validate_datetime, + "str": validate_str, + } + + for column in data.columns: + if column in disables or column not in attributes: + continue + + series = data[column] + column_atts = element_atts.get(column, {}) + column_type = column_atts.get("column_type") + + if column_type in properties.numeric_types: + valid_min = column_atts.get("valid_min", -np.inf) + valid_max = column_atts.get("valid_max", np.inf) + column_mask = validate_numeric(series, valid_min, valid_max) + elif column_type == "key": + code_table_name = column_atts.get("codetable") + code_table = codes.read_table( + code_table_name, imodel=imodel, ext_table_path=ext_table_path + ) + column_mask = validate_codes(series, code_table, column_type) + elif column_type in basic_functions: + column_mask = basic_functions[column_type](series) + else: + logging.warning( + f"Unknown column_type '{column_type}' for column '{column}'" + ) + continue + + mask[column] = column_mask + if column_type in validated_dtypes: + validated_columns.append(column) + + # Explicit boolean literals ("True"/"False") override validation results + if validated_columns: + validated_columns = list(dict.fromkeys(validated_columns)) + to_bool = data[validated_columns].applymap(convert_str_boolean) + false_mask = to_bool.applymap(_is_false) + true_mask = to_bool.applymap(_is_true) + mask[validated_columns] 
= mask[validated_columns].mask(false_mask, False) + mask[validated_columns] = mask[validated_columns].mask(true_mask, True) + return mask.astype("boolean") diff --git a/cdm_reader_mapper/mdf_reader/writer.py b/cdm_reader_mapper/mdf_reader/writer.py index 9dabc272..a2d45fcf 100755 --- a/cdm_reader_mapper/mdf_reader/writer.py +++ b/cdm_reader_mapper/mdf_reader/writer.py @@ -5,33 +5,21 @@ import json import logging from io import StringIO as StringIO +from pathlib import Path import pandas as pd -from cdm_reader_mapper.common import get_filename -from cdm_reader_mapper.common.pandas_TextParser_hdlr import make_copy +from .utils.utilities import join, update_column_names, update_dtypes - -def _update_dtypes(dtypes, columns) -> dict: - if isinstance(dtypes, dict): - dtypes = {k: v for k, v in dtypes.items() if k in columns} - return dtypes - - -def _update_col_names(dtypes, col_o, col_n) -> str | dict: - if isinstance(dtypes, str): - return dtypes - if col_o in dtypes.keys(): - dtypes[col_n] = dtypes[col_o] - del dtypes[col_o] - return dtypes +from ..common import get_filename +from ..common.pandas_TextParser_hdlr import make_copy def write_data( data, mask=None, - dtypes={}, - parse_dates=False, + dtypes: dict | None = None, + parse_dates: list | bool = False, encoding="utf-8", out_dir=".", prefix=None, @@ -100,30 +88,29 @@ def write_data( ---- Use this function after reading MDF data. 
""" - - def _join(col): - if isinstance(col, (list, tuple)): - return ":".join(col) - return col + dtypes = dtypes or {} + if isinstance(parse_dates, bool): + parse_dates = [] if not isinstance(data, pd.io.parsers.TextFileReader): - data = [data] + data_list = [data] else: - data = make_copy(data) + data_list = make_copy(data) if mask is None: mask = pd.DataFrame() if not isinstance(mask, pd.io.parsers.TextFileReader): - mask = [mask] + mask_list = [mask] else: - mask = make_copy(mask) + mask_list = make_copy(mask) - info = {} - info["dtypes"] = dtypes - info["parse_dates"] = [_join(parse_date) for parse_date in parse_dates] + info = {"dtypes": dtypes.copy(), "parse_dates": [join(p) for p in parse_dates]} logging.info(f"WRITING DATA TO FILES IN: {out_dir}") + out_dir = Path(out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + filename_data = get_filename( [prefix, "data", suffix], path=out_dir, extension=extension ) @@ -133,37 +120,40 @@ def _join(col): filename_info = get_filename( [prefix, "info", suffix], path=out_dir, extension="json" ) - for i, (data_df, mask_df) in enumerate(zip(data, mask)): + + for i, (data_df, mask_df) in enumerate(zip(data_list, mask_list)): if col_subset is not None: data_df = data_df[col_subset] mask_df = mask_df[col_subset] - header = False - mode = "a" + + if isinstance(data_df, pd.Series): + data_df = data_df.to_frame() + if isinstance(mask_df, pd.Series): + mask_df = mask_df.to_frame() + + mode = "w" if i == 0 else "a" + header = [join(c) for c in data_df.columns] if i == 0 else False + if i == 0: - mode = "w" - header = [] - info["dtypes"] = _update_dtypes(info["dtypes"], data_df.columns) + info["dtypes"] = update_dtypes(info["dtypes"], data_df.columns) for col in data_df.columns: - col_ = _join(col) - header.append(col_) - info["dtypes"] = _update_col_names(info["dtypes"], col, col_) + info["dtypes"] = update_column_names(info["dtypes"], col, join(col)) - info["parse_dates"] = [ - parse_date for parse_date in 
info["parse_dates"] if parse_date in header - ] + info["parse_dates"] = [p for p in info["parse_dates"] if p in header] info["encoding"] = encoding - kwargs = { - "header": header, - "mode": mode, - "encoding": encoding, - "index": False, - "sep": delimiter, - } - data_df.to_csv(filename_data, **kwargs) + csv_kwargs = dict( + header=header, + mode=mode, + index=False, + sep=delimiter, + encoding=encoding, + **kwargs, + ) + + data_df.to_csv(filename_data, **csv_kwargs) if not mask_df.empty: - mask_df.to_csv(filename_mask, **kwargs) + mask_df.to_csv(filename_mask, **csv_kwargs) - if info: - with open(filename_info, "w") as fileObj: - json.dump(info, fileObj, indent=4) + with open(filename_info, "w") as fileObj: + json.dump(info, fileObj, indent=4) diff --git a/cdm_reader_mapper/metmetpy/correct.py b/cdm_reader_mapper/metmetpy/correct.py index 66901f24..85f85603 100755 --- a/cdm_reader_mapper/metmetpy/correct.py +++ b/cdm_reader_mapper/metmetpy/correct.py @@ -64,8 +64,8 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts +from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common.json_dict import collect_json_files, combine_dicts from . import properties from .datetime import correction_functions as corr_f_dt diff --git a/cdm_reader_mapper/metmetpy/validate.py b/cdm_reader_mapper/metmetpy/validate.py index 87640116..f8180a02 100755 --- a/cdm_reader_mapper/metmetpy/validate.py +++ b/cdm_reader_mapper/metmetpy/validate.py @@ -61,8 +61,8 @@ import pandas as pd -from cdm_reader_mapper.common import logging_hdlr, pandas_TextParser_hdlr -from cdm_reader_mapper.common.json_dict import collect_json_files, combine_dicts +from ..common import logging_hdlr, pandas_TextParser_hdlr +from ..common.json_dict import collect_json_files, combine_dicts from . 
import properties from .datetime import model_datetimes diff --git a/tests/test_mdf_reader.py b/tests/test_mdf_reader.py index 8c194735..59c75f50 100755 --- a/tests/test_mdf_reader.py +++ b/tests/test_mdf_reader.py @@ -2,14 +2,24 @@ import os +import numpy as np import pandas as pd import pytest -from cdm_reader_mapper import test_data +from cdm_reader_mapper import test_data, DataBundle from cdm_reader_mapper.mdf_reader.reader import ( read_mdf, read_data, + validate_read_mdf_args, ) +from cdm_reader_mapper.mdf_reader.utils.filereader import _apply_multiindex + + +def _get_columns(columns, select): + if isinstance(columns, pd.MultiIndex): + return columns.get_level_values(0).isin(select) + mask = [(type(c) is tuple and c[0] in select) or (c in select) for c in columns] + return np.array(mask) def _drop_rows(df, drops): @@ -37,19 +47,22 @@ def _read_mdf_test_data(data_model, select=None, drop=None, drop_idx=None, **kwa result.mask = result.mask.read() if select: - expected.data = expected.data[select] - expected.mask = expected.mask[select] + selected = _get_columns(expected.data.columns, select) + expected.data = expected.data.loc[:, selected] + expected.mask = expected.mask.loc[:, selected] if drop: - result.data = result.data.drop(columns=drop) - result.mask = result.mask.drop(columns=drop) - expected.data = expected.data.drop(columns=drop) - expected.mask = expected.mask.drop(columns=drop) + unselected = _get_columns(expected.data.columns, drop) + expected.data = expected.data.loc[:, ~unselected] + expected.mask = expected.mask.loc[:, ~unselected] if drop_idx: expected.data = _drop_rows(expected.data, drop_idx) expected.mask = _drop_rows(expected.mask, drop_idx) + expected.data = _apply_multiindex(expected.data) + expected.mask = _apply_multiindex(expected.mask) + pd.testing.assert_frame_equal(result.data, expected.data) pd.testing.assert_frame_equal(result.mask, expected.mask) @@ -78,7 +91,7 @@ def _read_mdf_test_data(data_model, select=None, drop=None, 
drop_idx=None, **kwa "gdac", ], ) -def test_read_mdf_test_data(data_model): +def test_read_mdf_test_data_basic(data_model): _read_mdf_test_data(data_model) @@ -137,19 +150,38 @@ def test_read_mdf_test_data_kwargs(data_model, kwargs): "data_model, kwargs, select", [ ("icoads_r300_d714", {"sections": ["c99"], "chunksize": 3}, ["c99"]), + ("icoads_r300_d714", {"sections": ["c99"]}, ["c99"]), + ("icoads_r300_d714", {"sections": "c99"}, ["c99"]), ( "icoads_r300_d714", - {"sections": ["core", "c99"], "chunksize": 3}, + {"sections": ["core", "c99"]}, ["core", "c99"], ), + ("craid", {"sections": ["drifter_measurements"]}, ["drifter_measurements"]), ], ) def test_read_mdf_test_data_select(data_model, kwargs, select): _read_mdf_test_data(data_model, select=select, **kwargs) -def test_read_mdf_test_data_drop(): - _read_mdf_test_data("icoads_r300_mixed", drop=["c99"], encoding="cp1252") +@pytest.mark.parametrize( + "data_model, kwargs, drop", + [ + ("icoads_r300_d714", {"excludes": ["c98"]}, ["c98"]), + ("icoads_r300_d714", {"excludes": "c98"}, ["c98"]), + ("icoads_r300_d714", {"excludes": ["c5", "c98"]}, ["c5", "c98"]), + ("icoads_r300_mixed", {"excludes": ["c99"], "encoding": "cp1252"}, ["c99"]), + ("icoads_r300_mixed", {"excludes": "c99", "encoding": "cp1252"}, ["c99"]), + ( + "craid", + {"excludes": ["drifter_measurements", "drifter_history"]}, + ["drifter_measurements", "drifter_history"], + ), + ("gdac", {"excludes": "AAAA"}, ["AAAA"]), + ], +) +def test_read_mdf_test_data_exclude(data_model, kwargs, drop): + _read_mdf_test_data(data_model, drop=drop, **kwargs) @pytest.mark.parametrize( @@ -168,3 +200,278 @@ def test_read_mdf_test_data_drop(): ) def test_read_mdf_test_data_drop_idx(data_model, kwargs, drop_idx): _read_mdf_test_data(data_model, drop_idx=drop_idx, **kwargs) + + +def test_read_data_basic(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + mask = test_data[f"test_{data_model}"]["mdf_mask"] + info = 
test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, mask, info) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.MultiIndex) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_no_mask(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + info = test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, info=info) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.MultiIndex) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_no_info(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + + db = read_data(data) + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert 
isinstance(db.columns, pd.MultiIndex) + assert db.dtypes == "object" + assert db.parse_dates is False + assert db.encoding is None + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_read_data_col_subset(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + info = test_data[f"test_{data_model}"]["mdf_info"] + db = read_data(data, info=info, col_subset="core") + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.Index) + assert isinstance(db.dtypes, dict) + assert isinstance(db.parse_dates, list) + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 48) + assert db.size == 240 + + +def test_read_data_encoding(): + data_model = "icoads_r300_d721" + data = test_data[f"test_{data_model}"]["mdf_data"] + db = read_data(data, encoding="cp1252") + + assert isinstance(db, DataBundle) + + for attr in [ + "data", + "mask", + "columns", + "dtypes", + "parse_dates", + "encoding", + "imodel", + "mode", + ]: + assert hasattr(db, attr) + + assert isinstance(db.data, pd.DataFrame) + assert isinstance(db.mask, pd.DataFrame) + assert isinstance(db.columns, pd.Index) + assert db.dtypes == "object" + assert db.parse_dates is False + assert isinstance(db.encoding, str) + assert db.encoding == "cp1252" + assert db.imodel is None + assert isinstance(db.mode, str) + assert db.mode == "data" + assert len(db) == 5 + assert db.shape == (5, 341) + assert db.size == 1705 + + +def test_validate_read_mdf_args_pass(tmp_path): + source 
= tmp_path / "file.mdf" + source.touch() + + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=2000, + year_end=2020, + chunksize=100, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_source(tmp_path): + with pytest.raises(FileNotFoundError): + validate_read_mdf_args( + source=tmp_path / "missing.mdf", + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=0, + ) + + +def test_validate_read_mdf_args_missing_all_sources(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises( + ValueError, + match="One of imodel or ext_schema_path/ext_schema_file must be provided", + ): + validate_read_mdf_args( + source=source, + imodel=None, + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_chunksize(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="chunksize must be a positive integer"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=0, + skiprows=0, + ) + + +def test_validate_read_mdf_args_invalid_skiprows(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="skiprows must be >= 0"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=None, + year_end=None, + chunksize=None, + skiprows=-1, + ) + + +def test_validate_read_mdf_args_invalid_years(tmp_path): + source = tmp_path / "file.mdf" + source.touch() + + with pytest.raises(ValueError, match="year_init must be <= year_end"): + validate_read_mdf_args( + source=source, + imodel=object(), + ext_schema_path=None, + ext_schema_file=None, + year_init=2021, + 
year_end=2020, + chunksize=None, + skiprows=0, + ) diff --git a/tests/test_mdf_writer.py b/tests/test_mdf_writer.py new file mode 100755 index 00000000..fc0a4ca1 --- /dev/null +++ b/tests/test_mdf_writer.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import json + +import pandas as pd +import pytest # noqa + +from pandas.testing import assert_frame_equal + +from cdm_reader_mapper.mdf_reader.writer import ( + write_data, +) + + +def test_write_data_basic(tmp_path): + data = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["1", "2", "3"], + } + ) + mask = pd.DataFrame( + { + "A": [True, True, False], + "B": [False, True, True], + } + ) + info = { + "dtypes": {"A": "int", "B": "str"}, + "parse_dates": [], + "encoding": "utf-8", + } + + write_data( + data, + mask=mask, + out_dir=tmp_path, + prefix="test_write", + suffix="basic", + **info, + ) + + data_file = tmp_path / "test_write-data-basic.csv" + mask_file = tmp_path / "test_write-mask-basic.csv" + info_file = tmp_path / "test_write-info-basic.json" + + assert data_file.is_file() + assert mask_file.is_file() + assert info_file.is_file() + + with open(info_file) as read_file: + info_res = json.load(read_file) + + assert info_res == info + + data_res = pd.read_csv(data_file, dtype=info["dtypes"]) + assert_frame_equal(data, data_res) + + mask_res = pd.read_csv(mask_file, dtype="bool") + assert_frame_equal(mask, mask_res) + + +def test_write_data_col_subset(tmp_path): + data = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["1", "2", "3"], + } + ) + mask = pd.DataFrame( + { + "A": [True, True, False], + "B": [False, True, True], + } + ) + info = { + "dtypes": {"A": "int"}, + "parse_dates": [], + "encoding": "utf-8", + } + subset = "A" + + write_data( + data, + mask=mask, + out_dir=tmp_path, + prefix="test_write", + suffix="subset", + col_subset=subset, + **info, + ) + + data_file = tmp_path / "test_write-data-subset.csv" + mask_file = tmp_path / "test_write-mask-subset.csv" + info_file = tmp_path / 
"test_write-info-subset.json" + + assert data_file.is_file() + assert mask_file.is_file() + assert info_file.is_file() + + with open(info_file) as read_file: + info_res = json.load(read_file) + + assert info_res == info + + data_res = pd.read_csv(data_file, dtype=info["dtypes"]) + assert_frame_equal(data[[subset]], data_res) + + mask_res = pd.read_csv(mask_file, dtype="bool") + assert_frame_equal(mask[[subset]], mask_res) diff --git a/tests/test_reader_codes.py b/tests/test_reader_codes.py new file mode 100755 index 00000000..66b6c5ed --- /dev/null +++ b/tests/test_reader_codes.py @@ -0,0 +1,39 @@ +from __future__ import annotations + +import pytest +from pathlib import Path +import json + +from cdm_reader_mapper.mdf_reader.codes.codes import read_table + + +@pytest.fixture +def tmp_json_file(tmp_path: Path) -> tuple[Path, dict]: + """Create a temporary JSON file and return path and data.""" + data = {"A": {"value": 1}, "B": {"value": 2}} + file_path = tmp_path / "test_table.json" + file_path.write_text(json.dumps(data), encoding="utf-8") + return file_path, data + + +def test_read_table_with_imodel(): + result = read_table("ICOADS.C99.SEALUMI", imodel="icoads_r300_d781") + assert isinstance(result, dict) + assert result == {"0": "no", "1": "yes", "9": "missing", "8": "unknown"} + + +def test_read_table_with_external_file(tmp_json_file): + file_path, expected_data = tmp_json_file + result = read_table("test_table", ext_table_path=str(file_path.parent)) + assert isinstance(result, dict) + assert result == expected_data + + +def test_read_table_with_missing_file(): + with pytest.raises(FileNotFoundError): + read_table("nonexistent_table", ext_table_path="tmp") + + +def test_read_table_requires_input(): + with pytest.raises(ValueError): + read_table("table_without_path_or_model") diff --git a/tests/test_reader_convert_and_decode.py b/tests/test_reader_convert_and_decode.py new file mode 100755 index 00000000..56c3c170 --- /dev/null +++ 
b/tests/test_reader_convert_and_decode.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from decimal import Decimal + +from cdm_reader_mapper.mdf_reader.utils.convert_and_decode import ( + max_decimal_places, + to_numeric, + Decoders, + Converters, + convert_and_decode, +) +from cdm_reader_mapper.mdf_reader import properties + + +@pytest.fixture +def sample_series(): + return pd.Series(["A", "Z", "10", "1Z"]) + + +@pytest.fixture +def numeric_series(): + return pd.Series(["1", "2 ", "3", "False", "bad"], dtype="object", name="NUM") + + +@pytest.fixture +def sample_df(): + return pd.DataFrame( + { + "NUM": ["1", "2 ", "3", "False", "bad"], # object type + "KEY": ["a", "b", "c", "d", "e"], # for decoder + } + ) + + +def test_max_decimal_places(): + assert max_decimal_places(Decimal("1"), Decimal("2.34")) == 2 + assert max_decimal_places(Decimal("1.200"), Decimal("3.4")) == 3 + assert max_decimal_places(Decimal("5")) == 0 + + +@pytest.mark.parametrize( + "value, scale, offset, expected", + [ + ("10", Decimal("0.1"), Decimal("0"), Decimal("1.0")), + ("10", Decimal("1"), Decimal("5"), Decimal("15")), + ("3.5", Decimal("2"), Decimal("1.00"), Decimal("8.00")), + (" 2 ", Decimal("1"), Decimal("0"), Decimal("2")), + ("", Decimal("1"), Decimal("0"), False), + ("abc", Decimal("1"), Decimal("0"), False), + ], +) +def test_to_numeric_valid(value, scale, offset, expected): + assert to_numeric(value, scale, offset) == expected + + +def test_to_numeric_boolean_passthrough(): + assert to_numeric(True, Decimal("1"), Decimal("0")) is True + assert to_numeric(False, Decimal("1"), Decimal("0")) is False + + +def test_to_numeric_space_replacement(): + assert to_numeric("1 2", Decimal("1"), Decimal("0")) == Decimal("102") + + +def test_to_numeric_precision_preserved(): + result = to_numeric("1.234", Decimal("0.1"), Decimal("0.00")) + assert result == Decimal("0.123") + + +def test_base36_decoding_basic(sample_series): + dec = 
Decoders(dtype="key") + decoder = dec.decoder() + + result = decoder(sample_series) + + assert list(result) == ["10", "35", "36", "71"] + + +def test_base36_preserves_boolean(): + series = pd.Series(["True", "False", "A"]) + dec = Decoders(dtype="key") + + result = dec.decoder()(series) + + assert result.tolist() == [True, False, "10"] + + +def test_converter_numeric(numeric_series): + conv = Converters(dtype=next(iter(properties.numeric_types))) + func = conv.converter() + + result = func(numeric_series) + + assert result.iloc[0] == Decimal("1") + assert result.iloc[1] == Decimal("2") + assert result.iloc[2] == Decimal("3") + assert result.iloc[3] is False + assert result.iloc[4] is False + + +def test_numeric_with_scale_offset(): + conv = Converters(dtype="float") + series = pd.Series(["1", "2"]) + + result = conv.object_to_numeric(series, scale=10, offset=5) + + assert result.tolist() == [Decimal("15"), Decimal("25")] + + +def test_preprocessing_function_pppp(): + conv = Converters(dtype=next(iter(properties.numeric_types))) + series = pd.Series(["0123"], name="PPPP") + + result = conv.object_to_numeric(series) + + assert result.iloc[0] == Decimal("10123") + + +def test_object_to_object_strip(): + conv = Converters(dtype="object") + series = pd.Series([" a ", "", " ", "b"]) + + result = conv.object_to_object(series) + + assert result.tolist() == ["a", None, None, "b"] + + +def test_object_to_object_disable_strip(): + conv = Converters(dtype="object") + series = pd.Series([" a ", "b "]) + + result = conv.object_to_object(series, disable_white_strip="l") + + assert result.tolist() == [" a", "b"] + + +def test_object_to_datetime(): + conv = Converters(dtype="datetime") + series = pd.Series(["20240101", "bad"]) + + result = conv.object_to_datetime(series) + + assert pd.notna(result.iloc[0]) + assert pd.isna(result.iloc[1]) + + +def test_unknown_dtype_raises(): + with pytest.raises(KeyError): + Converters("unknown").converter() + + +def 
test_convert_and_decode_basic(): + df = pd.DataFrame({"A": ["1", "2", "3"], "B": ["x", "y", "z"]}) + + converter_dict = { + "A": lambda s: s.apply(lambda x: Decimal(x) * 2), + "B": lambda s: s.str.upper(), + } + converter_kwargs = {"A": {}, "B": {}} + + decoder_dict = {"A": lambda s: s.apply(lambda x: str(int(x) + 1))} + + out = convert_and_decode( + df.copy(), + convert_flag=True, + decode_flag=True, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + assert out["A"].iloc[0] == Decimal(4) + assert out["A"].iloc[1] == Decimal(6) + assert out["B"].iloc[0] == "X" + + +def test_convert_and_decode_with_converters_and_decoders(sample_df): + df = sample_df.copy() + + conv = Converters(dtype="int") + converter_dict = {"NUM": conv.converter()} + converter_kwargs = {"NUM": {}} + + dec = Decoders(dtype="key") + decoder_dict = {"KEY": dec.decoder()} + + out = convert_and_decode( + df, + convert_flag=True, + decode_flag=True, + converter_dict=converter_dict, + converter_kwargs=converter_kwargs, + decoder_dict=decoder_dict, + ) + + expected_nums = [Decimal("1"), Decimal("2"), Decimal("3"), False, False] + for i, val in enumerate(expected_nums): + assert out["NUM"].iloc[i] == val + + expected_keys = ["10", "11", "12", "13", "14"] + for i, val in enumerate(expected_keys): + assert out["KEY"].iloc[i] == val diff --git a/tests/test_reader_filereader.py b/tests/test_reader_filereader.py new file mode 100755 index 00000000..89badf3a --- /dev/null +++ b/tests/test_reader_filereader.py @@ -0,0 +1,353 @@ +from __future__ import annotations + +import pytest + +import pandas as pd +import xarray as xr + +from io import StringIO + +from pandas.io.parsers import TextFileReader +from pandas.testing import assert_frame_equal, assert_index_equal + +from cdm_reader_mapper import DataBundle + +from cdm_reader_mapper.mdf_reader.utils.parser import OrderSpec, ParserConfig + +from cdm_reader_mapper.mdf_reader.utils.filereader import ( + 
_apply_or_chunk, + _merge_kwargs, + _apply_multiindex, + _select_years, + FileReader, +) + + +def f(x, y): + return x + y + + +def test_merge_kwargs_success(): + out = _merge_kwargs({"a": 1}, {"b": 2}) + assert out == {"a": 1, "b": 2} + + +def test_merge_kwargs_duplicate_key(): + with pytest.raises(ValueError): + _merge_kwargs({"a": 1}, {"a": 2}) + + +def test_apply_multiindex_no_tuples(): + df = pd.DataFrame({"a": [1], "b": [2]}) + out = _apply_multiindex(df) + assert out.columns.equals(df.columns) + + +def test_apply_multiindex_with_tuples(): + df = pd.DataFrame({("core", "YR"): [2010], ("core", "MO"): [7]}) + out = _apply_multiindex(df) + assert isinstance(out.columns, pd.MultiIndex) + assert out.columns.tolist() == [("core", "YR"), ("core", "MO")] + + +def test_select_years_no_selection(): + df = pd.DataFrame({"YR": [2000, 2001]}) + out = _select_years(df, (None, None), "YR") + pd.testing.assert_frame_equal(out, df) + + +def test_select_years_range(): + df = pd.DataFrame({"YR": [1999, 2000, 2001, 2002]}) + out = _select_years(df, (2000, 2001), "YR") + assert out["YR"].tolist() == [2000, 2001] + + +def test_select_years_handles_non_numeric(): + df = pd.DataFrame({"YR": ["2000", "bad", "2001"]}) + out = _select_years(df, (2000, 2001), "YR") + assert out["YR"].tolist() == ["2000", "2001"] + + +def test_apply_or_chunk_dataframe(): + df = pd.DataFrame({"test": [1, 2, 3, 4]}) + out = _apply_or_chunk(df, f, func_args=[2]) + assert isinstance(out, pd.DataFrame) + assert_frame_equal(out, pd.DataFrame({"test": [3, 4, 5, 6]})) + + +def test_apply_or_chunk_textfilereader(): + buffer = StringIO("test\n1\n2\n3\n4") + read_kwargs = {"chunksize": 2} + reader = pd.read_csv(buffer, **read_kwargs) + (out,) = _apply_or_chunk(reader, f, func_args=[2], read_kwargs=read_kwargs) + assert isinstance(out, TextFileReader) + assert_frame_equal(out.read(), pd.DataFrame({"test": [3, 4, 5, 6]})) + + +@pytest.fixture +def dtypes(): + return { + ("core", "YR"): "Int64", + ("core", "MO"): 
"Int64", + ("core", "DY"): "Int64", + ("core", "HR"): "Int64", + } + + +@pytest.fixture +def fake_pandas_df(): + data = { + "0": [ + "2010 7 1 100", + "2010 7 2 200", + "2010 7 3 300", + ] + } + return pd.DataFrame(data) + + +@pytest.fixture +def fake_pandas_df_file(fake_pandas_df, tmp_path): + file_path = tmp_path / "fake_dataframe.csv" + fake_pandas_df.to_csv(file_path, header=False, index=False) + return file_path + + +@pytest.fixture +def fake_xr_dataset(): + return xr.Dataset( + { + "YR": ("time", [2010, 2010, 2010]), + "MO": ("time", [7, 7, 7]), + "DY": ("time", [1, 2, 3]), + "HR": ("time", [10, 20, 30]), + }, + coords={"time": [0, 1, 2]}, + attrs={"source": "fake"}, + ) + + +@pytest.fixture +def fake_xr_dataset_file(fake_xr_dataset, tmp_path): + file_path = tmp_path / "fake_dataset.nc" + fake_xr_dataset.to_netcdf(file_path) + return file_path + + +@pytest.fixture +def fake_out_dataset(dtypes): + data = { + ("core", "YR"): [2010, 2010, 2010], + ("core", "MO"): [7, 7, 7], + ("core", "DY"): [1, 2, 3], + ("core", "HR"): [10, 20, 30], + } + df = pd.DataFrame(data) + + for col, dtype in dtypes.items(): + df[col] = df[col].astype(dtype) + + return df + + +@pytest.fixture +def fake_config(dtypes): + order_specs = { + "core": OrderSpec( + header={"length": 12, "field_layout": "fixed_width"}, + elements={ + "YR": {"index": ("core", "YR"), "ignore": False, "field_length": 4}, + "MO": {"index": ("core", "MO"), "ignore": False, "field_length": 2}, + "DY": {"index": ("core", "DY"), "ignore": False, "field_length": 2}, + "HR": {"index": ("core", "HR"), "ignore": False, "field_length": 4}, + }, + is_delimited=False, + ) + } + return ParserConfig( + order_specs=order_specs, + disable_reads=[], + dtypes=dtypes, + parse_dates=[], + convert_decode={ + "converter_dict": {}, + "converter_kwargs": {}, + "decoder_dict": {}, + }, + validation={}, + encoding="utf-8", + ) + + +@pytest.fixture +def reader_pd(fake_config): + r = FileReader("icoads") + # override config for test + 
r.config = fake_config + return r + + +@pytest.fixture +def reader_xr(fake_config): + r = FileReader("craid") + # override config for test + r.config = fake_config + return r + + +def test_process_data_pandas(reader_pd, fake_pandas_df, fake_out_dataset): + data, mask, config = reader_pd._process_data( + fake_pandas_df, + convert_flag=False, + decode_flag=False, + converter_dict=None, + converter_kwargs=None, + decoder_dict=None, + validate_flag=False, + ext_table_path=None, + sections=None, + excludes=None, + config=reader_pd.config, + parse_mode="pandas", + ) + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_process_data_netcdf(reader_xr, fake_xr_dataset, fake_out_dataset): + data, mask, config = reader_xr._process_data( + fake_xr_dataset, + convert_flag=False, + decode_flag=False, + converter_dict=None, + converter_kwargs=None, + decoder_dict=None, + validate_flag=False, + ext_table_path=None, + sections=None, + excludes=None, + config=reader_xr.config, + parse_mode="netcdf", + ) + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_open_data_pandas(reader_pd, fake_pandas_df_file, fake_out_dataset): + data, mask, config = reader_pd.open_data( + fake_pandas_df_file, + open_with="pandas", + ) + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + 
assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_open_data_netcdf(reader_xr, fake_xr_dataset_file, fake_out_dataset): + data, mask, config = reader_xr.open_data( + fake_xr_dataset_file, + open_with="netcdf", + ) + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + + assert config.columns is not None + + assert_frame_equal(data, fake_out_dataset) + assert_index_equal(data.columns, config.columns) + + assert mask.all().all() + + +def test_read_pandas(reader_pd, fake_pandas_df_file, dtypes, fake_out_dataset): + databundle = reader_pd.read( + fake_pandas_df_file, + ) + assert isinstance(databundle, DataBundle) + assert hasattr(databundle, "data") + assert hasattr(databundle, "mask") + assert hasattr(databundle, "columns") + assert hasattr(databundle, "dtypes") + assert hasattr(databundle, "parse_dates") + assert hasattr(databundle, "encoding") + assert hasattr(databundle, "imodel") + + data = databundle.data + mask = databundle.mask + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + assert_frame_equal(data, fake_out_dataset) + + assert_index_equal(data.columns, databundle.columns) + + assert mask.all().all() + + assert databundle.dtypes == dtypes + assert databundle.parse_dates == [] + assert databundle.encoding == "utf-8" + assert databundle.imodel == reader_pd.imodel + + +def test_read_netcdf(reader_xr, fake_xr_dataset_file, dtypes, fake_out_dataset): + databundle = reader_xr.read( + fake_xr_dataset_file, + ) + assert isinstance(databundle, DataBundle) + assert hasattr(databundle, "data") + assert hasattr(databundle, "mask") + assert hasattr(databundle, "columns") + assert hasattr(databundle, "dtypes") + assert hasattr(databundle, "parse_dates") + assert 
hasattr(databundle, "encoding") + assert hasattr(databundle, "imodel") + + data = databundle.data + mask = databundle.mask + + assert isinstance(data, pd.DataFrame) + assert isinstance(mask, pd.DataFrame) + assert_index_equal(data.columns, mask.columns) + assert len(data) == len(mask) + assert_frame_equal(data, fake_out_dataset) + + assert_index_equal(data.columns, databundle.columns) + + assert mask.all().all() + + assert databundle.dtypes == dtypes + assert databundle.parse_dates == [] + assert databundle.encoding == "utf-8" + assert databundle.imodel == reader_xr.imodel diff --git a/tests/test_reader_parser.py b/tests/test_reader_parser.py new file mode 100755 index 00000000..b7f4e254 --- /dev/null +++ b/tests/test_reader_parser.py @@ -0,0 +1,596 @@ +from __future__ import annotations + +import pytest # noqa + +import logging + +import pandas as pd +import xarray as xr # noqa + +from pandas.testing import assert_frame_equal + +from types import MethodType + +from cdm_reader_mapper.mdf_reader.utils.parser import ( + _get_index, + _get_ignore, + _convert_dtype_to_default, + _parse_fixed_width, + _parse_delimited, + _parse_line, + parse_pandas, + parse_netcdf, # noqa + update_pd_config, + update_xr_config, + ParserConfig, + build_parser_config, +) + + +@pytest.fixture +def order_specs(): + return { + "core": { + "header": {}, + "elements": { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + }, + "is_delimited": False, + }, + "c1": { + "header": {"sentinel": " 165"}, + "elements": { + "ATTI": {"index": ("c1", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c1", "ATTL"), "field_length": 2}, + "BSI": {"index": ("c1", "BSI"), "field_length": 1}, + }, + "is_delimited": False, + }, + "c5": { + "header": {"sentinel": " 594"}, + "elements": { + "ATTI": {"index": ("c5", "ATTI"), "field_length": 2}, + 
"ATTL": {"index": ("c5", "ATTL"), "field_length": 2}, + "OS": {"index": ("c5", "OS"), "field_length": 1}, + "OP": {"index": ("c5", "OP"), "field_length": 1}, + }, + "is_delimited": False, + }, + "c98": { + "header": {"sentinel": "9815"}, + "elements": { + "ATTI": {"index": ("c98", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c98", "ATTL"), "field_length": 2, "ignore": True}, + "UID": {"index": ("c98", "UID"), "field_length": 6}, + }, + "is_delimited": False, + }, + "c99_data": { + "header": {"delimiter": "}"}, + "elements": { + "control_No": {"index": ("c99_data", "control_No")}, + "name": {"index": ("c99_data", "name")}, + }, + "is_delimited": True, + }, + } + + +@pytest.fixture +def base_config_pd(): + return ParserConfig( + order_specs={}, + disable_reads=[], + dtypes={}, + parse_dates=[], + convert_decode={}, + validation={}, + encoding="utf-8", + columns=None, + ) + + +@pytest.fixture +def base_config_xr(): + return ParserConfig( + order_specs={ + "core": { + "elements": { + "TEMP": { + "index": ("core", "TEMP"), + "ignore": False, + }, + "PRES": { + "index": ("core", "PRES"), + "ignore": False, + }, + } + } + }, + disable_reads=[], + dtypes={}, + parse_dates=[], + convert_decode={}, + validation={ + ("core", "TEMP"): {"units": "__from_file__"}, + ("core", "PRES"): {"units": "__from_file__"}, + }, + encoding="utf-8", + columns=None, + ) + + +def test_get_index_single_length(): + assert _get_index("AT", "_SECTION_", 1) == "AT" + + +def test_get_index_multiple_length(): + assert _get_index("AT", "core", 2) == ("core", "AT") + + +@pytest.mark.parametrize( + "value, expected", + [ + (True, True), + (False, False), + ("true", True), + ("True", True), + ("1", True), + ("yes", True), + ("false", False), + ("0", False), + ("no", False), + ], +) +def test_get_ignore_string_and_bool_values(value, expected): + assert _get_ignore({"ignore": value}) is expected + + +def test_get_ignore_missing_key(): + assert _get_ignore({}) is False + + +def 
test_convert_dtype_none(): + assert _convert_dtype_to_default(None) is None + + +def test_convert_dtype_float(): + assert _convert_dtype_to_default("float") == "float" + + +def test_convert_dtype_int(): + assert _convert_dtype_to_default("int") == "Int64" + + +def test_convert_deprecated_float(caplog): + with caplog.at_level(logging.WARNING): + result = _convert_dtype_to_default("Float64") + assert result == "float" + assert "deprecated" in caplog.text + + +def test_convert_deprecated_int(caplog): + with caplog.at_level(logging.WARNING): + result = _convert_dtype_to_default("Int32") + assert result == "Int64" + assert "deprecated" in caplog.text + + +def test_convert_unknown_dtype(): + assert _convert_dtype_to_default("string") == "string" + + +@pytest.mark.parametrize( + "line, header, elements, exp_end, exp_out", + [ + ( + "2010 7 1 ", + {}, + { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + }, + 12, + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + ( + " 165 ", + {"sentinel": " 165"}, + { + "ATTI": {"index": ("c1", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c1", "ATTL"), "field_length": 2}, + "BSI": {"index": ("c1", "BSI"), "field_length": 1}, + }, + 5, + { + ("c1", "ATTI"): " 1", + ("c1", "ATTL"): "65", + ("c1", "BSI"): True, + }, + ), + ( + "9815IS7NQU", + {"sentinel": " 594"}, + { + "ATTI": {"index": ("c5", "ATTI"), "field_length": 2}, + "ATTL": {"index": ("c5", "ATTL"), "field_length": 2}, + "OS": {"index": ("c5", "OS"), "field_length": 1}, + "OP": {"index": ("c5", "OP"), "field_length": 1}, + }, + 0, + { + ("c5", "ATTI"): False, + ("c5", "ATTL"): False, + ("c5", "OS"): False, + ("c5", "OP"): False, + }, + ), + ( + "9815IS7NQU", + {"sentinel": "9815"}, + { + "ATTI": {"index": ("c98", "ATTI"), "field_length": 2}, + 
"ATTL": {"index": ("c98", "ATTL"), "field_length": 2, "ignore": True}, + "UID": {"index": ("c98", "UID"), "field_length": 6}, + }, + 10, + { + ("c98", "ATTI"): "98", + ("c98", "UID"): "IS7NQU", + }, + ), + ], +) +def test_parse_fixed_width(line, header, elements, exp_end, exp_out): + out = {} + end = _parse_fixed_width( + line=line, + i=0, + header=header, + elements=elements, + sections=None, + excludes=set(), + out=out, + ) + + assert end == exp_end + assert out == exp_out + + +@pytest.mark.parametrize( + "sections, excludes, exp_out", + [ + ( + ["core"], + set(), + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + (["c1"], set(), {}), + (None, ["core"], {}), + ( + None, + ["c1"], + { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + }, + ), + ], +) +def test_parse_fixed_width_kwargs(sections, excludes, exp_out): + out = {} + elements = { + "YR": {"index": ("core", "YR"), "field_length": 4}, + "MO": {"index": ("core", "MO"), "field_length": 2}, + "DY": {"index": ("core", "DY"), "field_length": 2}, + "HR": {"index": ("core", "HR"), "field_length": 4}, + } + end = _parse_fixed_width( + line="2010 7 1 ", + i=0, + header={}, + elements=elements, + sections=sections, + excludes=excludes, + out=out, + ) + + assert end == 12 + assert out == exp_out + + +def test_parse_delimited(): + line = "13615}Peder Aneus" + header = {"delimiter": "}"} + elements = { + "control_No": {"index": ("c99_data", "control_No")}, + "name": {"index": ("c99_data", "name")}, + } + out = {} + end = _parse_delimited( + line=line, + i=0, + header=header, + elements=elements, + sections=None, + excludes=set(), + out=out, + ) + + assert end == len(line) + assert out == { + ("c99_data", "control_No"): "13615", + ("c99_data", "name"): "Peder Aneus", + } + + +def test_parse_line(order_specs): + line = "2010 7 1 165 9815IS7NQU13615}Peder Aneus" + out = _parse_line( + line=line, + 
order_specs=order_specs, + sections=None, + excludes=set(), + ) + + assert out == { + ("core", "YR"): "2010", + ("core", "MO"): " 7", + ("core", "DY"): " 1", + ("core", "HR"): True, + ("c1", "ATTI"): " 1", + ("c1", "ATTL"): "65", + ("c1", "BSI"): True, + ("c5", "ATTI"): False, + ("c5", "ATTL"): False, + ("c5", "OS"): False, + ("c5", "OP"): False, + ("c98", "ATTI"): "98", + ("c98", "UID"): "IS7NQU", + ("c99_data", "control_No"): "13615", + ("c99_data", "name"): "Peder Aneus", + } + + +def test_parse_pandas(order_specs): + df = pd.DataFrame( + [ + "2010 7 1 165 9815IS7NQU13615}Peder Aneus", + "2010 7 20100 165 9815IS7NQU13615}Peder Aneus", + "2010 7 30200 165 9815IS7NQU13615}Peder Aneus", + ] + ) + out = parse_pandas( + df=df, + order_specs=order_specs, + ) + + data = { + ("core", "YR"): ["2010", "2010", "2010"], + ("core", "MO"): [" 7", " 7", " 7"], + ("core", "DY"): [" 1", " 2", " 3"], + ("core", "HR"): [True, "0100", "0200"], + ("c1", "ATTI"): [" 1", " 1", " 1"], + ("c1", "ATTL"): ["65", "65", "65"], + ("c1", "BSI"): [True, True, True], + ("c5", "ATTI"): [False, False, False], + ("c5", "ATTL"): [False, False, False], + ("c5", "OS"): [False, False, False], + ("c5", "OP"): [False, False, False], + ("c98", "ATTI"): ["98", "98", "98"], + ("c98", "UID"): ["IS7NQU", "IS7NQU", "IS7NQU"], + ("c99_data", "control_No"): ["13615", "13615", "13615"], + ("c99_data", "name"): ["Peder Aneus", "Peder Aneus", "Peder Aneus"], + } + + exp = pd.DataFrame(data, columns=list(data.keys())) + + assert_frame_equal(out, exp) + + +def test_parse_netcdf(order_specs): + ds = xr.Dataset( + { + "YR": ("time", [2010, 2010, 2010]), + "MO": ("time", [7, 7, 7]), + "DY": ("time", [1, 2, 3]), + "HR": ("time", [10, 20, 30]), + }, + coords={"time": [0, 1, 2]}, + attrs={"source": "fake"}, + ) + out = parse_netcdf( + ds=ds, + order_specs=order_specs, + ) + + data = { + ("core", "YR"): [2010, 2010, 2010], + ("core", "MO"): [7, 7, 7], + ("core", "DY"): [1, 2, 3], + ("core", "HR"): [10, 20, 30], + ("c1", 
"ATTI"): [False, False, False], + ("c1", "ATTL"): [False, False, False], + ("c1", "BSI"): [False, False, False], + ("c5", "ATTI"): [False, False, False], + ("c5", "ATTL"): [False, False, False], + ("c5", "OS"): [False, False, False], + ("c5", "OP"): [False, False, False], + ("c98", "ATTI"): [False, False, False], + ("c98", "UID"): [False, False, False], + ("c99_data", "control_No"): [False, False, False], + ("c99_data", "name"): [False, False, False], + } + + exp = pd.DataFrame(data, columns=list(data.keys())) + + assert_frame_equal(out, exp) + + +def test_update_pd_config_updates_encoding(base_config_pd): + pd_kwargs = {"encoding": "latin-1"} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config.encoding == "latin-1" + assert base_config_pd.encoding == "utf-8" + assert new_config is not base_config_pd + + +def test_update_pd_config_no_encoding_key(base_config_pd): + pd_kwargs = {"sep": ","} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_pd_config_empty_encoding(base_config_pd): + pd_kwargs = {"encoding": ""} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_pd_config_none_encoding(base_config_pd): + pd_kwargs = {"encoding": None} + + new_config = update_pd_config(pd_kwargs, base_config_pd) + + assert new_config is base_config_pd + + +def test_update_xr_config_ignores_missing_elements(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + elements = new_config.order_specs["core"]["elements"] + assert elements["PRES"]["ignore"] is True + assert elements["TEMP"]["ignore"] is False + + +def test_update_xr_config_populates_validation_from_attrs(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + "PRES": xr.DataArray([1010, 
1011, 1012], attrs={"units": "hPa"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + assert new_config.validation[("core", "TEMP")]["units"] == "K" + assert new_config.validation[("core", "PRES")]["units"] == "hPa" + + +def test_update_xr_config_removes_missing_validation_attrs(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={}), + "PRES": xr.DataArray([1010, 1011, 1012], attrs={"units": "hPa"}), + } + ) + + new_config = update_xr_config(ds, base_config_xr) + + assert "units" not in new_config.validation[("core", "TEMP")] + assert new_config.validation[("core", "PRES")]["units"] == "hPa" + + +def test_update_xr_config_does_not_mutate_original(base_config_xr): + ds = xr.Dataset( + data_vars={ + "TEMP": xr.DataArray([1, 2, 3], attrs={"units": "K"}), + } + ) + + _ = update_xr_config(ds, base_config_xr) + + assert base_config_xr.order_specs["core"]["elements"]["PRES"]["ignore"] is False + assert base_config_xr.validation[("core", "TEMP")]["units"] == "__from_file__" + + +def test_build_parser_config_imodel(): + config = build_parser_config("icoads") + + assert isinstance(config, ParserConfig) + + assert hasattr(config, "order_specs") + assert isinstance(config.order_specs, dict) + assert "core" in config.order_specs + spec = config.order_specs["core"] + assert isinstance(spec, dict) + assert "header" in spec + assert isinstance(spec["header"], dict) + assert "elements" in spec + assert isinstance(spec["elements"], dict) + assert "is_delimited" in spec + assert isinstance(spec["is_delimited"], bool) + + assert hasattr(config, "disable_reads") + assert isinstance(config.disable_reads, list) + assert all(isinstance(x, str) for x in config.disable_reads) + + assert hasattr(config, "dtypes") + assert isinstance(config.dtypes, dict) + assert all(isinstance(x, tuple) for x in config.dtypes.keys()) + assert all(isinstance(x, str) for x in config.dtypes.values()) + + assert hasattr(config, "parse_dates") + assert 
isinstance(config.parse_dates, list) + assert config.parse_dates == [] + + assert hasattr(config, "convert_decode") + assert isinstance(config.convert_decode, dict) + + assert "converter_dict" in config.convert_decode + converter_dict = config.convert_decode["converter_dict"] + assert isinstance(converter_dict, dict) + assert all(isinstance(x, tuple) for x in converter_dict.keys()) + assert all(isinstance(x, MethodType) for x in converter_dict.values()) + + assert "converter_kwargs" in config.convert_decode + converter_kwargs = config.convert_decode["converter_kwargs"] + assert isinstance(converter_kwargs, dict) + assert all(isinstance(x, tuple) for x in converter_kwargs.keys()) + assert all(isinstance(x, dict) for x in converter_kwargs.values()) + + assert "decoder_dict" in config.convert_decode + decoder_dict = config.convert_decode["converter_dict"] + assert isinstance(decoder_dict, dict) + assert all(isinstance(x, tuple) for x in decoder_dict.keys()) + assert all(isinstance(x, MethodType) for x in decoder_dict.values()) + + assert hasattr(config, "validation") + assert isinstance(config.validation, dict) + assert all(isinstance(x, tuple) for x in config.validation.keys()) + assert all(isinstance(x, dict) for x in config.validation.values()) + + assert hasattr(config, "encoding") + assert isinstance(config.encoding, str) + + assert hasattr(config, "columns") + assert config.columns is None diff --git a/tests/test_reader_schemas.py b/tests/test_reader_schemas.py new file mode 100755 index 00000000..07598933 --- /dev/null +++ b/tests/test_reader_schemas.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import pytest +import json + +from cdm_reader_mapper.mdf_reader.schemas.schemas import ( + _resolve_schema_files, + _normalize_schema, + read_schema, +) + + +@pytest.fixture +def tmp_schema_file(tmp_path): + schema_data = { + "header": {"delimiter": ","}, + "sections": {"sec1": {"elements": {"a": 1, "b": 2}}}, + } + path = tmp_path / "schema" + 
path.mkdir(exist_ok=True) + file_path = tmp_path / "schema" / "schema.json" + file_path.write_text(json.dumps(schema_data)) + return file_path, schema_data + + +def test_resolve_schema_file_by_file(tmp_schema_file): + file_path, _ = tmp_schema_file + result = _resolve_schema_files(ext_schema_file=str(file_path)) + assert isinstance(result, list) + assert result[0] == file_path + + +def test_resolve_schema_file_by_path(tmp_path): + dir_path = tmp_path / "myschema" + dir_path.mkdir() + schema_file = dir_path / "myschema.json" + schema_file.write_text(json.dumps({"header": {}})) + + result = _resolve_schema_files(ext_schema_path=str(dir_path)) + assert len(result) == 1 + assert result[0] == schema_file.resolve() + + +def test_resolve_schema_file_missing_file(tmp_path): + missing_file = tmp_path / "does_not_exist.json" + with pytest.raises(FileNotFoundError): + _resolve_schema_files(ext_schema_file=str(missing_file)) + + +def test_resolve_schema_file_missing_path(tmp_path): + missing_dir = tmp_path / "nonexistent_dir" + with pytest.raises(FileNotFoundError): + _resolve_schema_files(ext_schema_path=str(missing_dir)) + + +def test_resolve_schema_file_no_input(): + with pytest.raises(ValueError): + _resolve_schema_files() + + +def test_normalize_schema_with_sections(): + schema = { + "header": {"delimiter": ","}, + "sections": {"sec1": {"elements": {"a": 1}}}, + } + result = _normalize_schema(schema) + assert "sections" in result + assert result["header"]["parsing_order"] == [{"s": ["sec1"]}] + + +def test_normalize_schema_missing_sections_and_elements(): + schema = {"header": {"delimiter": ","}} + with pytest.raises(KeyError): + _normalize_schema(schema) + + +def test_normalize_schema_preserves_existing_parsing_order(): + schema = { + "header": {"delimiter": ",", "parsing_order": [{"s": ["sec1"]}]}, + "sections": {"sec1": {"elements": {"x": 1}}}, + } + result = _normalize_schema(schema) + assert result["header"]["parsing_order"] == [{"s": ["sec1"]}] + + +def 
test_read_schema_with_imodel(): + result = read_schema(imodel="icoads") + assert isinstance(result, dict) + assert "header" in result + assert "sections" in result + assert "name" in result + + +def test_read_schema_with_ext_file(tmp_schema_file): + file_path, _ = tmp_schema_file + + result = read_schema(ext_schema_file=str(file_path)) + assert isinstance(result, dict) + assert "sections" in result + assert result["sections"]["sec1"]["elements"] == {"a": 1, "b": 2} + assert result["name"] == [file_path] + + +def test_read_schema_with_ext_path(tmp_schema_file): + file_path, _ = tmp_schema_file + result = read_schema(ext_schema_path=str(file_path.parent)) + assert isinstance(result, dict) + assert "sections" in result + assert result["sections"]["sec1"]["elements"] == {"a": 1, "b": 2} + assert result["name"] == [file_path] + + +def test_read_schema_requires_input(): + with pytest.raises(ValueError): + read_schema(imodel=None, ext_schema_path=None, ext_schema_file=None) diff --git a/tests/test_reader_utilities.py b/tests/test_reader_utilities.py new file mode 100755 index 00000000..f4a46639 --- /dev/null +++ b/tests/test_reader_utilities.py @@ -0,0 +1,272 @@ +from __future__ import annotations + +import pandas as pd +import pytest + +from io import StringIO +from pandas.io.parsers import TextFileReader +from pathlib import Path + +from cdm_reader_mapper.mdf_reader.utils.utilities import ( + as_list, + as_path, + join, + update_dtypes, + update_column_names, + update_column_labels, + read_csv, + convert_dtypes, + validate_arg, + _adjust_dtype, + convert_str_boolean, + _remove_boolean_values, + remove_boolean_values, + process_textfilereader, +) + + +def make_parser(text: str, chunksize: int = 1) -> pd.io.parsers.TextFileReader: + """Helper: create a TextFileReader similar to user code.""" + buffer = StringIO(text) + return pd.read_csv(buffer, chunksize=chunksize) + + +@pytest.fixture +def sample_reader() -> pd.io.parsers.TextFileReader: + buffer = 
StringIO("A,B\n1,2\n3,4\n") + return pd.read_csv(buffer, chunksize=1) + + +@pytest.fixture +def tmp_csv_file(tmp_path): + data = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + file_path = tmp_path / "test.csv" + data.to_csv(file_path, index=False) + return file_path, data + + +def sample_func(df): + df_new = df * 2 + extra = {"note": "first_chunk_only"} + return df_new, extra + + +def sample_func_only_df(df): + return df * 2 + + +@pytest.mark.parametrize( + "input_value, expected", + [ + (None, None), + ("hello", ["hello"]), + ([1, 2, 3], [1, 2, 3]), + ((4, 5), [4, 5]), + ], +) +def test_as_list(input_value, expected): + result = as_list(input_value) + assert result == expected + + +def test_as_list_with_set_order_warning(): + s = {"a", "b"} # sets are unordered + result = as_list(s) + assert set(result) == s + + +def test_as_path_with_string(tmp_path): + p = tmp_path / "file.txt" + result = as_path(str(p), "test_param") + assert isinstance(result, Path) + assert result == p + + +def test_as_path_with_pathlike(tmp_path): + p = tmp_path / "file.txt" + result = as_path(p, "test_param") + assert isinstance(result, Path) + assert result == p + + +def test_as_path_with_invalid_type(): + with pytest.raises(TypeError): + as_path(123, "number_param") + + +@pytest.mark.parametrize( + "input_col, expected", + [ + ("single", "single"), + (["a", "b"], "a:b"), + (("x", "y", "z"), "x:y:z"), + ([1, 2], "1:2"), + (42, "42"), + ], +) +def test_join(input_col, expected): + assert join(input_col) == expected + + +def test_update_dtypes(): + dtypes = {"A": int, "B": float, "C": str} + columns = ["A", "C"] + expected = {"A": int, "C": str} + assert update_dtypes(dtypes, columns) == expected + + +def test_update_dtypes_with_empty_columns(): + dtypes = {"A": int, "B": float} + assert update_dtypes(dtypes, []) == {} + + +def test_update_column_names_dict(): + dtypes = {"A": int, "B": float} + updated = update_column_names(dtypes.copy(), "A", "X") + assert updated == {"X": int, "B": float} + + 
+def test_update_column_names_no_change(): + dtypes = {"A": int} + updated = update_column_names(dtypes.copy(), "B", "Y") + assert updated == {"A": int} + + +def test_update_column_names_string_input(): + value = "some string" + assert update_column_names(value, "A", "X") == "some string" + + +def test_update_column_labels_simple_strings(): + cols = ["A", "B", "C"] + result = update_column_labels(cols) + assert isinstance(result, pd.Index) + assert list(result) == ["A", "B", "C"] + + +def test_update_column_labels_colon_strings(): + cols = ["A:B", "C:D"] + result = update_column_labels(cols) + assert isinstance(result, pd.MultiIndex) + assert result.tolist() == [("A", "B"), ("C", "D")] + + +def test_update_column_labels_tuple_strings(): + cols = ["('A','B')", "('C','D')"] + result = update_column_labels(cols) + assert isinstance(result, pd.MultiIndex) + assert result.tolist() == [("A", "B"), ("C", "D")] + + +def test_update_column_labels_mixed(): + cols = ["A", "('B','C')", "D:E"] + result = update_column_labels(cols) + assert isinstance(result, pd.Index) # Not all tuples + assert result.tolist() == ["A", ("B", "C"), ("D", "E")] + + +def test_read_csv_file_exists(tmp_csv_file): + file_path, data = tmp_csv_file + df = read_csv(file_path) + pd.testing.assert_frame_equal(df, data) + + +def test_read_csv_file_missing(tmp_path): + missing_file = tmp_path / "missing.csv" + df = read_csv(missing_file) + assert df.empty + + +def test_read_csv_with_col_subset(tmp_csv_file): + file_path, _ = tmp_csv_file + df = read_csv(file_path, col_subset=["B"]) + assert list(df.columns) == ["B"] + + +def test_convert_dtypes_basic(): + dtypes = {"A": "int", "B": "datetime", "C": "float"} + updated, dates = convert_dtypes(dtypes) + assert updated["B"] == "object" + assert dates == ["B"] + + +def test_validate_arg_correct_type(): + assert validate_arg("x", 5, int) + + +def test_validate_arg_none(): + assert validate_arg("x", None, int) + + +def test_validate_arg_wrong_type(): + with 
pytest.raises(ValueError): + validate_arg("x", "hello", int) + + +def test_convert_str_boolean(): + assert convert_str_boolean("True") is True + assert convert_str_boolean("False") is False + assert convert_str_boolean("hello") == "hello" + assert convert_str_boolean(1) == 1 + + +def test_remove_boolean_values_helper(): + assert _remove_boolean_values("True") is None + assert _remove_boolean_values("False") is None + assert _remove_boolean_values(True) is None + assert _remove_boolean_values(False) is None + assert _remove_boolean_values("abc") == "abc" + + +def test_adjust_dtype(): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + dtype = {"A": "int", "B": "float", "C": "str"} + adjusted = _adjust_dtype(dtype, df) + assert adjusted == {"A": "int", "B": "float"} + assert _adjust_dtype("str", df) == "str" + + +def test_remove_boolean_values(): + df = pd.DataFrame({"A": ["True", "False", "hello"], "B": [1, 2, 3]}) + dtypes = {"A": "object", "B": "int"} + result = remove_boolean_values(df, dtypes) + assert result.loc[0, "A"] is None + assert result.loc[1, "A"] is None + assert result.loc[2, "A"] == "hello" + assert result["B"].dtype.name == "int64" + + +def test_process_textfilereader(sample_reader): + reader_out, extra_out = process_textfilereader( + sample_reader, sample_func, read_kwargs={"chunksize": 1} + ) + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + assert extra_out == {"note": "first_chunk_only"} + + +def test_process_textfilereader_only_df(sample_reader): + (reader_out,) = process_textfilereader( + sample_reader, sample_func_only_df, read_kwargs={"chunksize": 1} + ) + # only the reader is returned; sample_func_only_df yields no extra metadata + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + + +def test_process_textfilereader_makecopy_flag(sample_reader): + 
reader_out, extra_out = process_textfilereader( + sample_reader, sample_func, makecopy=True, read_kwargs={"chunksize": 1} + ) + assert isinstance(reader_out, TextFileReader) + df_out = reader_out.read() + assert df_out.shape == (2, 2) + assert df_out["A"].iloc[0] == 2 + assert df_out["B"].iloc[1] == 8 + assert extra_out == {"note": "first_chunk_only"} diff --git a/tests/test_reader_validator.py b/tests/test_reader_validator.py new file mode 100755 index 00000000..d7c17b7a --- /dev/null +++ b/tests/test_reader_validator.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from cdm_reader_mapper.mdf_reader.utils.validators import ( + _is_true, + _is_false, + validate_datetime, + validate_numeric, + validate_str, + validate_codes, + validate, +) + + +@pytest.fixture +def sample_series(): + return pd.Series(["20200101", "bad", None, "20221231"], dtype="object") + + +@pytest.fixture +def numeric_series(): + return pd.Series(["1", "2", "3", "False", "bad"], dtype="object") + + +@pytest.fixture +def code_series(): + return pd.Series(["A", "B", "C", None, "X"], dtype="object") + + +def test_is_true_false(): + assert _is_true(True) is True + assert _is_true(False) is False + assert _is_false(False) is True + assert _is_false(True) is False + assert _is_true(1) is False + assert _is_false(0) is False + + +def test_validate_datetime(sample_series): + result = validate_datetime(sample_series) + expected = pd.Series([True, False, True, True]) + pd.testing.assert_series_equal(result, expected) + + +def test_validate_numeric(numeric_series): + result = validate_numeric(numeric_series, 1, 3) + expected = pd.Series([True, True, True, False, False]) + pd.testing.assert_series_equal(result, expected) + + +def test_validate_str(numeric_series): + result = validate_str(numeric_series) + expected = pd.Series([True] * len(numeric_series), dtype="boolean") + pd.testing.assert_series_equal(result, expected) + + +def 
test_validate_codes(code_series): + codes = ["A", "B", "C"] + result = validate_codes(code_series, codes, "str") + expected = pd.Series([True, True, True, True, False]) + pd.testing.assert_series_equal(result, expected) + + +@pytest.fixture +def sample_df(): + return pd.DataFrame( + { + "NUM": ["1", "2", "bad", np.nan, "5"], + "KEY": ["0", "1", "2", "9", np.nan], + "STR": ["foo", "bar", "baz", "", np.nan], + "DATE": ["20220101", "20220202", "bad_date", np.nan, "20220505"], + "BOOL": ["True", "False", "TRUE", "FALSE", None], + } + ) + + +@pytest.fixture +def attributes(): + return { + "NUM": {"column_type": "int", "valid_min": 1, "valid_max": 5}, + "KEY": {"column_type": "key", "codetable": "ICOADS.C0.A"}, + "STR": {"column_type": "str"}, + "DATE": {"column_type": "datetime"}, + "BOOL": {"column_type": "int"}, # treat boolean literals as numeric override + } + + +def test_validate_all_columns(sample_df, attributes): + mask = validate( + sample_df, imodel="icoads", ext_table_path=None, attributes=attributes + ) + + expected_num = [True, True, False, True, True] + assert mask["NUM"].tolist() == expected_num + + expected_key = [True, True, True, False, True] + assert mask["KEY"].tolist() == expected_key + + expected_str = [True, True, True, True, True] + assert mask["STR"].tolist() == expected_str + + expected_date = [True, True, False, True, True] + assert mask["DATE"].tolist() == expected_date + + expected_bool = [True, False, False, False, True] + # BOOL uses the "int" column_type override declared in the attributes fixture + assert mask["BOOL"].tolist() == expected_bool