From 9ea326d2715b18aa42cf7d896b6a85911a77a212 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 26 Oct 2022 12:45:35 -0700
Subject: [PATCH 01/50] Re-add defects

---
 src/atomate2/cp2k/builders/defect.py |   0
 src/atomate2/cp2k/flows/defect.py    | 245 +++++++++++++++++++
 src/atomate2/cp2k/jobs/defect.py     | 119 +++++++++
 src/atomate2/cp2k/schemas/defect.py  | 345 +++++++++++++++++++++++++++
 src/atomate2/cp2k/sets/defect.py     |  55 +++++
 5 files changed, 764 insertions(+)
 create mode 100644 src/atomate2/cp2k/builders/defect.py
 create mode 100644 src/atomate2/cp2k/flows/defect.py
 create mode 100644 src/atomate2/cp2k/jobs/defect.py
 create mode 100644 src/atomate2/cp2k/schemas/defect.py
 create mode 100644 src/atomate2/cp2k/sets/defect.py

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
new file mode 100644
index 0000000000..7f48d318a2
--- /dev/null
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -0,0 +1,245 @@
+
+"""Flows used in the calculation of defect properties."""
+
+from __future__ import annotations
+from copy import deepcopy
+
+import logging
+from dataclasses import dataclass, field
+from typing import Iterable, Literal, Mapping
+from pathlib import Path
+from numpy.typing import NDArray
+import itertools
+
+from jobflow import Flow, Job, Maker, OutputReference, job
+from pymatgen.core.structure import Structure
+from pymatgen.io.common import VolumetricData
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatgen.analysis.defects.core import Defect
+from pymatgen.analysis.defects.thermo import DefectEntry
+from pymatgen.analysis.defects.supercells import get_sc_fromstruct
+
+from atomate2.cp2k.jobs.base import BaseCp2kMaker 
+from atomate2.cp2k.jobs.core import StaticMaker, HybridStaticMaker, RelaxMaker, HybridRelaxMaker, CellOptMaker, HybridCellOptMaker
+
+from atomate2.cp2k.schemas.defect import DefectDoc
+from atomate2.cp2k.sets.core import (
+    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator
+)
+
+from atomate2.cp2k.sets.defect import (
+    DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
+    DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
+)
+from atomate2.cp2k.jobs.defect import (
+    BaseDefectMaker, DefectStaticMaker, DefectRelaxMaker, DefectCellOptMaker,
+    DefectHybridStaticMaker, DefectHybridRelaxMaker, DefectHybridCellOptMaker
+)
+
+from atomate2.cp2k.flows.core import HybridStaticFlowMaker, HybridRelaxFlowMaker, HybridCellOptFlowMaker
+
+logger = logging.getLogger(__name__)
+
+# TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
+@dataclass
+class FormationEnergyMaker(Maker):
+    """
+    Run a collection of defect jobs and (possibly) the bulk supercell
+    for determination of defect formation energies.
+
+    Parameters
+    ----------
+    name: This flow's name, e.g. "defect formation energy".
+    run_bulk: run the bulk supercell as a static calculation ("static" or
+        True) or a cell optimization ("relax"). With False, `make` raises.
+    hybrid_functional: If provided, this activates hybrid version of the
+        workflow. Provide functional as a parameter that the input set
+        can recognize. e.g. "PBE0" or "HSE06"
+    initialize_with_pbe: If hybrid functional is provided, this enables
+        the use of a static PBE run before the hybrid calc to provide a
+        starting guess for CP2K HF module.
+    supercell_matrix: If provided, the defect supercell will be created
+        by this 3x3 matrix. Else other parameters will be used.
+    max_atoms: Maximum number of atoms allowed in the supercell.
+    min_atoms: Minimum number of atoms allowed in the supercell.
+    min_length: Minimum length of the smallest supercell lattice vector.
+    force_diagonal: If True, return a transformation with a diagonal transformation matrix.
+    """
+
+    name: str = "defect formation energy"
+    run_bulk: Literal["static", "relax"] | bool = field(default="static")
+    hybrid_functional: str | None = field(default=None)
+    initialize_with_pbe: bool = field(default=True)
+
+    supercell_matrix: NDArray = field(default=None)
+    min_atoms: int = field(default=80)
+    max_atoms: int = field(default=240)
+    min_length: int = field(default=10)
+    force_diagonal: bool = field(default=False)
+
+    def __post_init__(self):
+        """Create the bulk maker (when a bulk run is requested) and the defect maker."""
+        if self.run_bulk:
+            if self.run_bulk == 'relax':
+                if self.hybrid_functional:
+                    self.bulk_maker = HybridCellOptFlowMaker(
+                        initialize_with_pbe=self.initialize_with_pbe,
+                        hybrid_functional=self.hybrid_functional,
+                        hybrid_maker=HybridCellOptMaker(
+                            input_set_generator=DefectHybridCellOptSetGenerator()
+                        )
+                        )
+                else:
+                    self.bulk_maker = CellOptMaker(
+                        input_set_generator=DefectCellOptSetGenerator()
+                        )
+            elif self.run_bulk in ("static", True):  # a bare True means the default static run
+                if self.hybrid_functional:
+                    self.bulk_maker = HybridStaticFlowMaker(
+                        hybrid_functional=self.hybrid_functional,
+                        hybrid_maker=HybridStaticMaker(
+                            input_set_generator=DefectHybridStaticSetGenerator()
+                        )
+                        )
+                else:
+                    self.bulk_maker = StaticMaker(
+                        input_set_generator=DefectStaticSetGenerator()
+                        )
+
+        if self.hybrid_functional:
+            self.def_maker = HybridRelaxFlowMaker(
+                hybrid_functional=self.hybrid_functional,
+                initialize_with_pbe=self.initialize_with_pbe,
+                initialize_maker=DefectStaticMaker(),
+                hybrid_maker=HybridRelaxMaker()
+            )
+        else:
+            self.def_maker = DefectRelaxMaker()
+
+        self.def_maker.supercell_matrix = self.supercell_matrix
+        self.def_maker.max_atoms = self.max_atoms
+        self.def_maker.min_atoms = self.min_atoms
+        self.def_maker.min_length = self.min_length
+        self.def_maker.force_diagonal = self.force_diagonal
+
+    def make(
+        self, defects: Iterable[Defect],
+        run_all_charges: bool = False,
+        dielectric: NDArray | int | float | None = None,
+        prev_cp2k_dir: str | Path | None = None):
+        """Make a flow to run multiple defects in order to calculate their formation
+        energy diagram.
+
+        Parameters
+        ----------
+        defects: list[Defect]
+            List of defects objects to calculate the formation energy diagram for.
+        run_all_charges: bool
+            If True, run every state of defect.get_charge_states(); else only charge 0.
+        dielectric: NDArray | int | float | None
+            Handed to collect_defect_outputs together with the job outputs.
+        prev_cp2k_dir: str | Path | None
+            If provided, this acts as prev_dir for the bulk calculation only
+        Returns
+        -------
+        flow: Flow
+            The workflow to calculate the formation energy diagram.
+        Raises
+        ------
+        ValueError
+            If run_bulk is falsy, since the collection job needs the bulk output.
+        """
+        if not self.run_bulk:  # used to fail further down with an unbound bulk_job
+            raise ValueError("run_bulk=False is not supported: a bulk reference is required.")
+        jobs, defects = [], list(defects)  # materialize: defects is iterated three times
+        defect_outputs = {defect.name: {} for defect in defects} # TODO DEFECT NAMES ARE NOT UNIQUE HASHES
+        bulk_structure = ensure_defects_same_structure(defects)
+
+        sc_mat = self.supercell_matrix  # compare to None: array truthiness is ambiguous
+        if sc_mat is None:
+            sc_mat = get_sc_fromstruct(
+                bulk_structure, self.min_atoms, self.max_atoms,
+                self.min_length, self.force_diagonal)
+
+        bulk_job = self.bulk_maker.make(bulk_structure * sc_mat, prev_cp2k_dir=prev_cp2k_dir)
+        jobs.append(bulk_job)
+
+        for defect in defects:
+            chgs = defect.get_charge_states() if run_all_charges else [0]
+            for charge in chgs:
+                # write some provenances data in info.json file
+                info = {"defect": deepcopy(defect), "supercell_matrix": sc_mat}
+                defect_job = self.def_maker.make(defect=deepcopy(defect), charge=charge)
+                defect_job.update_maker_kwargs(
+                    {"_set": {"write_additional_data->info:json": info}}, dict_mod=True
+                )
+                jobs.append(defect_job)
+                defect_outputs[defect.name][int(charge)] = (defect, defect_job.output)
+
+        jobs.append(collect_defect_outputs(
+            defect_outputs=defect_outputs, bulk_output=bulk_job.output, dielectric=dielectric))
+        return Flow(jobs=jobs, name=self.name, output=jobs[-1].output)
+
+# TODO this is totally code agnostic and should be in common
+@job
+def collect_defect_outputs(
+    defect_outputs: Mapping[str, Mapping[int, OutputReference]], bulk_output: OutputReference, dielectric: NDArray | int | float | None
+) -> dict:
+    """Collect all the outputs from the defect calculations.
+    This job will combine the structure and entry fields to create a
+    ComputedStructureEntry object.
+    Parameters
+    ----------
+    defect_outputs:
+        For each defect name, a mapping of charge -> (defect, defect job output).
+    bulk_output:
+        The output of the bulk supercell calculation.
+    dielectric:
+        The dielectric constant used to construct the formation energy diagram.
+    """
+    outputs = {"results": {}}
+    if dielectric is None:  # `not dielectric` is ambiguous when an array is given
+        logger.warning("Dielectric constant not provided. Defect formation energies will be uncorrected.")
+    for defect_name, defects_with_charges in defect_outputs.items():
+        defect, defect_entries, fnv_plots = None, [], {}  # defect stays None if no charges ran
+        for charge, defect_and_output in defects_with_charges.items():
+            defect, output_with_charge = defect_and_output
+            logger.info(f"Processing {defect.name} with charge state={charge}")
+            # NOTE(review): sc_entry is built from the bulk structure/energy, not the defect run.
+            defect_entry = DefectEntry(
+                defect=defect,
+                charge_state=int(charge),
+                sc_entry=ComputedStructureEntry(structure=bulk_output.structure, energy=bulk_output.output.energy)
+            )
+            defect_entries.append(defect_entry)
+            plot_data = defect_entry.get_freysoldt_correction(
+                defect_locpot=VolumetricData.from_dict(output_with_charge.cp2k_objects['v_hartree']),
+                bulk_locpot=VolumetricData.from_dict(bulk_output.cp2k_objects['v_hartree']),
+                dielectric=dielectric
+                )
+            fnv_plots[int(charge)] = plot_data
+        outputs["results"][defect_name] = dict(
+            defect=defect, defect_entries=defect_entries, fnv_plots=fnv_plots
+        )
+    return outputs
+
+#TODO should be in common
+def ensure_defects_same_structure(defects: Iterable[Defect]):
+    """Return the host structure shared by all defects (None if there are none).
+    Parameters
+    ----------
+    defects
+        The defects whose host structures are compared.
+    Raises
+    ------
+    ValueError
+        If the defects do not all have the same host structure.
+    """
+    host = None
+    for current in (d.structure for d in defects):
+        if host is not None and host != current:
+            raise ValueError("All defects must have the same host structure.")
+        if host is None:
+            host = current
+    return host
+
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
new file mode 100644
index 0000000000..aa0fc56b3f
--- /dev/null
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -0,0 +1,119 @@
+"""Jobs for defect calculations."""
+
+from __future__ import annotations
+
+import logging
+from pathlib import Path
+from dataclasses import dataclass, field
+from copy import deepcopy
+from tkinter import W
+from numpy.typing import NDArray
+
+from pymatgen.analysis.defects.core import Defect, Vacancy
+from atomate2.cp2k.sets.base import Cp2kInputGenerator
+from atomate2.cp2k.sets.defect import (
+    DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator, 
+    DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
+)
+from atomate2.cp2k.jobs.base import BaseCp2kMaker, cp2k_job
+from atomate2.cp2k.jobs.core import HybridStaticMaker, HybridRelaxMaker, HybridCellOptMaker
+
+logger = logging.getLogger(__name__)
+
+DEFECT_TASK_DOC = {
+    "average_v_hartree": True,
+    "store_volumetric_data": ("v_hartree",)
+}
+
+@dataclass
+class BaseDefectMaker(BaseCp2kMaker):
+    """Base maker for defect jobs: builds the defect supercell, then runs the CP2K job."""
+    task_document_kwargs: dict = field(default_factory=lambda: dict(DEFECT_TASK_DOC))  # per-instance copy
+    supercell_matrix: NDArray = field(default=None)
+    min_atoms: int = field(default=80)
+    max_atoms: int = field(default=240)
+    min_length: int = field(default=10)
+    force_diagonal: bool = field(default=False)
+
+    @cp2k_job
+    def make(self, defect: Defect, charge: int = 0, prev_cp2k_dir: str | Path | None = None):
+        """Run the job on the supercell of ``defect`` with ``charge``; a Vacancy becomes a GhostVacancy."""
+        if isinstance(defect, Vacancy):
+            defect = GhostVacancy(
+                structure=defect.structure, site=defect.site, multiplicity=defect.multiplicity,
+                oxi_state=defect.oxi_state, symprec=defect.symprec, angle_tolerance=defect.angle_tolerance,
+            )
+        structure = defect.get_supercell_structure(
+            sc_mat=self.supercell_matrix,
+            dummy_species=None,
+            min_atoms=self.min_atoms,
+            max_atoms=self.max_atoms,
+            min_length=self.min_length,
+            force_diagonal=self.force_diagonal,
+        )
+        structure.set_charge(charge)
+        return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
+
+@dataclass
+class DefectStaticMaker(BaseDefectMaker):
+    """Maker to create a static job for point defects."""
+    name: str = "defect static"
+    input_set_generator: DefectSetGenerator = field(
+        default_factory=DefectStaticSetGenerator
+        )
+
+@dataclass
+class DefectRelaxMaker(BaseDefectMaker):
+    """
+    Maker to create a relaxation job for point defects.
+
+    Configures a PerturbStructureTransformation (distance 0.01) and inherits the
+    BaseDefectMaker task document settings that keep the hartree potential.
+    """
+
+    name: str = "defect relax"
+    input_set_generator: Cp2kInputGenerator = field(default_factory=DefectRelaxSetGenerator)
+    transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
+    transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
+
+@dataclass
+class DefectCellOptMaker(BaseDefectMaker):
+    """
+    Maker to create a cell optimization job for point defects.
+
+    Configures a PerturbStructureTransformation (distance 0.01) and inherits the
+    BaseDefectMaker task document settings that keep the hartree potential.
+    """
+
+    name: str = "defect cell opt"
+    input_set_generator: Cp2kInputGenerator = field(default_factory=DefectCellOptSetGenerator)
+    transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
+    transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
+
+@dataclass
+class DefectHybridStaticMaker(DefectStaticMaker, HybridStaticMaker):
+    """Defect static maker combined with HybridStaticMaker; uses the hybrid static defect input set."""
+    name: str = "defect hybrid static"
+    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridStaticSetGenerator)
+
+@dataclass
+class DefectHybridRelaxMaker(DefectRelaxMaker, HybridRelaxMaker):
+    """Defect relax maker combined with HybridRelaxMaker; uses the hybrid relax defect input set."""
+    name: str = "defect hybrid relax"
+    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridRelaxSetGenerator)
+
+@dataclass
+class DefectHybridCellOptMaker(DefectCellOptMaker, HybridCellOptMaker):
+    """Defect cell-opt maker combined with HybridCellOptMaker; uses the hybrid cell-opt defect input set."""
+    name: str = "defect hybrid cell opt"
+    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridCellOptSetGenerator)
+
+class GhostVacancy(Vacancy):
+    """Vacancy that keeps the vacant site, flagged as a ghost, to deal with basis set superposition error."""
+    @property
+    def defect_structure(self):
+        """Return a copy of the host structure carrying a boolean "ghost" site property."""
+        marked = self.structure.copy()
+        ghost_flags = [idx == self.defect_site_index for idx in range(len(marked))]
+        marked.add_site_property("ghost", ghost_flags)
+        return marked
\ No newline at end of file
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
new file mode 100644
index 0000000000..9f58878733
--- /dev/null
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -0,0 +1,345 @@
+from datetime import datetime
+from tokenize import group
+from typing import ClassVar, Dict, Tuple, Mapping, List
+from pydantic import BaseModel, Field
+from pydantic import validator
+from itertools import groupby
+
+from monty.json import MontyDecoder 
+
+from pymatgen.core import Structure
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatgen.analysis.defects.core import Defect
+from pymatgen.analysis.defects.corrections import get_correction
+from pymatgen.analysis.defects.thermo import DefectEntry, DefectSiteFinder
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from atomate2 import SETTINGS
+
+from atomate2.common.schemas.structure import StructureMetadata
+from atomate2.cp2k.schemas.calc_types.utils import run_type, task_type
+from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
+from atomate2.cp2k.schemas.task import TaskDocument
+
+class DefectDoc(StructureMetadata):
+    """
+    A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
+    This document can contain an arbitrary number of defect entries, originating from
+    pairs (defect and bulk) of calculations. This document provides access to the "best"
+    calculation of each run_type.
+    """
+
+    class Config:
+        arbitrary_types_allowed = True
+
+    property_name: ClassVar[str] = "defect"
+
+    defect: Defect = Field(None, description="Pymatgen defect object for this defect doc")
+
+    name: str = Field(None, description="Name of this defect as generated by the defect object")
+
+    material_id: int = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
+
+    task_ids: List[int] = Field(
+        None, description="All task ids used in creating this defect doc."
+    )
+
+    calc_types: Mapping[int, CalcType] = Field(  # type: ignore
+        None,
+        description="Calculation types for all the calculations that make up this material",
+    )
+    task_types: Mapping[int, TaskType] = Field(
+        None,
+        description="Task types for all the calculations that make up this material",
+    )
+    run_types: Mapping[int, RunType] = Field(
+        None,
+        description="Run types for all the calculations that make up this material",
+    )
+
+    tasks: Mapping[RunType, Tuple[TaskDocument, TaskDocument]] = Field(
+        None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
+    )
+
+    entries: Mapping[RunType, DefectEntry] = Field(
+        None, description="Dictionary for tracking entries for CP2K calculations"
+    )
+
+    last_updated: datetime = Field(
+        description="Timestamp for when this document was last updated",
+        default_factory=datetime.utcnow,
+    )
+
+    created_at: datetime = Field(
+        description="Timestamp for when this material document was first created",
+        default_factory=datetime.utcnow,
+    )
+
+    metadata: Dict = Field(description="Metadata for this defect")
+
+    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
+    @validator("entries", pre=True)
+    def decode(cls, entries):
+        """Decode every dict-valued entry in place with MontyDecoder and return the mapping."""
+        for run_key in [k for k, v in entries.items() if isinstance(v, dict)]:
+            entries[run_key] = MontyDecoder().process_decoded(dict(entries[run_key]))
+        return entries
+
+    def update(self, defect_task, bulk_task, dielectric, query='defect'):
+        """Fold one (defect, bulk) task pair into this document.
+
+        The pair replaces the stored entry of its run type when that run type is new
+        or when the new defect task has more sites than the stored one. Nothing is
+        done if the defect task id is already listed in ``task_ids``.
+        """
+        defect_task_doc = TaskDocument(**defect_task)
+        bulk_task_doc = TaskDocument(**bulk_task)
+
+        rt = defect_task_doc.run_type
+        tt = defect_task_doc.task_type
+        ct = defect_task_doc.calc_type
+
+        # Metadata
+        last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+        created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
+
+        if defect_task_doc.task_id in self.task_ids:
+            return
+
+        self.last_updated = last_updated
+        self.created_at = created_at
+        self.task_ids.append(defect_task_doc.task_id)
+
+        def _compare(new, old):
+            # TODO return kpoint density
+            return new['nsites'] > old.nsites
+
+        if rt not in self.tasks or _compare(defect_task, self.tasks[rt][0]):
+            self.run_types.update({defect_task_doc.task_id: rt})
+            self.task_types.update({defect_task_doc.task_id: tt})
+            self.calc_types.update({defect_task_doc.task_id: ct})
+            entry = self.__class__.get_defect_entry_from_tasks(
+                defect_task=defect_task, bulk_task=bulk_task,
+                dielectric=dielectric, query=query,
+            )
+            self.entries[rt] = entry
+            self.tasks[rt] = (defect_task_doc, bulk_task_doc)
+
+    def update_all(self, tasks, query='defect'):
+        for defect_item, bulk_item, eps in tasks:  # (defect task, bulk task, dielectric) triples
+            self.update(defect_task=defect_item, bulk_task=bulk_item, dielectric=eps, query=query)
+
+    @classmethod
+    def from_tasks(cls, tasks: List, query='defect', material_id=None):
+        """
+        The standard way to create this document.
+        Args:
+            tasks: A list of (defect task, bulk task, dielectric) triples used to build the entries.
+            query: How to retrieve the defect object stored in the task.
+            material_id: Bulk material id; if falsy it is read from best_entry.parameters['material_id'].
+        """
+        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task, dielectric in tasks]
+
+        # Metadata
+        last_updated = datetime.now() or max(task.last_updated for task in task_group)  # NOTE(review): `or` never reaches max()
+        created_at = datetime.now() or min(task.completed_at for task in task_group)  # NOTE(review): `or` never reaches min()
+        task_ids = {task.task_id for task in task_group}
+
+        deprecated_tasks = list(
+            {task.task_id for task in task_group if not task.is_valid}
+        )
+
+        run_types = {task.task_id: task.run_type for task in task_group}
+        task_types = {task.task_id: task.task_type for task in task_group}
+        calc_types = {task.task_id: task.calc_type for task in task_group}
+
+        def _run_type(x):
+            return run_type(x[0]['input']['dft']).value
+
+        def _task_type(x):  # NOTE(review): not used anywhere in this method
+            return task_type(x[0]['input']['dft']).value
+
+        def _sort(x):
+            # TODO return kpoint density, currently just does supercell size
+            return -x[0]['nsites'], x[0]['output']['energy']
+
+        entries = {}
+        final_tasks = {}
+        metadata = {}
+        for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
+            sorted_tasks = sorted(tasks_for_runtype, key=_sort)  # most sites first, then lowest energy
+            ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
+            best_entry = ents[0]
+            best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
+            # NOTE(review): attribute access below assumes entry objects, while
+            # DefectDoc.get_defect_entry_from_tasks returns defect_entry.as_dict() -- confirm.
+            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
+            best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task)
+            entries[best_defect_task.run_type] = best_entry
+            final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
+        # best_entry used below is the one left over from the last run-type group
+        data = {
+                'entries': entries,
+                'run_types': run_types,
+                'task_types': task_types,
+                'calc_types': calc_types,
+                'last_updated': last_updated,
+                'created_at': created_at,
+                'task_ids': task_ids,
+                'deprecated_tasks': deprecated_tasks,
+                'tasks': final_tasks,
+                'material_id': material_id if material_id else best_entry.parameters['material_id'],
+                'entry_ids': {rt: entries[rt].entry_id for rt in entries},
+                'defect': best_entry.defect,
+                'name': best_entry.defect.name,
+                'metadata': metadata,
+        }
+        prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
+        data.update(StructureMetadata.from_structure(prim).dict())
+        return cls(**data)
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+        """
+        Extract a defect entry from a single pair (defect and bulk) of tasks.
+
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            dielectric: Dielectric doc if the defect is charged. If not present, no dielectric
+                corrections will be performed, even if the defect is charged.
+            query: Mongo-style query to retrieve the defect object from the defect task
+        """
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric is not None:
+            parameters['dielectric'] = dielectric
+        # NOTE(review): the parameters carry no charge, so it is read from the defect structure
+        # (the defect maker calls set_charge on it) -- confirm the charge survives task parsing.
+        charge_state = int(parameters['final_defect_structure'].charge)
+        corrections = {}
+        if charge_state and 'dielectric' in parameters:  # no dielectric -> left uncorrected
+            corrections, _ = get_correction(
+                q=charge_state, dielectric=parameters['dielectric'],
+                defect_locpot=parameters['defect_v_hartree'],
+                bulk_locpot=parameters['bulk_v_hartree'],
+                defect_frac_coords=parameters['defect_frac_sc_coords'],
+                )
+
+        sc_entry = ComputedStructureEntry(
+            structure=parameters['final_defect_structure'],
+            energy=parameters['defect_energy'] - parameters['bulk_energy']
+            )
+
+        defect_entry = DefectEntry(
+            defect=cls.get_defect_from_task(query=query, task=defect_task),
+            charge_state=charge_state,
+            sc_entry=sc_entry,
+            sc_defect_frac_coords=parameters['defect_frac_sc_coords'],
+            corrections=corrections,
+        )
+        return defect_entry.as_dict()
+
+    @classmethod
+    def get_defect_from_task(cls, query, task):
+        """
+        Follow a dotted Mongo-style path into a task and decode the defect dict found there.
+        """
+        raw_defect = unpack(query.split('.'), task)
+        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
+        return MontyDecoder().process_decoded({k: raw_defect[k] for k in raw_defect if k in needed_keys})
+
+    @classmethod
+    def get_parameters_from_tasks(cls, defect_task, bulk_task):
+        """
+        Get parameters necessary to create a defect entry from defect and bulk task dicts
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+        """
+        # Keep the raw dicts under their own names: they are indexed by key below.
+        defect_doc = TaskDocument(**defect_task)
+        bulk_doc = TaskDocument(**bulk_task)
+
+        final_defect_structure = defect_doc.structure
+        final_bulk_structure = bulk_doc.structure
+
+        # The "ghost" site property may be missing (get() gives None), so fall back to [].
+        ghost_flags = final_defect_structure.site_properties.get("ghost") or []
+        ghost = [index for index, prop in enumerate(ghost_flags) if prop]
+        if ghost:
+            defect_frac_sc_coords = final_defect_structure[ghost[0]].frac_coords
+        else:
+            defect_frac_sc_coords = DefectSiteFinder(SETTINGS.SYMPREC).get_defect_fpos(defect_structure=final_defect_structure, base_structure=final_bulk_structure)
+
+        return {
+            'defect_energy': defect_task['output']['energy'],
+            'bulk_energy': bulk_task['output']['energy'],
+            'final_defect_structure': final_defect_structure,
+            'vbm': bulk_task['output']['vbm'],
+            'cbm': bulk_task['output']['cbm'],
+            'defect_frac_sc_coords': defect_frac_sc_coords,
+            'defect_v_hartree': defect_doc.cp2k_objects['v_hartree'], # TODO CP2K spec name
+            'bulk_v_hartree': bulk_doc.cp2k_objects['v_hartree'], # TODO CP2K spec name
+        }
+
+
+# TODO Some of this should be done by DefectCompatibility,
+# but it's not clear how to do that since 2d materials 
+# are not tagged in any particular way to allow defect compatibility
+# to decide which correction to apply
+class DefectDoc2d(DefectDoc):
+    """
+    DefectDoc subclass for 2D defects
+    """
+
+    @classmethod
+    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+        """
+        Get defect entry from defect and bulk tasks.
+        Args:
+            defect_task: task dict for the defect calculation
+            bulk_task: task dict for the bulk calculation
+            dielectric: dielectric tensor for the defect calculation
+            query: query string for defect entry
+        """
+        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        if dielectric is not None:  # a tensor given as an array has no single truth value
+            eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2
+            eps_perp = dielectric[2][2]
+            parameters['dielectric'] = (eps_parallel - 1) / (1 - 1/eps_perp)
+
+        defect_entry = DefectEntry(
+            cls.get_defect_from_task(query=query, task=defect_task),
+            uncorrected_energy=parameters.pop('defect_energy') - parameters.pop('bulk_energy'),
+            parameters=parameters,
+            entry_id=parameters.pop('entry_id')  # NOTE(review): get_parameters_from_tasks sets no 'entry_id'
+        )
+
+        DefectCompatibility().process_entry(defect_entry, perform_corrections=False)  # NOTE(review): name not imported in this file
+        with ScratchDir('.'):  # NOTE(review): ScratchDir is not imported in this file
+            fc = FreysoldtCorrection2d(  # NOTE(review): FreysoldtCorrection2d is not imported in this file
+                    defect_entry.parameters.get('dielectric'),
+                    "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
+                )
+            lref = VolumetricData(  # NOTE(review): VolumetricData is not imported in this file
+                structure=Structure.from_dict(bulk_task['input']['structure']),
+                data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
+            )
+            ldef = VolumetricData(
+                structure=Structure.from_dict(defect_task['input']['structure']),
+                data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
+            )
+            lref.write_file("LOCPOT.ref")
+            ldef.write_file("LOCPOT.def")
+            ecorr = fc.get_correction(defect_entry)
+            defect_entry.corrections.update(ecorr)
+            defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
+
+        defect_entry_as_dict = defect_entry.as_dict()
+        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
+
+        return defect_entry  # NOTE(review): the dict built above is discarded; the entry object is returned
+
+def unpack(query, d):
+    """Follow the keys in ``query`` down into ``d``; list levels are indexed by int(key)."""
+    # Iterate rather than pop(0) so that the caller's query list is left unmodified.
+    for key in query or ():
+        d = d[int(key)] if isinstance(d, list) else d[key]
+    return d
\ No newline at end of file
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
new file mode 100644
index 0000000000..df3a2cbe4c
--- /dev/null
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -0,0 +1,55 @@
+"""Module defining defect input set generators."""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+
+from pymatgen.core import Structure
+
+from atomate2.cp2k.sets.base import Cp2kInputGenerator, multiple_input_updators
+from atomate2.cp2k.sets.core import (
+    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
+    HybridStaticSetGenerator, HybridRelaxSetGenerator, HybridCellOptSetGenerator
+) 
+logger = logging.getLogger(__name__)
+
+@dataclass
+class DefectSetGenerator(Cp2kInputGenerator):
+    """Base input set generator for defect calculations; its input updates request ``print_v_hartree``.
+    """
+
+    def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:
+        """Return ``{'print_v_hartree': True}``; ``structure`` and any extra arguments are not used.
+        """
+        return {'print_v_hartree': True}
+
+@dataclass
+@multiple_input_updators()
+class DefectStaticSetGenerator(DefectSetGenerator, StaticSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
+
+@dataclass
+@multiple_input_updators()
+class DefectRelaxSetGenerator(DefectSetGenerator, RelaxSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
+
+@dataclass
+@multiple_input_updators()
+class DefectCellOptSetGenerator(DefectSetGenerator, CellOptSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
+
+@dataclass
+@multiple_input_updators()
+class DefectHybridStaticSetGenerator(DefectSetGenerator, HybridStaticSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
+
+@dataclass
+@multiple_input_updators()
+class DefectHybridRelaxSetGenerator(DefectSetGenerator, HybridRelaxSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
+
+@dataclass
+@multiple_input_updators()
+class DefectHybridCellOptSetGenerator(DefectSetGenerator, HybridCellOptSetGenerator):
+    pass  # no body: bases + decorator supply everything (presumably merging both bases' updates - TODO confirm)
\ No newline at end of file

From 5be4ac3b34b8acc25d0575be621922c7138c21ef Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 27 Oct 2022 20:39:17 -0700
Subject: [PATCH 02/50] Defect updates

---
 src/atomate2/cp2k/drones.py             |   2 +-
 src/atomate2/cp2k/flows/defect.py       |  55 +++++----
 src/atomate2/cp2k/jobs/defect.py        |   9 ++
 src/atomate2/cp2k/schemas/defect.py     | 118 ++++++++------------
 src/atomate2/cp2k/sets/BaseCp2kSet.yaml | 142 ++++++++++++------------
 5 files changed, 156 insertions(+), 170 deletions(-)

diff --git a/src/atomate2/cp2k/drones.py b/src/atomate2/cp2k/drones.py
index 662565d4ac..cce5c096c0 100644
--- a/src/atomate2/cp2k/drones.py
+++ b/src/atomate2/cp2k/drones.py
@@ -1,4 +1,4 @@
-"""Drones for parsing VASP calculations and related outputs."""
+"""Drones for parsing CP2K calculations and related outputs."""
 
 from __future__ import annotations
 
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 7f48d318a2..df11cc2bb0 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -78,32 +78,34 @@ class FormationEnergyMaker(Maker):
     force_diagonal: bool = field(default=False)
 
     def __post_init__(self):
-        if self.run_bulk:
-            if self.run_bulk == 'relax':
-                if self.hybrid_functional:
-                    self.bulk_maker = HybridCellOptFlowMaker(
-                        initialize_with_pbe=self.initialize_with_pbe,
-                        hybrid_functional=self.hybrid_functional,
-                        hybrid_maker=HybridCellOptMaker(
-                            input_set_generator=DefectHybridCellOptSetGenerator()
-                        )
-                        )
-                else:
-                    self.bulk_maker = CellOptMaker(
-                        input_set_generator=DefectCellOptSetGenerator()
-                        )
-            elif self.run_bulk == "static":
-                if self.hybrid_functional:
-                    self.bulk_maker = HybridStaticFlowMaker( 
-                        hybrid_functional=self.hybrid_functional,
-                        hybrid_maker=HybridStaticMaker(
-                            input_set_generator=DefectHybridStaticSetGenerator()
+        if self.run_bulk == 'relax':
+            if self.hybrid_functional:
+                self.bulk_maker = HybridCellOptFlowMaker(
+                    initialize_with_pbe=self.initialize_with_pbe,
+                    hybrid_functional=self.hybrid_functional,
+                    hybrid_maker=HybridCellOptMaker(
+                        input_set_generator=DefectHybridCellOptSetGenerator()
                         )
+                    )
+            else:
+                self.bulk_maker = CellOptMaker(
+                    input_set_generator=DefectCellOptSetGenerator()
+                    )
+        elif self.run_bulk == "static":
+            if self.hybrid_functional:
+                self.bulk_maker = HybridStaticFlowMaker( 
+                    hybrid_functional=self.hybrid_functional,
+                    hybrid_maker=HybridStaticMaker(
+                        input_set_generator=DefectHybridStaticSetGenerator()
                         )
-                else:
-                    self.bulk_maker = StaticMaker(
-                        input_set_generator=DefectStaticSetGenerator()
-                        )
+                    )
+            else:
+                self.bulk_maker = StaticMaker(
+                    input_set_generator=DefectStaticSetGenerator()
+                    )
+
+        # TODO Can probably put this somewhere else?
+        self.bulk_maker.task_document_kwargs.update({"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)})
 
         if self.hybrid_functional:
             self.def_maker = HybridRelaxFlowMaker(
@@ -158,12 +160,7 @@ def make(
         for defect in defects:
             chgs = defect.get_charge_states() if run_all_charges else [0]
             for charge in chgs:
-                # write some provenances data in info.json file
-                info = {"defect": deepcopy(defect), "supercell_matrix": sc_mat}
                 defect_job = self.def_maker.make(defect=deepcopy(defect), charge=charge)
-                defect_job.update_maker_kwargs(
-                    {"_set": {"write_additional_data->info:json": info}}, dict_mod=True
-                )
                 jobs.append(defect_job)
                 defect_outputs[defect.name][int(charge)] = (defect, defect_job.output)
 
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index aa0fc56b3f..df34dddd07 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -52,6 +52,15 @@ def make(self, defect: Defect, charge: int = 0, prev_cp2k_dir: str | Path | None
             force_diagonal=self.force_diagonal,
         )
         structure.set_charge(charge)
+        # provenance stuff
+        self.write_additional_data.update(
+            {
+                "info.json": {
+                    "defect": deepcopy(defect), 
+                    "defect_charge": charge, 
+                    "sc_mat": self.supercell_matrix}
+                    }
+            )
         return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
 
 @dataclass
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 9f58878733..08917df314 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -6,11 +6,12 @@
 from itertools import groupby
 
 from monty.json import MontyDecoder 
+from monty.tempfile import ScratchDir
 
 from pymatgen.core import Structure
 from pymatgen.entries.computed_entries import ComputedStructureEntry
 from pymatgen.analysis.defects.core import Defect
-from pymatgen.analysis.defects.corrections import get_correction
+from pymatgen.analysis.defects.corrections import get_freysoldt_correction, get_freysoldt2d_correction
 from pymatgen.analysis.defects.thermo import DefectEntry, DefectSiteFinder
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from atomate2 import SETTINGS
@@ -211,15 +212,7 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
         if dielectric:
             parameters['dielectric'] = dielectric
 
-        if parameters['charge_state']
-            corrections, plt_data = get_correction(
-                q=defect_entry.charge_state, dielectric=parameters['dielectric'], 
-                defect_locpot=parameters['defect_v_hartree'], 
-                bulk_locpot=parameters['bulk_v_hartree'], 
-                defect_frac_coords=parameters['defect_frac_sc_coords'],
-                )
-        else:
-            corrections = {}
+        corrections, metadata = cls.get_correction_from_parameters(parameters)
 
         sc_entry = ComputedStructureEntry(
             structure=parameters['final_defect_structure'], 
@@ -228,7 +221,7 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
 
         defect_entry = DefectEntry(
             defect=cls.get_defect_from_task(query=query, task=defect_task),
-            charge_state=,
+            charge_state=parameters['charge_state'],
             sc_entry=sc_entry,
             sc_defect_frac_coords=parameters['defect_frac_sc_coords'],
             corrections=corrections,
@@ -236,6 +229,50 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
 
         return defect_entry.as_dict()
 
+    @classmethod
+    def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
+        """Run each correction method on ``parameters`` and merge their (corrections, metadata) dicts."""
+        corrections, metadata = {}, {}
+        for correct in (cls.get_freysold_correction, cls.get_freysoldt2d_correction):  # sic: "freysold"
+            c, m = correct(parameters)
+            corrections.update(c)
+            metadata.update(m)
+        return corrections, metadata
+
+    @classmethod
+    def get_freysold_correction(cls, parameters) -> Tuple[Dict, Dict]:
+        """Freysoldt correction for charged entries not flagged ``"2d"``; ``({}, {})`` otherwise."""
+        if not parameters['charge_state'] or parameters.get("2d"):
+            return {}, {}
+        return get_freysoldt_correction(  # module-level import, not a method of this class
+            q=parameters['charge_state'], dielectric=parameters['dielectric'],
+            defect_locpot=parameters['defect_v_hartree'], bulk_locpot=parameters['bulk_v_hartree'],
+            defect_frac_coords=parameters['defect_frac_sc_coords'],
+        )
+    
+    @classmethod
+    def get_freysoldt2d_correction(cls, parameters):
+        """Freysoldt 2D correction for charged entries flagged ``"2d"``; ``({}, {})`` otherwise."""
+        from pymatgen.io.vasp.outputs import VolumetricData as VaspVolumetricData
+        # NOTE: the function-local import above runs on every call, even when no correction is computed.
+        if parameters['charge_state'] and parameters.get("2d"):
+            eps_parallel = (parameters['dielectric'][0][0] + parameters['dielectric'][1][1]) / 2
+            eps_perp = parameters['dielectric'][2][2]
+            dielectric = (eps_parallel - 1) / (1 - 1/eps_perp)
+            with ScratchDir('.'):
+                # NOTE(review): reads 'bulk_locpot'/'defect_locpot'; the 3D variant reads '*_v_hartree' - confirm keys
+                lref = VaspVolumetricData(structure=parameters['bulk_locpot'].structure, data=parameters['bulk_locpot'].data)
+                ldef = VaspVolumetricData(structure=parameters['defect_locpot'].structure, data=parameters['defect_locpot'].data)
+                lref.write_file("LOCPOT.ref")
+                ldef.write_file("LOCPOT.def")
+                # the correction routine receives the names of the two files written just above
+                return get_freysoldt2d_correction(
+                    q=parameters['charge_state'], dielectric=dielectric, defect_locpot="LOCPOT.def", 
+                    bulk_locpot="LOCPOT.ref", defect_frac_coords=parameters['defect_frac_sc_coords'], 
+                    energy_cutoff=250, slab_buffer=2
+                    )
+        return {}, {}
+
     @classmethod
     def get_defect_from_task(cls, query, task):
         """
@@ -272,6 +309,7 @@ def get_parameters_from_tasks(cls, defect_task, bulk_task):
             'final_defect_structure': final_defect_structure,
             'vbm': bulk_task['output']['vbm'],
             'cbm': bulk_task['output']['cbm'],
+            'charge_state': defect_task.output.structure.charge,
             'defect_frac_sc_coords': defect_frac_sc_coords,
             'defect_v_hartree': defect_task.cp2k_objects['v_hartree'], # TODO CP2K spec name
             'bulk_v_hartree': bulk_task.cp2k_objects['v_hartree'], # TODO CP2K spec name
@@ -279,64 +317,6 @@ def get_parameters_from_tasks(cls, defect_task, bulk_task):
 
         return parameters
 
-
-# TODO Some of this should be done by DefectCompatibility,
-# but it's not clear how to do that since 2d materials 
-# are not tagged in any particular way to allow defect compatibility
-# to decide which correction to apply
-class DefectDoc2d(DefectDoc):
-    """
-    DefectDoc subclass for 2D defects
-    """
-
-    @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
-        """
-        Get defect entry from defect and bulk tasks. 
-        Args:
-            defect_task: task dict for the defect calculation
-            bulk_task: task dict for the bulk calculation
-            dielectric: dielectric tensor for the defect calculation
-            query: query string for defect entry
-        """
-        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
-        if dielectric:
-            eps_parallel = (dielectric[0][0] + dielectric[1][1]) / 2
-            eps_perp = dielectric[2][2]
-            parameters['dielectric'] = (eps_parallel - 1) / (1 - 1/eps_perp)
-
-        defect_entry = DefectEntry(
-            cls.get_defect_from_task(query=query, task=defect_task),
-            uncorrected_energy=parameters.pop('defect_energy') - parameters.pop('bulk_energy'),
-            parameters=parameters,
-            entry_id=parameters.pop('entry_id')
-        )
-
-        DefectCompatibility().process_entry(defect_entry, perform_corrections=False)
-        with ScratchDir('.'):
-            fc = FreysoldtCorrection2d(
-                    defect_entry.parameters.get('dielectric'), 
-                    "LOCPOT.ref", "LOCPOT.def", encut=520, buffer=2
-                ) 
-            lref = VolumetricData(
-                structure=Structure.from_dict(bulk_task['input']['structure']), 
-                data={'total': MontyDecoder().process_decoded(bulk_task['v_hartree'])}
-            )
-            ldef = VolumetricData(
-                structure=Structure.from_dict(defect_task['input']['structure']), 
-                data={'total': MontyDecoder().process_decoded(defect_task['v_hartree'])}
-            )
-            lref.write_file("LOCPOT.ref")
-            ldef.write_file("LOCPOT.def")
-            ecorr = fc.get_correction(defect_entry)
-            defect_entry.corrections.update(ecorr)
-            defect_entry.parameters['freysoldt2d_meta'] = fc.metadata
-
-        defect_entry_as_dict = defect_entry.as_dict()
-        defect_entry_as_dict['task_id'] = defect_entry_as_dict['entry_id']  # this seemed necessary for legacy db
-
-        return defect_entry
-
 def unpack(query, d):
     if not query:
         return d
diff --git a/src/atomate2/cp2k/sets/BaseCp2kSet.yaml b/src/atomate2/cp2k/sets/BaseCp2kSet.yaml
index 14a24c5f5a..edab0e1f93 100644
--- a/src/atomate2/cp2k/sets/BaseCp2kSet.yaml
+++ b/src/atomate2/cp2k/sets/BaseCp2kSet.yaml
@@ -8,11 +8,11 @@ cp2k_input:
       basis: null
       potential: null
     Ag:
-      aux_basis: admm-dz-q11
+      aux_basis: admm-tzp-q11
       basis: TZVP-MOLOPT-PBE-GTH-q11
       potential: GTH-PBE-q19
     Al:
-      aux_basis: admm-dz-q3
+      aux_basis: admm-tzp-q3
       basis: TZVP-MOLOPT-PBE-GTH-q3
       potential: GTH-PBE-q3
     Am:
@@ -20,31 +20,31 @@ cp2k_input:
       basis: null
       potential: null
     Ar:
-      aux_basis: admm-dz-q8
+      aux_basis: admm-tzp-q8
       basis: TZVP-MOLOPT-PBE-GTH-q8
       potential: GTH-PBE-q8
     As:
-      aux_basis: admm-dz-q5
+      aux_basis: admm-tzp-q5
       basis: TZVP-MOLOPT-PBE-GTH-q5
       potential: GTH-PBE-q5
     At:
-      aux_basis: admm-dz-q7
+      aux_basis: admm-tzp-q7
       basis: TZVP-MOLOPT-PBE-GTH-q7
       potential: GTH-PBE-q7
     Au:
-      aux_basis: admm-dz-q11
+      aux_basis: admm-tzp-q11
       basis: TZVP-MOLOPT-PBE-GTH-q11
       potential: GTH-PBE-q19
     B:
-      aux_basis: admm-dz-q3
+      aux_basis: admm-tzp-q3
       basis: TZVP-MOLOPT-PBE-GTH-q3
       potential: GTH-PBE-q3
     Ba:
-      aux_basis: admm-dz-q10
+      aux_basis: admm-tzp-q10
       basis: TZVP-MOLOPT-PBE-GTH-q10
       potential: GTH-PBE-q10
     Be:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q2
       potential: GTH-PBE-q4
     Bh:
@@ -52,7 +52,7 @@ cp2k_input:
       basis: null
       potential: null
     Bi:
-      aux_basis: admm-dz-q5
+      aux_basis: admm-tzp-q5
       basis: TZVP-MOLOPT-PBE-GTH-q5
       potential: GTH-PBE-q5
     Bk:
@@ -60,19 +60,19 @@ cp2k_input:
       basis: null
       potential: null
     Br:
-      aux_basis: admm-dz-q7
+      aux_basis: admm-tzp-q7
       basis: TZVP-MOLOPT-PBE-GTH-q7
       potential: GTH-PBE-q7
     C:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q4
       potential: GTH-PBE-q4
     Ca:
-      aux_basis: admm-dz-q10
+      aux_basis: admm-tzp-q10
       basis: TZVP-MOLOPT-PBE-GTH-q10
       potential: GTH-PBE-q10
     Cd:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q12
     Ce:
@@ -84,7 +84,7 @@ cp2k_input:
       basis: null
       potential: null
     Cl:
-      aux_basis: admm-dz-q7
+      aux_basis: admm-tzp-q7
       basis: TZVP-MOLOPT-PBE-GTH-q7
       potential: GTH-PBE-q7
     Cm:
@@ -96,19 +96,19 @@ cp2k_input:
       basis: null
       potential: null
     Co:
-      aux_basis: admm-dz-q17
+      aux_basis: admm-tzp-q17
       basis: TZVP-MOLOPT-PBE-GTH-q17
       potential: GTH-PBE-q17
     Cr:
-      aux_basis: admm-dz-q14
+      aux_basis: admm-tzp-q14
       basis: TZVP-MOLOPT-PBE-GTH-q14
       potential: GTH-PBE-q14
     Cs:
-      aux_basis: admm-dz-q9
+      aux_basis: admm-tzp-q9
       basis: TZVP-MOLOPT-PBE-GTH-q9
       potential: GTH-PBE-q9
     Cu:
-      aux_basis: admm-dz-q11
+      aux_basis: admm-tzp-q11
       basis: TZVP-MOLOPT-PBE-GTH-q11
       potential: GTH-PBE-q19
     Db:
@@ -136,11 +136,11 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q17
     F:
-      aux_basis: admm-dz-q7
+      aux_basis: admm-tzp-q7
       basis: TZVP-MOLOPT-PBE-GTH-q7
       potential: GTH-PBE-q7
     Fe:
-      aux_basis: admm-dz-q16
+      aux_basis: admm-tzp-q16
       basis: TZVP-MOLOPT-PBE-GTH-q16
       potential: GTH-PBE-q16
     Fl:
@@ -156,7 +156,7 @@ cp2k_input:
       basis: null
       potential: null
     Ga:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q3
     Gd:
@@ -164,23 +164,23 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q18
     Ge:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q4
       potential: GTH-PBE-q4
     H:
-      aux_basis: admm-dz-q1
+      aux_basis: admm-tzp-q1
       basis: TZVP-MOLOPT-PBE-GTH-q1
       potential: GTH-PBE-q1
     He:
-      aux_basis: admm-dz-q2
+      aux_basis: admm-tzp-q2
       basis: TZVP-MOLOPT-PBE-GTH-q2
       potential: GTH-PBE-q2
     Hf:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q12
     Hg:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q12
     Ho:
@@ -192,23 +192,23 @@ cp2k_input:
       basis: null
       potential: null
     I:
-      aux_basis: admm-dz-q7
+      aux_basis: admm-tzp-q7
       basis: TZVP-MOLOPT-PBE-GTH-q7
       potential: GTH-PBE-q7
     In:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q3
     Ir:
-      aux_basis: admm-dz-q17
+      aux_basis: admm-tzp-q17
       basis: TZVP-MOLOPT-PBE-GTH-q17
       potential: GTH-PBE-q9
     K:
-      aux_basis: admm-dz-q9
+      aux_basis: admm-tzp-q9
       basis: TZVP-MOLOPT-PBE-GTH-q9
       potential: GTH-PBE-q9
     Kr:
-      aux_basis: admm-dz-q8
+      aux_basis: admm-tzp-q8
       basis: TZVP-MOLOPT-PBE-GTH-q8
       potential: GTH-PBE-q8
     La:
@@ -216,7 +216,7 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q11
     Li:
-      aux_basis: admm-dz-q3
+      aux_basis: admm-tzp-q3
       basis: TZVP-MOLOPT-PBE-GTH-q1
       potential: GTH-PBE-q3
     Lr:
@@ -240,15 +240,15 @@ cp2k_input:
       basis: null
       potential: null
     Mg:
-      aux_basis: admm-dz-q10
+      aux_basis: admm-tzp-q10
       basis: TZVP-MOLOPT-PBE-GTH-q10
       potential: GTH-PBE-q2
     Mn:
-      aux_basis: admm-dz-q15
+      aux_basis: admm-tzp-q15
       basis: TZVP-MOLOPT-PBE-GTH-q15
       potential: GTH-PBE-q15
     Mo:
-      aux_basis: admm-dz-q14
+      aux_basis: admm-tzp-q14
       basis: TZVP-MOLOPT-PBE-GTH-q14
       potential: GTH-PBE-q14
     Mt:
@@ -256,15 +256,15 @@ cp2k_input:
       basis: null
       potential: null
     N:
-      aux_basis: admm-dz-q5
+      aux_basis: admm-tzp-q5
       basis: TZVP-MOLOPT-PBE-GTH-q5
       potential: GTH-PBE-q5
     Na:
-      aux_basis: admm-dz-q9
+      aux_basis: admm-tzp-q9
       basis: TZVP-MOLOPT-PBE-GTH-q1
       potential: GTH-PBE-q9
     Nb:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q13
     Nd:
@@ -272,7 +272,7 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q14
     Ne:
-      aux_basis: admm-dz-q8
+      aux_basis: admm-tzp-q8
       basis: TZVP-MOLOPT-PBE-GTH-q8
       potential: GTH-PBE-q8
     Nh:
@@ -280,7 +280,7 @@ cp2k_input:
       basis: null
       potential: null
     Ni:
-      aux_basis: admm-dz-q18
+      aux_basis: admm-tzp-q18
       basis: TZVP-MOLOPT-PBE-GTH-q18
       potential: GTH-PBE-q18
     'No':
@@ -292,7 +292,7 @@ cp2k_input:
       basis: null
       potential: null
     O:
-      aux_basis: admm-dz-q6
+      aux_basis: admm-tzp-q6
       basis: TZVP-MOLOPT-PBE-GTH-q6
       potential: GTH-PBE-q6
     Og:
@@ -300,11 +300,11 @@ cp2k_input:
       basis: null
       potential: null
     Os:
-      aux_basis: admm-dz-q16
+      aux_basis: admm-tzp-q16
       basis: TZVP-MOLOPT-PBE-GTH-q16
       potential: GTH-PBE-q8
     P:
-      aux_basis: admm-dz-q5
+      aux_basis: admm-tzp-q5
       basis: TZVP-MOLOPT-PBE-GTH-q5
       potential: GTH-PBE-q5
     Pa:
@@ -312,11 +312,11 @@ cp2k_input:
       basis: null
       potential: null
     Pb:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q4
       potential: GTH-PBE-q4
     Pd:
-      aux_basis: admm-dz-q18
+      aux_basis: admm-tzp-q18
       basis: TZVP-MOLOPT-PBE-GTH-q18
       potential: GTH-PBE-q18
     Pm:
@@ -324,7 +324,7 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q15
     Po:
-      aux_basis: admm-dz-q6
+      aux_basis: admm-tzp-q6
       basis: TZVP-MOLOPT-PBE-GTH-q6
       potential: GTH-PBE-q6
     Pr:
@@ -332,7 +332,7 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q13
     Pt:
-      aux_basis: admm-dz-q18
+      aux_basis: admm-tzp-q18
       basis: TZVP-MOLOPT-PBE-GTH-q18
       potential: GTH-PBE-q18
     Pu:
@@ -344,11 +344,11 @@ cp2k_input:
       basis: null
       potential: null
     Rb:
-      aux_basis: admm-dz-q9
+      aux_basis: admm-tzp-q9
       basis: TZVP-MOLOPT-PBE-GTH-q9
       potential: GTH-PBE-q9
     Re:
-      aux_basis: admm-dz-q15
+      aux_basis: admm-tzp-q15
       basis: TZVP-MOLOPT-PBE-GTH-q15
       potential: GTH-PBE-q7
     Rf:
@@ -360,31 +360,31 @@ cp2k_input:
       basis: null
       potential: null
     Rh:
-      aux_basis: admm-dz-q17
+      aux_basis: admm-tzp-q17
       basis: TZVP-MOLOPT-PBE-GTH-q17
       potential: GTH-PBE-q9
     Rn:
-      aux_basis: admm-dz-q8
+      aux_basis: admm-tzp-q8
       basis: TZVP-MOLOPT-PBE-GTH-q8
       potential: GTH-PBE-q8
     Ru:
-      aux_basis: admm-dz-q16
+      aux_basis: admm-tzp-q16
       basis: TZVP-MOLOPT-PBE-GTH-q16
       potential: GTH-PBE-q8
     S:
-      aux_basis: admm-dz-q6
+      aux_basis: admm-tzp-q6
       basis: TZVP-MOLOPT-PBE-GTH-q6
       potential: GTH-PBE-q6
     Sb:
-      aux_basis: admm-dz-q5
+      aux_basis: admm-tzp-q5
       basis: TZVP-MOLOPT-PBE-GTH-q5
       potential: GTH-PBE-q5
     Sc:
-      aux_basis: admm-dz-q11
+      aux_basis: admm-tzp-q11
       basis: TZVP-MOLOPT-PBE-GTH-q11
       potential: GTH-PBE-q11
     Se:
-      aux_basis: admm-dz-q6
+      aux_basis: admm-tzp-q6
       basis: TZVP-MOLOPT-PBE-GTH-q6
       potential: GTH-PBE-q6
     Sg:
@@ -392,7 +392,7 @@ cp2k_input:
       basis: null
       potential: null
     Si:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q4
       potential: GTH-PBE-q4
     Sm:
@@ -400,15 +400,15 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q16
     Sn:
-      aux_basis: admm-dz-q4
+      aux_basis: admm-tzp-q4
       basis: TZVP-MOLOPT-PBE-GTH-q4
       potential: GTH-PBE-q4
     Sr:
-      aux_basis: admm-dz-q10
+      aux_basis: admm-tzp-q10
       basis: TZVP-MOLOPT-PBE-GTH-q10
       potential: GTH-PBE-q10
     Ta:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q5
     Tb:
@@ -416,11 +416,11 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q19
     Tc:
-      aux_basis: admm-dz-q15
+      aux_basis: admm-tzp-q15
       basis: TZVP-MOLOPT-PBE-GTH-q15
       potential: GTH-PBE-q15
     Te:
-      aux_basis: admm-dz-q6
+      aux_basis: admm-tzp-q6
       basis: TZVP-MOLOPT-PBE-GTH-q6
       potential: GTH-PBE-q6
     Th:
@@ -428,11 +428,11 @@ cp2k_input:
       basis: null
       potential: null
     Ti:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q12
     Tl:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q3
     Tm:
@@ -448,19 +448,19 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q14
     V:
-      aux_basis: admm-dz-q13
+      aux_basis: admm-tzp-q13
       basis: TZVP-MOLOPT-PBE-GTH-q13
       potential: GTH-PBE-q13
     W:
-      aux_basis: admm-dz-q14
+      aux_basis: admm-tzp-q14
       basis: TZVP-MOLOPT-PBE-GTH-q14
       potential: GTH-PBE-q6
     Xe:
-      aux_basis: admm-dz-q8
+      aux_basis: admm-tzp-q8
       basis: TZVP-MOLOPT-PBE-GTH-q8
       potential: GTH-PBE-q8
     Y:
-      aux_basis: admm-dz-q11
+      aux_basis: admm-tzp-q11
       basis: TZVP-MOLOPT-PBE-GTH-q11
       potential: GTH-PBE-q11
     Yb:
@@ -468,11 +468,11 @@ cp2k_input:
       basis: null
       potential: GTH-PBE-q24
     Zn:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q20
     Zr:
-      aux_basis: admm-dz-q12
+      aux_basis: admm-tzp-q12
       basis: TZVP-MOLOPT-PBE-GTH-q12
       potential: GTH-PBE-q12
     basis_filenames:

From 6b39a445f0119cbdf4f9fcedc99440522c4d825b Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Sat, 29 Oct 2022 09:53:03 -0700
Subject: [PATCH 03/50] Defect updates

---
 src/atomate2/cp2k/flows/core.py   | 11 ++++----
 src/atomate2/cp2k/flows/defect.py | 37 +++++++++++++++------------
 src/atomate2/cp2k/jobs/defect.py  | 42 ++++++++++++++++++++-----------
 3 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/src/atomate2/cp2k/flows/core.py b/src/atomate2/cp2k/flows/core.py
index a153cfdc9b..7e531da2c6 100644
--- a/src/atomate2/cp2k/flows/core.py
+++ b/src/atomate2/cp2k/flows/core.py
@@ -217,15 +217,14 @@ class HybridFlowMaker(Maker):
     def __post_init__(self):
         self.hybrid_maker.hybrid_functional = self.hybrid_functional
 
-    def make(self, structure: Structure, prev_cp2k_dir: str | Path | None = None) -> Job:
+    def make(self, *args, **kwargs) -> Flow:
         jobs = []
         if self.initialize_with_pbe:
-            initialization = self.initialize_maker.make(structure, prev_cp2k_dir)
+            initialization = self.initialize_maker.make(*args, **kwargs)
             jobs.append(initialization)
-        hyb = self.hybrid_maker.make(
-            initialization.output.structure if self.initialize_with_pbe else structure, 
-            prev_cp2k_dir=initialization.output.dir_name if self.initialize_with_pbe else prev_cp2k_dir
-        ) 
+            hyb = self.hybrid_maker.make(initialization.output.structure, prev_cp2k_dir=initialization.output.dir_name)
+        else:
+            hyb = self.hybrid_maker.make(*args, **kwargs)
         jobs.append(hyb)
         return Flow(jobs, output=hyb.output, name=self.name)
 
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index df11cc2bb0..b67aa6da03 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -40,6 +40,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 # TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
 @dataclass
 class FormationEnergyMaker(Maker):
@@ -84,35 +85,36 @@ def __post_init__(self):
                     initialize_with_pbe=self.initialize_with_pbe,
                     hybrid_functional=self.hybrid_functional,
                     hybrid_maker=HybridCellOptMaker(
-                        input_set_generator=DefectHybridCellOptSetGenerator()
+                        input_set_generator=DefectHybridCellOptSetGenerator(),
+                        task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
                         )
                     )
             else:
                 self.bulk_maker = CellOptMaker(
-                    input_set_generator=DefectCellOptSetGenerator()
+                    input_set_generator=DefectCellOptSetGenerator(),
+                    task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
                     )
         elif self.run_bulk == "static":
             if self.hybrid_functional:
                 self.bulk_maker = HybridStaticFlowMaker( 
                     hybrid_functional=self.hybrid_functional,
                     hybrid_maker=HybridStaticMaker(
-                        input_set_generator=DefectHybridStaticSetGenerator()
+                        input_set_generator=DefectHybridStaticSetGenerator(),
+                        task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
                         )
                     )
             else:
                 self.bulk_maker = StaticMaker(
-                    input_set_generator=DefectStaticSetGenerator()
+                    input_set_generator=DefectStaticSetGenerator(),
+                    task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
                     )
 
-        # TODO Can probably put this somewhere else?
-        self.bulk_maker.task_document_kwargs.update({"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)})
-
         if self.hybrid_functional:
             self.def_maker = HybridRelaxFlowMaker(
                 hybrid_functional=self.hybrid_functional,
                 initialize_with_pbe=self.initialize_with_pbe,
                 initialize_maker=DefectStaticMaker(),
-                hybrid_maker=HybridRelaxMaker()
+                hybrid_maker=DefectHybridRelaxMaker(),
             )
         else:
             self.def_maker = DefectRelaxMaker()
@@ -160,21 +162,24 @@ def make(
         for defect in defects:
             chgs = defect.get_charge_states() if run_all_charges else [0]
             for charge in chgs:
-                defect_job = self.def_maker.make(defect=deepcopy(defect), charge=charge)
+                defect_job = self.def_maker.make(deepcopy(defect), charge)
                 jobs.append(defect_job)
                 defect_outputs[defect.name][int(charge)] = (defect, defect_job.output)
 
-        jobs.append(collect_defect_outputs(
-            defect_outputs=defect_outputs,
-            bulk_output=bulk_job.output,
-            dielectric=dielectric
-            )
-        )
+        if self.run_bulk and defects:
+            collect_job = collect_defect_outputs(
+                defect_outputs=defect_outputs,
+                bulk_output=bulk_job.output if self.run_bulk else None,
+                dielectric=dielectric
+                )
+            jobs.append(collect_job)
+        else:
+            collect_job = None
 
         return Flow(
             jobs=jobs,
             name=self.name,
-            output=jobs[-1].output,
+            output=jobs[-1].output if collect_job else None,
         )
 
 # TODO this is totally code agnostic and should be in common
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index df34dddd07..7a31173310 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -9,6 +9,7 @@
 from tkinter import W
 from numpy.typing import NDArray
 
+from pymatgen.core import Structure
 from pymatgen.analysis.defects.core import Defect, Vacancy
 from atomate2.cp2k.sets.base import Cp2kInputGenerator
 from atomate2.cp2k.sets.defect import (
@@ -36,21 +37,32 @@ class BaseDefectMaker(BaseCp2kMaker):
     force_diagonal: bool = field(default=False)
 
     @cp2k_job
-    def make(self, defect: Defect, charge: int = 0, prev_cp2k_dir: str | Path | None = None):
-        if isinstance(defect, Vacancy):
-            defect = GhostVacancy(
-                structure=defect.structure, site=defect.site,
-                multiplicity=defect.multiplicity, oxi_state=defect.oxi_state,
-                symprec=defect.symprec, angle_tolerance=defect.angle_tolerance
-                )
-        structure = defect.get_supercell_structure(
-            sc_mat=self.supercell_matrix, 
-            dummy_species=None, 
-            min_atoms=self.min_atoms,
-            max_atoms=self.max_atoms,
-            min_length=self.min_length,
-            force_diagonal=self.force_diagonal,
-        )
+    def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str | Path | None = None):
+        if isinstance(defect, Defect):
+            if isinstance(defect, Vacancy):
+                defect = GhostVacancy(
+                    structure=defect.structure, site=defect.site,
+                    multiplicity=defect.multiplicity, oxi_state=defect.oxi_state,
+                    symprec=defect.symprec, angle_tolerance=defect.angle_tolerance
+                    )
+            structure = defect.get_supercell_structure(
+                sc_mat=self.supercell_matrix, 
+                dummy_species=None, 
+                min_atoms=self.min_atoms,
+                max_atoms=self.max_atoms,
+                min_length=self.min_length,
+                force_diagonal=self.force_diagonal,
+            )
+            self.write_additional_data.update(
+            {
+                "info.json": {
+                    "defect": deepcopy(defect), 
+                    "defect_charge": charge, 
+                    "sc_mat": self.supercell_matrix}
+                    }
+            )
+        else:
+            structure = deepcopy(defect)
         structure.set_charge(charge)
         # provenance stuff
         self.write_additional_data.update(

From 2951a966d205c45424c38ffb318f282375b4cec6 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 31 Oct 2022 10:22:47 -0700
Subject: [PATCH 04/50] Working on defects with builder

---
 src/atomate2/cp2k/builders/defect.py | 897 +++++++++++++++++++++++++++
 src/atomate2/cp2k/flows/defect.py    |  58 +-
 src/atomate2/cp2k/jobs/defect.py     |  11 +-
 src/atomate2/cp2k/schemas/defect.py  |  94 ++-
 4 files changed, 976 insertions(+), 84 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index e69de29bb2..354502650d 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -0,0 +1,897 @@
+from datetime import datetime
+from itertools import chain, groupby, combinations
+from re import A
+from tkinter import W
+from typing import Dict, Iterator, List, Literal, Optional
+from copy import deepcopy
+from math import ceil
+from monty.json import MontyDecoder, jsanitize
+
+from maggma.builders import Builder
+from maggma.stores import Store
+from maggma.utils import grouper
+
+from pymatgen.core import Structure
+from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+
+from atomate.utils.utils import load_class
+
+from emmet.core.thermo import ThermoDoc
+from emmet.core.material import MaterialsDoc
+
+from emmet.builders.settings import EmmetBuildSettings
+
+from atomate2.settings import Atomate2Settings
+from atomate2.cp2k.schemas.task import TaskDocument
+from atomate2.cp2k.schemas.defect import DefectDoc
+from atomate2.cp2k.schemas.calc_types import TaskType
+from atomate2.cp2k.schemas.calc_types.utils import run_type
+
+from emmet.core.electronic_structure import ElectronicStructureDoc
+
+__author__ = "Nicholas Winner <nwinner@berkeley.edu>"
+
+
+# TODO this builder is very close to being code agnostic. We need only resolve the standard key names and
+# how they are fed to the DefectDoc class. e.g. VASP calcs store "locpot", but CP2K store "v_hartree"
+class DefectBuilder(Builder):
+    """
+    The DefectBuilder collects task documents performed on structures containing a single point defect.
+    The builder is intended to group tasks corresponding to the same defect (species including charge state),
+    find the best ones, and perform finite-size defect corrections to create a defect document. These
+    defect documents can then be assembled into defect phase diagrams using the DefectThermoBuilder.
+
+    In order to make the build process easier, an entry must exist inside of the task doc that identifies it
+    as a point defect calculation. Currently this is the Pymatgen defect object keyed by "defect". In the future,
+    this may be changed to having a defect transformation in the transformation history.
+
+    The process is as follows:
+
+        1.) Find all documents containing the defect query.
+        2.) Find all documents that do not contain the defect query, and which have DOS and dielectric data already
+            calculated. These are the candidate bulk tasks.
+        3.) For each candidate defect task, attempt to match to a candidate bulk task of the same number of sites
+            (+/- 1) with the required properties for analysis. Reject defects that do not have a corresponding
+            bulk calculation.
+        4.) Convert (defect, bulk task) doc pairs to DefectDocs
+        5.) Post-process and validate defect document
+        6.) Update the defect store
+    """
+
+    #TODO how to incorporate into settings?
+    DEFAULT_ALLOWED_TASKS = [
+            TaskType.Structure_Optimization.value, 
+            TaskType.Static.value
+            ]
+    
+    def __init__(
+        self,
+        tasks: Store,
+        defects: Store,
+        dielectric: Store,
+        electronic_structure: Store,
+        materials: Store,
+        electrostatic_potentials: Store,
+        task_validation: Optional[Store] = None,
+        query: Optional[Dict] = None,
+        bulk_query: Optional[Dict] = None,
+        allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS,
+        task_schema: Literal["cp2k"] = "cp2k", # TODO cp2k specific right now, but this will go in common eventually
+        settings: Dict | None = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            tasks: Store of task documents
+            defects: Store of defect documents to generate
+            dielectric: Store of dielectric data
+            electronic_structure: Store of electronic structure data
+            materials: Store of materials documents
+            electrostatic_potentials: Store of electrostatic potential data. These
+                are generally stored separately from the tasks on GridFS due to their size.
+            task_validation: Store of task validation documents.
+            query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
+            allowed_task_types: list of task_types that can be processed
+            settings: dict of keyword arguments passed to Atomate2Settings, or None for the defaults
+        """
+
+        self.tasks = tasks
+        self.defects = defects
+        self.materials = materials
+        self.dielectric = dielectric
+        self.electronic_structure = electronic_structure
+        self.electrostatic_potentials = electrostatic_potentials
+        self.task_validation = task_validation
+        self.allowed_task_types = allowed_task_types #TODO How to incorporate into getitems?
+
+        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
+        settings = settings if settings else {}
+        self.settings = Atomate2Settings(**settings) # TODO don't think this is right 
+        self.query = query if query else {}
+        self.bulk_query = bulk_query if bulk_query else {}
+        self.timestamp = None
+        self._mpid_map = {}
+        self.task_schema = task_schema
+        self.kwargs = kwargs
+
+        # TODO Long term, schemas should be part of the matching and grouping process so that a builder can be run on a mixture
+        self.query.update({'output.@module': f"atomate2.{self.task_schema}.schemas.task", "output.@class": "TaskDocument"})
+        self.bulk_query.update({'output.@module': f"atomate2.{self.task_schema}.schemas.task", "output.@class": "TaskDocument"})
+        self._defect_query = 'output.additional_json.info.defect'
+
+        self._required_defect_properties = [
+            self._defect_query,
+            self.tasks.key,
+            'output.output.energy',
+            'output.output.structure',
+            'output.input',
+            'output.nsites',
+            'output.cp2k_objects.v_hartree'
+        ] 
+
+        self._required_bulk_properties = [
+            self.tasks.key,
+            'output.output.energy',
+            'output.output.structure',
+            'output.input',
+            'output.cp2k_objects.v_hartree'
+        ] 
+
+        self._optional_defect_properties = []
+        self._optional_bulk_properties = []
+
+        sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials]
+        if self.task_validation:
+            sources.append(self.task_validation)
+        super().__init__(sources=sources, targets=[defects], **kwargs)
+
+    @property
+    def defect_query(self) -> str:
+        """
+        The standard query for defect tasks.
+        """
+        return self._defect_query
+
+    #TODO Hartree pot should be required but only for charged defects
+    @property
+    def required_defect_properties(self) -> List:
+        """
+        Properties essential to processing a defect task.
+        """
+        return self._required_defect_properties
+
+    @property
+    def required_bulk_properties(self) -> List:
+        """
+        Properties essential to processing a bulk task.
+        """
+        return self._required_bulk_properties
+
+    @property
+    def optional_defect_properties(self) -> List:
+        """
+        Properties that are optional for processing a defect task.
+        """
+        return self._optional_defect_properties
+
+    @property
+    def optional_bulk_properties(self) -> List:
+        """
+        Properties that are optional for bulk tasks.
+        """
+        return self._optional_bulk_properties
+
+    @property
+    def mpid_map(self) -> Dict:
+        return self._mpid_map
+
+    def ensure_indexes(self):
+        """
+        Ensures indices on the tasks, materials, and defects collections
+        """
+
+        # Basic search index for tasks
+        self.tasks.ensure_index(self.tasks.key)
+        self.tasks.ensure_index("output.last_updated")
+        self.tasks.ensure_index("output.state")
+        self.tasks.ensure_index("output.formula_pretty") # TODO is necessary?
+
+        # Search index for materials
+        self.materials.ensure_index("material_id")
+        self.materials.ensure_index("last_updated")
+        self.materials.ensure_index("task_ids")
+
+        # Search index for defects
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("last_updated")
+        self.defects.ensure_index("task_ids")
+
+        if self.task_validation:
+            self.task_validation.ensure_index("task_id")
+            self.task_validation.ensure_index("valid")
+
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+
+        tag_query = {} 
+        if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
+            tag_query["$and"] = [
+                {"tags": {"$in": self.settings.BUILD_TAGS}},
+                {"tags": {"$nin": self.settings.EXCLUDED_TAGS}},
+            ]
+        elif len(self.settings.BUILD_TAGS) > 0:
+            tag_query["tags"] = {"$in": self.settings.BUILD_TAGS}
+
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # Get bulk tasks
+        temp_query = self.bulk_query.copy()
+        temp_query.update(tag_query)
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        bulk_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        N = ceil(len(defect_tasks) / number_splits)
+        for task_chunk in grouper(defect_tasks, N):
+            yield {"query": {"task_id": {"$in": task_chunk + list(bulk_tasks)}}}
+
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets all items to process into defect documents.
+        This does no datetime checking, relying instead on whether
+        task_ids are included in the Defect Collection.
+
+        The procedure is as follows:
+
+            1. Get all tasks with standard "defect" query tag
+            2. Filter all tasks by skipping tasks which are already in the Defect Store
+            3. Get all tasks that could be used as bulk
+            4. Filter all bulks which do not have corresponding Dielectric and 
+               ElectronicStructure data (if a band gap exists for that task).
+            5. Group defect tasks by defect matching 
+            6. Given defect object in a group, bundle them with bulk tasks
+               identified with structure matching
+            7. Yield the item bundles
+
+        Returns:
+            Iterator of (defect document, task bundle, material_id)
+
+                The defect document is an existing defect doc to be updated with new data, or None
+            
+                task bundles are all the tasks that correspond to the same defect and all possible
+                bulk tasks that could be matched to them.
+        """
+
+        self.logger.info("Defect builder started")
+        self.logger.info(
+            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+        )
+
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark buildtime for defect documents
+        self.timestamp = datetime.utcnow()
+
+        self.logger.info("Finding tasks to process")
+
+        # Get defect tasks
+        temp_query = self.query.copy()
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
+        temp_query.update({self.defect_query: {'$exists': True}, "output.state": "successful"})
+        defect_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # Get bulk tasks
+        temp_query = self.bulk_query.copy()
+        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({self.defect_query: {'$exists': False}, "output.state": "successful"})
+        bulk_tasks = {
+            doc[self.tasks.key]
+            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+        }
+
+        # TODO Not the same validation behavior as material builders?
+        # If validation store exists, find tasks that are invalid and remove them
+        if self.task_validation:
+            validated = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {}, [self.task_validation.key]        
+                )
+            }
+
+            defect_tasks = defect_tasks.intersection(validated)
+            bulk_tasks = bulk_tasks.intersection(validated)
+
+            invalid_ids = {
+                doc[self.tasks.key]
+                for doc in self.task_validation.query(
+                    {"is_valid": False}, [self.task_validation.key]
+                )
+            }
+            self.logger.info("Removing {} invalid tasks".format(len(invalid_ids)))
+            defect_tasks = defect_tasks - invalid_ids
+            bulk_tasks = bulk_tasks - invalid_ids
+
+        processed_defect_tasks = {
+            t_id
+            for d in self.defects.query({}, ["task_ids"])
+            for t_id in d.get("task_ids", [])
+        }
+
+        all_tasks = defect_tasks | bulk_tasks
+
+        self.logger.debug("All tasks: {}".format(len(all_tasks)))
+        self.logger.debug("Bulk tasks before filter: {}".format(len(bulk_tasks)))
+        bulk_tasks = set(filter(self.__preprocess_bulk, bulk_tasks))
+        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))
+        self.logger.debug("All defect tasks: {}".format(len(defect_tasks)))
+        unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
+
+        if not unprocessed_defect_tasks:
+            self.logger.info("No unprocessed defect tasks. Exiting")
+            return
+        elif not bulk_tasks:
+            self.logger.info("No compatible bulk calculations. Exiting.")
+            return
+
+        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
+        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
+
+        # Set total for builder bars to have a total
+        self.total = len(unprocessed_defect_tasks)
+
+        # yield list of defects that are of the same type, matched to an appropriate bulk calc
+        self.logger.info(f"Starting defect matching.")
+
+        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
+            task_ids = self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
+            if not task_ids:
+                continue 
+            doc = self.__get_defect_doc(defect)
+            item_bundle = self.__get_item_bundle(task_ids)
+            material_id = self.mpid_map[item_bundle[0][1][self.tasks.key]]
+            yield doc, item_bundle, material_id
+
+    def process_item(self, items):
+        """
+        Process a group of defect tasks that correspond to the same defect into a single defect
+        document. If the DefectDoc already exists, then update it and return it. If it does not,
+        create a new DefectDoc
+
+        Args:
+            items: (DefectDoc or None, [(defect task dict, bulk task dict, dielectric dict), ...], material_id)
+
+        returns: the defect document as a dictionary
+        """
+        defect_doc, item_bundle, material_id = items
+        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+        if item_bundle:
+            defect_tasks, bulk_tasks, dielectrics = list(zip(*item_bundle))
+            if defect_doc:
+                defect_doc.update_all(
+                    defect_tasks=defect_tasks, bulk_tasks=bulk_tasks, 
+                    dielectrics=dielectrics, query=self.defect_query
+                    )
+            else:
+                defect_doc = DefectDoc.from_tasks(
+                    defect_tasks=defect_tasks, bulk_tasks=bulk_tasks, dielectrics=dielectrics,
+                    query=self.defect_query, key=self.tasks.key, material_id=material_id
+                    )
+            return defect_doc.dict()
+        return {}
+
+    def update_targets(self, items):
+        """
+        Replaces stale entries in the defects collection with the newly built defect documents
+        """
+
+        items = [item for item in items if item]
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defects")
+            for item in items:
+                item.update({"_bt": self.timestamp})
+                self.defects.remove_docs(
+                    {
+                       "task_ids": item['task_ids'],
+                    }
+                )
+            self.defects.update(
+                docs=jsanitize(items, allow_bson=True),
+                key='task_ids',
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __filter_and_group_tasks(self, tasks):
+        """
+        Groups defect tasks. Tasks are grouped according to the reduced representation
+        of the defect, and so tasks with different settings (e.g. supercell size, functional)
+        will be grouped together.
+
+        Args:
+            tasks: task_ids for unprocessed defects
+
+        returns:
+            [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
+        """
+
+        props = [
+            self.defect_query,
+            self.tasks.key,
+            'output.structure'
+        ]
+
+        self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
+
+        sm = StructureMatcher(allow_subset=False) #TODO build settings
+        defects = [
+            {
+                self.tasks.key: t[self.tasks.key], 'defect': self.__get_defect_from_task(t),
+                'structure': Structure.from_dict(t['output']['structure'])
+            }
+            for t in self.tasks.query(criteria={self.tasks.key: {'$in': list(tasks)}}, properties=props)
+        ]
+        for d in defects:
+            # TODO remove oxidation state because spins/oxidation cause errors in comparison.
+            #  but they shouldnt if those props are close in value
+            d['structure'].remove_oxidation_states()
+
+        def key(x):
+            s = x['defect'].structure
+            return get_sg(s), s.composition.reduced_composition
+
+        def are_equal(x, y):
+            """
+            To decide if defects are equal. Either the defect objects are
+            equal, OR two different defect objects relaxed to the same final structure
+            (common with interstitials).
+
+
+            TODO Need a way to do the output structure comparison for a X atom defect cell
+            TODO which can be embedded in a Y atom defect cell up to tolerance.
+            """
+            if x['defect'] == y['defect']:
+                return True
+
+            # TODO This is needed for ghost vacancy unfortunately, since sm.fit can't distinguish ghosts
+            if x['defect'].defect_composition == y['defect'].defect_composition and \
+                    x['defect'].charge == y['defect'].charge and \
+                    sm.fit(x['structure'], y['structure']):
+                return True
+            return False
+
+        sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
+        all_groups = []
+
+        # For each pre-grouped list of structures, perform actual matching.
+        for k, g in groupby(sorted_s_list, key=lambda x: key(x[1])):
+            unmatched = list(g)
+            while len(unmatched) > 0:
+                i, refs = unmatched.pop(0)
+                matches = [i]
+                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))
+                matches.extend([unmatched[i][0] for i in inds])
+                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
+                all_groups.append(
+                    (defects[i]['defect'], [defects[i][self.tasks.key] for i in matches])
+                )
+
+        self.logger.debug(f"All groups {all_groups}")
+        return all_groups
+
+    def __get_defect_from_task(self, task):
+        """
+        Using the defect_query property, retrieve a pymatgen defect object from the task document
+        """
+        defect = unpack(self.defect_query, task)
+        return MontyDecoder().process_decoded(defect)
+
+    def __get_defect_doc(self, defect):
+        """
+        Given a defect, find the DefectDoc corresponding to it in the defects store if it exists
+
+        returns: DefectDoc or None
+        """
+        material_id = self._get_mpid(defect.structure)
+        docs = [
+            DefectDoc(**doc)
+            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)
+        ]
+        for doc in docs:
+            if defect == doc.defect:
+                return doc
+        return None
+
+    # TODO should move to returning dielectric doc or continue returning the total diel tensor?
+    def __get_dielectric(self, key):
+        """
+        Given the material_id associated with a bulk task, query the dielectric store
+        and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
+        be the case for metallic systems, return None.
+        """
+        for diel in self.dielectric.query(criteria={"material_id": key}, properties=['total']):
+            return diel['total']
+        return None
+
+    #TODO retrieving the electrostatic potential is by far the most expensive part of the builder. Any way to reduce?
+    def __get_item_bundle(self, task_ids):
+        """
+        Gets a group of items that can be processed together into a defect document.
+
+        Args:
+            task_ids: list of (defect task id, bulk task id) pairs; each pair is looked up
+                in the tasks store and bundled with the dielectric data for the bulk's material
+
+        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
+        """
+        return [
+            (
+                self.tasks.query_one(criteria={self.tasks.key: defect_tasks_id}),
+                self.tasks.query_one(criteria={self.tasks.key: bulk_tasks_id}),
+                self.__get_dielectric(self._mpid_map[bulk_tasks_id]),
+            )
+            for defect_tasks_id, bulk_tasks_id in task_ids
+        ]
+
+    def _get_mpid(self, structure):
+        """
+        Given a structure, determine if an equivalent structure exists, with a material_id,
+        in the materials store.
+
+        Args:
+            structure: Candidate structure
+
+        returns: material_id, if one exists, else None
+        """
+        sga = SpacegroupAnalyzer(structure, symprec=self.settings.SYMPREC) # TODO Add angle tolerance
+        mats = self.materials.query(
+            criteria={
+                'chemsys': structure.composition.chemical_system,
+            }, properties=['structure', 'material_id']
+        )
+        # TODO could more than one material match true?
+        sm = StructureMatcher() # TODO add tolerances
+        for m in mats:
+            if sm.fit(structure, Structure.from_dict(m['structure'])):
+                return m['material_id']
+        return None
+
+    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
+        """
+        Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
+        commensurate:
+
+            - Composition
+            - Number of sites
+            - Symmetry
+
+        """
+
+        self.logger.debug(f"Finding bulk/defect task combinations.")
+        self.logger.debug(f"Bulk tasks: {bulk_ids}")
+        self.logger.debug(f"Defect tasks: {defect_ids}")
+
+        # TODO mongo projection on array doesn't work (see above)
+        props = [
+            self.tasks.key,
+            self.defect_query, 
+            'output.input',
+            'output.nsites',
+            'output.output.structure',
+            "output.additional_json.info.sc_mat" 
+        ]
+        defects = list(self.tasks.query(criteria={self.tasks.key: {'$in': list(defect_ids)}}, properties=props))
+        ps = self.__get_pristine_supercell(defects[0])
+        ps.remove_oxidation_states() # TODO might cause problems
+        bulks = list(
+            self.tasks.query(
+                criteria={
+                    self.tasks.key: {'$in': list(bulk_ids)},
+                    'output.composition_reduced': jsanitize(ps.composition.to_reduced_dict),
+                },
+                properties=props
+            )
+        ) 
+        
+        # TODO add settings
+        sm = StructureMatcher(
+            primitive_cell=False,
+            scale=True,
+            attempt_supercell=False,
+            allow_subset=False,
+            comparator=ElementComparator(),
+        )
+
+        def _compare(b, d):
+            rtb = b.get('output').get('input').get('xc').split("+U")[0]
+            rtd = d.get('output').get('input').get('xc').split("+U")[0]
+            if rtb == rtd: 
+                if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):
+                       return True
+            return False
+
+        pairs = [
+            (defect[self.tasks.key], bulk[self.tasks.key])
+            for bulk in bulks
+            for defect in defects
+            if _compare(bulk, defect)
+        ]
+
+        self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
+        return pairs
+
+    def __preprocess_bulk(self, task):
+        """
+        Given the id of a task that could be a bulk for defect analysis, check to see if it can be used.
+        Bulk tasks must:
+
+            (1) Correspond to an existing material_id in the materials store
+            (2) If the band gap is positive, have a dielectric tensor in the dielectric store
+            (3) Have an electronic structure document in the store (needed to read the band gap)
+
+        """
+        self.logger.debug("Preprocessing bulk task {}".format(task))
+        t = next(self.tasks.query(criteria={self.tasks.key: task}, properties=['output.output.structure', 'mpid']))
+
+        struc = Structure.from_dict(t.get('output').get('output').get('structure')) # TODO specific to atomate2
+        mpid = self._get_mpid(struc)
+        if not mpid:
+            self.logger.debug(f"No material id found for bulk task {task}")
+            return False
+        self._mpid_map[task] = mpid
+        self.logger.debug(f"Material ID: {mpid}")
+
+        elec = self.electronic_structure.query_one(
+            properties=['band_gap'], criteria={self.electronic_structure.key: mpid}
+            )
+        if not elec:
+            self.logger.debug(f"Electronic structure data not found for {mpid}")
+            return False
+
+        # TODO right now pulling dos from electronic structure, should just pull summary document
+        if elec['band_gap'] > 0:
+            diel = self.__get_dielectric(mpid)
+            if not diel:
+                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
+                                 f"dielectric properties, but none found in dielectric store")
+                return False
+
+        return True
+
+    def __get_pristine_supercell(self, task):
+        """
+        Given a task document for a defect calculation, retrieve the un-defective, pristine supercell.
+            - If a defect is found in the task via self.defect_query, return a copy of defect.structure
+              expanded with the sc_mat stored under output.additional_json.info
+            - Otherwise, decode and return the structure at task['output']['output']['structure']
+        NOTE(review): sc_mat is None when missing from the task and is passed on unchecked -- confirm.
+        """
+        d = unpack(query=self.defect_query, d=task)
+        if d:
+            defect = MontyDecoder().process_decoded(d)
+            sc_mat = task.get('output', {}).get('additional_json', {}).get("info", {}).get('sc_mat')
+            s = defect.structure.copy()
+            s.make_supercell(sc_mat)
+            return s
+        else:
+            return MontyDecoder().process_decoded(task['output']['output']['structure'])
+
+#TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id
+#TODO This is a problem because the material id in vasp store is not synced to cp2k store
+#TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others
+class DefectThermoBuilder(Builder):
+
+    """
+    This builder creates collections of the DefectThermoDoc object.
+
+        (1) Find all DefectDocs that correspond to the same bulk material
+            given by material_id
+        (2) Create a new DefectThermoDoc for all of those documents
+        (3) Insert/Update the defect_thermos store with the new documents
+    """
+
+    def __init__(
+            self,
+            defects: Store,
+            defect_thermos: Store,
+            materials: Store,
+            thermo: Store,
+            electronic_structures: Store,
+            dos: Store,
+            query: Optional[Dict] = None,
+            **kwargs,
+    ):
+        """
+        Args:
+            defects: Store of defect documents (generated by DefectBuilder)
+            defect_thermos: Store of DefectThermoDocs to generate.
+            materials: Store of MaterialDocs, looked up by material_id
+            thermo: Store of ThermoDocs, queried by chemsys and by number of elements
+            electronic_structures: Store of ElectronicStructureDocs; dos: Store of DOS docs (by task_id)
+            query: dictionary to limit the defect documents to be analyzed
+        """
+        self.defects = defects
+        self.defect_thermos = defect_thermos
+        self.materials = materials
+        self.thermo = thermo
+        self.dos = dos
+        self.electronic_structures = electronic_structures
+
+        self.query = query if query else {}
+        self.timestamp = None
+        self.kwargs = kwargs
+
+        super().__init__(sources=[defects, materials, thermo, electronic_structures, dos], targets=[defect_thermos], **kwargs)
+
+    def ensure_indexes(self):
+        """
+        Ensures indices on the collections
+        """
+
+        # Basic search indexes for the defects store
+        self.defects.ensure_index("material_id")
+        self.defects.ensure_index("defect_id")
+
+        # Search index for the defect thermos store
+        self.defect_thermos.ensure_index("material_id")
+
+    # TODO need to only process new tasks. Fast builder so currently is OK for small collections
+    def get_items(self) -> Iterator[List[Dict]]:
+        """
+        Gets items to process into DefectThermoDocs.
+
+        returns:
+            iterator yielding tuples (group, mat, thermo, elec) containing:
+                - group of defect documents belonging to the same bulk material (same material_id),
+                - MaterialsDoc of that bulk material
+                - all documents in the thermo store whose 'elements' list has exactly one entry
+                - DOS document of the bulk material, pulled from the dos store
+        """
+
+        self.logger.info("Defect thermo builder started")
+        self.logger.info("Setting indexes")
+        self.ensure_indexes()
+
+        # Save timestamp to mark build time for defect thermo documents
+        self.timestamp = datetime.utcnow()
+
+        # Get all tasks
+        self.logger.info("Finding tasks to process")
+        temp_query = dict(self.query)
+        temp_query["state"] = "successful"
+        # NOTE(review): temp_query is built but never used -- the query below uses self.query
+        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
+
+        all_docs = [doc for doc in self.defects.query(self.query)]
+
+        self.logger.debug(f"Found {len(all_docs)} defect docs to process")
+
+        def filterfunc(x):
+            # material for defect x exists
+            if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
+                self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
+                return False
+
+            for el in load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition:
+                if not list(self.thermo.query(criteria={'chemsys': str(el)}, properties=None)):
+                    self.logger.debug(f"No entry for {el} in Thermo Store")
+                    return False
+
+            return True
+
+        for key, group in groupby(
+                filter(
+                    filterfunc,
+                    sorted(all_docs, key=lambda x: x['material_id'])
+                ), key=lambda x: x['material_id']
+        ):
+            group = [g for g in group]
+            try:
+                mat = self.__get_materials(key)
+                thermo = self.__get_thermos(mat.composition)
+                elec = self.__get_electronic_structure(group[0]['material_id'])
+                yield (group, mat, thermo, elec)
+            except LookupError as exception:
+                raise exception
+
+    def process_item(self, docs):
+        """
+        Process a group of defects belonging to the same material into a defect thermo doc
+        """
+        self.logger.info(f"Processing defects")
+        defects, material, thermos, elec_struc = docs
+        defects = [DefectDoc(**d) for d in defects]
+        thermos = [ThermoDoc(**t) for t in thermos]
+        defect_thermo_doc = DefectThermoDoc.from_docs(defects, thermos=thermos, electronic_structure=elec_struc)
+        return defect_thermo_doc.dict()
+
+    def update_targets(self, items):
+        """
+        Inserts the new DefectThermoDocs into the defect_thermos store
+        """
+        items = [item for item in items if item]
+        for item in items:
+            item.update({"_bt": self.timestamp})
+
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defect thermo docs")
+            self.defect_thermos.update(
+                docs=jsanitize(items, allow_bson=True),
+                key=self.defect_thermos.key,
+            )
+        else:
+            self.logger.info("No items to update")
+
+    def __get_electronic_structure(self, material_id):
+        """
+        Gets the electronic structure of the bulk material
+        """
+        self.logger.info(f"Getting electronic structure for {material_id}")
+
+        # TODO This is updated to return the whole query because a.t.m. the
+        # DOS part of the electronic builder isn't working, so I'm using
+        # this to pull direct from the store of dos objects with no processing.
+        dosdoc = self.electronic_structures.query_one(
+            criteria={self.electronic_structures.key: material_id},
+            properties=None,
+        )
+        t_id = ElectronicStructureDoc(**dosdoc).dos.total['1'].task_id
+        dos = self.dos.query_one(criteria={'task_id': int(t_id)}, properties=None) #TODO MPID str/int issues
+        return dos
+
+    def __get_materials(self, key) -> List:
+        """
+        Look up the bulk material with material_id == key in the materials store and return it as a
+        MaterialsDoc. Raises LookupError if no such material exists. (The List annotation is stale.)
+        """
+        bulk = self.materials.query_one(criteria={'material_id': key}, properties=None)
+        if not bulk:
+            raise LookupError(
+                f"The bulk material ({key}) for these defects cannot be found in the materials store"
+            )
+        return MaterialsDoc(**bulk)
+
+    def __get_thermos(self, composition) -> List:  # NOTE: composition is unused; returns every single-element thermo doc
+        return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None))
+
+
+def unpack(query, d):
+    """
+    Walk d along a mongo-style path: a sequence of keys, or a string split on ".", ":" or "->".
+    Returns None if d is falsy at any level; list levels are indexed by int(key). query is not mutated.
+    """
+    if not d:
+        return None
+    if not query:
+        return d
+    if isinstance(query, str):
+        for separator in [".", ":", "->"]:
+            if separator in query:
+                return unpack(query.split(separator), d)
+        query = [query]
+    key = int(query[0]) if isinstance(d, list) else query[0]
+    return unpack(query[1:], d[key])
+
+# TODO SHOULD GO IN COMMON
+def get_sg(struc, symprec=.01) -> int:
+    """Return element [1] of struc.get_space_group_info(symprec=symprec), or -1 if that call raises."""
+    try:
+        return struc.get_space_group_info(symprec=symprec)[1]
+    except Exception:
+        return -1
\ No newline at end of file
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index b67aa6da03..d23c914822 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -40,6 +40,23 @@
 
 logger = logging.getLogger(__name__)
 
+@dataclass
+class DefectHybridStaticFlowMaker(HybridStaticFlowMaker):
+    """HybridStaticFlowMaker defaulting to DefectStaticMaker / DefectHybridStaticMaker for its two makers."""
+    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridStaticMaker)
+
+@dataclass 
+class DefectHybridRelaxFlowMaker(HybridRelaxFlowMaker):
+    """HybridRelaxFlowMaker defaulting to DefectStaticMaker / DefectHybridRelaxMaker for its two makers."""
+    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridRelaxMaker)
+
+@dataclass 
+class DefectHybridCellOptFlowMaker(HybridCellOptFlowMaker):
+    """HybridCellOptFlowMaker defaulting to DefectStaticMaker / DefectHybridCellOptMaker for its two makers."""
+    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridCellOptMaker)
 
 # TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
 @dataclass
@@ -81,40 +98,28 @@ class FormationEnergyMaker(Maker):
     def __post_init__(self):
         if self.run_bulk == 'relax':
             if self.hybrid_functional:
-                self.bulk_maker = HybridCellOptFlowMaker(
-                    initialize_with_pbe=self.initialize_with_pbe,
-                    hybrid_functional=self.hybrid_functional,
-                    hybrid_maker=HybridCellOptMaker(
-                        input_set_generator=DefectHybridCellOptSetGenerator(),
-                        task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
-                        )
+                self.bulk_maker = DefectHybridCellOptMaker(
+                    name="bulk hybrid relax",
+                    initialize_with_pbe=self.initialize_with_pbe, 
+                    hybrid_functional=self.hybrid_functional
                     )
             else:
-                self.bulk_maker = CellOptMaker(
-                    input_set_generator=DefectCellOptSetGenerator(),
-                    task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
-                    )
+                self.bulk_maker = DefectCellOptMaker(name="bulk relax")
+
         elif self.run_bulk == "static":
             if self.hybrid_functional:
-                self.bulk_maker = HybridStaticFlowMaker( 
+                self.bulk_maker = DefectHybridStaticFlowMaker( 
+                    name='bulk hybrid static',
+                    initialize_with_pbe=self.initialize_with_pbe,
                     hybrid_functional=self.hybrid_functional,
-                    hybrid_maker=HybridStaticMaker(
-                        input_set_generator=DefectHybridStaticSetGenerator(),
-                        task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
-                        )
                     )
             else:
-                self.bulk_maker = StaticMaker(
-                    input_set_generator=DefectStaticSetGenerator(),
-                    task_document_kwargs={"average_v_hartree": True, "store_volumetric_data": ("v_hartree",)}
-                    )
+                self.bulk_maker = DefectStaticMaker(name="bulk static")
 
         if self.hybrid_functional:
-            self.def_maker = HybridRelaxFlowMaker(
+            self.def_maker = DefectHybridRelaxFlowMaker(
                 hybrid_functional=self.hybrid_functional,
                 initialize_with_pbe=self.initialize_with_pbe,
-                initialize_maker=DefectStaticMaker(),
-                hybrid_maker=DefectHybridRelaxMaker(),
             )
         else:
             self.def_maker = DefectRelaxMaker()
@@ -128,7 +133,7 @@ def __post_init__(self):
 
     def make(
         self, defects: Iterable[Defect], 
-        run_all_charges: bool = False, 
+        charges: bool | Iterable[int] = False, 
         dielectric: NDArray | int | float | None = None,
         prev_cp2k_dir: str | Path | None = None):
         """Make a flow to run multiple defects in order to calculate their formation 
@@ -160,7 +165,10 @@ def make(
             jobs.append(bulk_job)
 
         for defect in defects:
-            chgs = defect.get_charge_states() if run_all_charges else [0]
+            if charges == True:
+                chgs = defect.get_charge_states() if charges else [0]
+            else:
+                chgs = charges if charges else [0]
             for charge in chgs:
                 defect_job = self.def_maker.make(deepcopy(defect), charge)
                 jobs.append(defect_job)
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 7a31173310..1d5db8f432 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -53,6 +53,8 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
                 min_length=self.min_length,
                 force_diagonal=self.force_diagonal,
             )
+
+            # provenance stuff
             self.write_additional_data.update(
             {
                 "info.json": {
@@ -64,15 +66,6 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
         else:
             structure = deepcopy(defect)
         structure.set_charge(charge)
-        # provenance stuff
-        self.write_additional_data.update(
-            {
-                "info.json": {
-                    "defect": deepcopy(defect), 
-                    "defect_charge": charge, 
-                    "sc_mat": self.supercell_matrix}
-                    }
-            )
         return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
 
 @dataclass
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 08917df314..736e592f64 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -17,7 +17,7 @@
 from atomate2 import SETTINGS
 
 from atomate2.common.schemas.structure import StructureMetadata
-from atomate2.cp2k.schemas.calc_types.utils import run_type, task_type
+from atomate2.cp2k.schemas.calc_types.utils import run_type, task_type, calc_type
 from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
 from atomate2.cp2k.schemas.task import TaskDocument
 
@@ -38,21 +38,21 @@ class Config:
 
     name: str = Field(None, description="Name of this defect as generated by the defect object")
 
-    material_id: int = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
+    material_id: str = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
 
-    task_ids: List[int] = Field(
+    task_ids: List[str] = Field(
         None, description="All task ids used in creating this defect doc."
     )
 
-    calc_types: Mapping[int, CalcType] = Field(  # type: ignore
+    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
         None,
         description="Calculation types for all the calculations that make up this material",
     )
-    task_types: Mapping[int, TaskType] = Field(
+    task_types: Mapping[str, TaskType] = Field(
         None,
         description="Task types for all the calculations that make up this material",
     )
-    run_types: Mapping[int, RunType] = Field(
+    run_types: Mapping[str, RunType] = Field(
         None,
         description="Run types for all the calculations that make up this material",
     )
@@ -125,12 +125,12 @@ def _compare(new, old):
                 self.entries[rt] = entry
                 self.tasks[rt] = (defect_task_doc, bulk_task_doc)
 
-    def update_all(self, tasks, query='defect'):
-        for defect_task, bulk_task, dielectric in tasks:
+    def update_all(self, defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect'):
+        for defect_task, bulk_task, dielectric in zip(defect_tasks, bulk_tasks, dielectrics):
             self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
 
     @classmethod
-    def from_tasks(cls, tasks: List, query='defect', material_id=None):
+    def from_tasks(cls, defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None):
         """
         The standard way to create this document.
         Args:
@@ -138,43 +138,45 @@ def from_tasks(cls, tasks: List, query='defect', material_id=None):
                 series of DefectEntry objects.
             query: How to retrieve the defect object stored in the task.
         """
-        task_group = [TaskDocument(**defect_task) for defect_task, bulk_task, dielectric in tasks]
-
+        task_ids = [defect_task[key] for defect_task in defect_tasks]
+        bulk_tasks= [TaskDocument(**bulk_task['output']) for bulk_task in bulk_tasks]
+        defects = [cls.get_defect_from_task(query=query, task=defect_task) for defect_task in defect_tasks]
+        defect_tasks = [TaskDocument(**defect_task['output']) for defect_task in defect_tasks]
+        
         # Metadata
-        last_updated = datetime.now() or max(task.last_updated for task in task_group)
-        created_at = datetime.now() or min(task.completed_at for task in task_group)
-        task_ids = {task.task_id for task in task_group}
+        last_updated = datetime.now() or max(task.last_updated for task in defect_tasks)
+        created_at = datetime.now() or min(task.completed_at for task in defect_tasks)
 
-        deprecated_tasks = list(
-            {task.task_id for task in task_group if not task.is_valid}
-        )
+        #deprecated_tasks = list(
+        #    {task.task_id for task in task_group if not task.is_valid}
+        #)
 
-        run_types = {task.task_id: task.run_type for task in task_group}
-        task_types = {task.task_id: task.task_type for task in task_group}
-        calc_types = {task.task_id: task.calc_type for task in task_group}
+        run_types = {id: task.calcs_reversed[0].run_type for id, task in zip(task_ids, defect_tasks)}
+        task_types = {id: task.calcs_reversed[0].task_type for id, task in zip(task_ids, defect_tasks)}
+        calc_types = {id: task.calcs_reversed[0].calc_type for id, task in zip(task_ids, defect_tasks)}
 
         def _run_type(x):
-            return run_type(x[0]['input']['dft']).value
-
-        def _task_type(x):
-            return task_type(x[0]['input']['dft']).value
+            return x[0].calcs_reversed[0].run_type.value
 
         def _sort(x):
             # TODO return kpoint density, currently just does supercell size
-            return -x[0]['nsites'], x[0]['output']['energy']
+            return -x[0].nsites, x[0].output.energy
 
         entries = {}
         final_tasks = {}
         metadata = {}
-        for key, tasks_for_runtype in groupby(sorted(tasks, key=_run_type), key=_run_type):
+        for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics), key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
-            ents = [cls.get_defect_entry_from_tasks(t[0], t[1], t[2], query) for t in sorted_tasks]
+            ents = [
+                cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric) 
+                for defect_task, bulk_task, defect, dielectric in sorted_tasks
+                ]
             best_entry = ents[0]
-            best_defect_task, best_bulk_task, dielectric = sorted_tasks[0]
-            metadata[key] = {'convergence': [(sorted_tasks[i][0]['nsites'], ents[i].energy) for i in range(len(ents))]}
-            best_defect_task, best_bulk_task = TaskDocument(**best_defect_task), TaskDocument(**best_bulk_task)
-            entries[best_defect_task.run_type] = best_entry
-            final_tasks[best_defect_task.run_type] = (best_defect_task, best_bulk_task)
+            best_defect_task  = sorted_tasks[0][0]
+            best_bulk_task = sorted_tasks[0][1]
+            metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i].corrected_energy) for i in range(len(ents))]}
+            entries[best_defect_task.calcs_reversed[0].run_type] = best_entry
+            final_tasks[best_defect_task.calcs_reversed[0].run_type] = (best_defect_task, best_bulk_task)
 
         data = {
                 'entries': entries,
@@ -184,20 +186,18 @@ def _sort(x):
                 'last_updated': last_updated,
                 'created_at': created_at,
                 'task_ids': task_ids,
-                'deprecated_tasks': deprecated_tasks,
+                #'deprecated_tasks': deprecated_tasks,
                 'tasks': final_tasks,
                 'material_id': material_id if material_id else best_entry.parameters['material_id'],
-                'entry_ids': {rt: entries[rt].entry_id for rt in entries},
                 'defect': best_entry.defect,
-                'name': best_entry.defect.name,
                 'metadata': metadata,
         }
-        prim = SpacegroupAnalyzer(best_entry.defect.bulk_structure).get_primitive_standard_structure()
+        prim = SpacegroupAnalyzer(best_entry.defect.structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
         return cls(**data)
 
     @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, query='transformations.history.0.defect'):
+    def get_defect_entry_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDocument, defect: Defect, dielectric=None):
         """
         Extract a defect entry from a single pair (defect and bulk) of tasks.
 
@@ -220,14 +220,14 @@ def get_defect_entry_from_tasks(cls, defect_task, bulk_task, dielectric=None, qu
             )
 
         defect_entry = DefectEntry(
-            defect=cls.get_defect_from_task(query=query, task=defect_task),
+            defect=defect,
             charge_state=parameters['charge_state'],
             sc_entry=sc_entry,
             sc_defect_frac_coords=parameters['defect_frac_sc_coords'],
             corrections=corrections,
         )
 
-        return defect_entry.as_dict()
+        return defect_entry
 
     @classmethod
     def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
@@ -240,7 +240,7 @@ def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
         return corrections, metadata
 
     @classmethod
-    def get_freysold_correction(cls, parameters) -> Tuple[Dict, Dict]:
+    def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
         if parameters['charge_state'] and not parameters.get("2d"):
             return get_freysoldt_correction(
                 q=parameters['charge_state'], dielectric=parameters['dielectric'], 
@@ -279,11 +279,10 @@ def get_defect_from_task(cls, query, task):
         Unpack a Mongo-style query and retrieve a defect object from a task.
         """
         defect = unpack(query.split('.'), task)
-        needed_keys = ['@module', '@class', 'structure', 'defect_site', 'charge', 'site_name']
-        return MontyDecoder().process_decoded({k: v for k, v in defect.items() if k in needed_keys})
+        return MontyDecoder().process_decoded(defect)
 
     @classmethod
-    def get_parameters_from_tasks(cls, defect_task, bulk_task):
+    def get_parameters_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDocument):
         """
         Get parameters necessary to create a defect entry from defect and bulk task dicts
         Args:
@@ -291,9 +290,6 @@ def get_parameters_from_tasks(cls, defect_task, bulk_task):
             bulk_task: task dict for the bulk calculation
         """
 
-        defect_task = TaskDocument(**defect_task)
-        bulk_task = TaskDocument(**bulk_task)
-
         final_defect_structure = defect_task.structure
         final_bulk_structure = bulk_task.structure
 
@@ -304,11 +300,9 @@ def get_parameters_from_tasks(cls, defect_task, bulk_task):
             defect_frac_sc_coords = DefectSiteFinder(SETTINGS.SYMPREC).get_defect_fpos(defect_structure=final_defect_structure, base_structure=final_bulk_structure)
 
         parameters = {
-            'defect_energy': defect_task['output']['energy'],
-            'bulk_energy': bulk_task['output']['energy'],
+            'defect_energy': defect_task.output.energy,
+            'bulk_energy': bulk_task.output.energy,
             'final_defect_structure': final_defect_structure,
-            'vbm': bulk_task['output']['vbm'],
-            'cbm': bulk_task['output']['cbm'],
             'charge_state': defect_task.output.structure.charge,
             'defect_frac_sc_coords': defect_frac_sc_coords,
             'defect_v_hartree': defect_task.cp2k_objects['v_hartree'], # TODO CP2K spec name

From 132a6d4b81c287dd245ec7af4d62ffcaf86f13ec Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 31 Oct 2022 11:05:29 -0700
Subject: [PATCH 05/50] Don't perturb when bulk

---
 src/atomate2/cp2k/flows/defect.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index d23c914822..069d80df42 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -99,12 +99,12 @@ def __post_init__(self):
         if self.run_bulk == 'relax':
             if self.hybrid_functional:
                 self.bulk_maker = DefectHybridCellOptMaker(
-                    name="bulk hybrid relax",
+                    name="bulk hybrid relax", transformations=None,
                     initialize_with_pbe=self.initialize_with_pbe, 
                     hybrid_functional=self.hybrid_functional
                     )
             else:
-                self.bulk_maker = DefectCellOptMaker(name="bulk relax")
+                self.bulk_maker = DefectCellOptMaker(name="bulk relax", transformations=None)
 
         elif self.run_bulk == "static":
             if self.hybrid_functional:

From a3feadd72d5594c40b0177dfd24d77ef09d71a30 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 31 Oct 2022 15:01:47 -0700
Subject: [PATCH 06/50] More robust for some reason

---
 src/atomate2/cp2k/flows/defect.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 069d80df42..20a1a265bd 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -161,7 +161,9 @@ def make(
                         self.force_diagonal,)
 
         if self.run_bulk:
-            bulk_job = self.bulk_maker.make(bulk_structure * sc_mat, prev_cp2k_dir=prev_cp2k_dir)
+            s = bulk_structure.copy()
+            s.make_supercell(sc_mat)
+            bulk_job = self.bulk_maker.make(s, prev_cp2k_dir=prev_cp2k_dir)
             jobs.append(bulk_job)
 
         for defect in defects:

From ab4cc6a2993d1d9bfb3b7b9c1e6bf27ea19284e0 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 31 Oct 2022 16:57:30 -0700
Subject: [PATCH 07/50] Expand parents

I cannot for the life of me figure out why using the combined parent breaks everything, but I guess the multiple_input_updator can only go one level deep for inheriting
---
 src/atomate2/cp2k/jobs/defect.py |  3 ++-
 src/atomate2/cp2k/sets/defect.py | 10 +++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 1d5db8f432..85f2ca5d6e 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -60,8 +60,9 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
                 "info.json": {
                     "defect": deepcopy(defect), 
                     "defect_charge": charge, 
-                    "sc_mat": self.supercell_matrix}
+                    "sc_mat": self.supercell_matrix
                     }
+                }
             )
         else:
             structure = deepcopy(defect)
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index df3a2cbe4c..2e454af5dc 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -9,7 +9,7 @@
 
 from atomate2.cp2k.sets.base import Cp2kInputGenerator, multiple_input_updators
 from atomate2.cp2k.sets.core import (
-    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
+    HybridSetGenerator, StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
     HybridStaticSetGenerator, HybridRelaxSetGenerator, HybridCellOptSetGenerator
 ) 
 logger = logging.getLogger(__name__)
@@ -41,15 +41,15 @@ class DefectCellOptSetGenerator(DefectSetGenerator, CellOptSetGenerator):
 
 @dataclass
 @multiple_input_updators()
-class DefectHybridStaticSetGenerator(DefectSetGenerator, HybridStaticSetGenerator):
+class DefectHybridStaticSetGenerator(DefectSetGenerator, StaticSetGenerator, HybridSetGenerator):
     pass   
 
 @dataclass
 @multiple_input_updators()
-class DefectHybridRelaxSetGenerator(DefectSetGenerator, HybridRelaxSetGenerator):
-    pass   
+class DefectHybridRelaxSetGenerator(DefectSetGenerator, RelaxSetGenerator, HybridSetGenerator):
+    pass
 
 @dataclass
 @multiple_input_updators()
-class DefectHybridCellOptSetGenerator(DefectSetGenerator, HybridCellOptSetGenerator):
+class DefectHybridCellOptSetGenerator(DefectSetGenerator, CellOptSetGenerator, HybridSetGenerator):
     pass 
\ No newline at end of file

From 99e3a55c3f16ff74f136d515743f08c01c2b84d2 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 31 Oct 2022 16:58:08 -0700
Subject: [PATCH 08/50] Ugly but functional

I'll look for a way to clean this up later
---
 src/atomate2/cp2k/flows/defect.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 20a1a265bd..a404701bd5 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -121,15 +121,27 @@ def __post_init__(self):
                 hybrid_functional=self.hybrid_functional,
                 initialize_with_pbe=self.initialize_with_pbe,
             )
-        else:
-            self.def_maker = DefectRelaxMaker()
+            self.def_maker.initialize_maker.supercell_matrix = self.supercell_matrix
+            self.def_maker.hybrid_maker.supercell_matrix = self.supercell_matrix
+
+            self.def_maker.initialize_maker.max_atoms = self.max_atoms
+            self.def_maker.hybrid_maker.max_atoms = self.max_atoms
 
+            self.def_maker.initialize_maker.min_atoms = self.min_atoms
+            self.def_maker.hybrid_maker.min_atoms = self.min_atoms
 
-        self.def_maker.supercell_matrix = self.supercell_matrix
-        self.def_maker.max_atoms = self.max_atoms
-        self.def_maker.min_atoms = self.min_atoms
-        self.def_maker.min_length = self.min_length
-        self.def_maker.force_diagonal = self.force_diagonal
+            self.def_maker.initialize_maker.min_length = self.min_length
+            self.def_maker.hybrid_maker.min_length = self.min_length
+
+            self.def_maker.initialize_maker.force_diagonal = self.force_diagonal
+            self.def_maker.hybrid_maker.force_diagonal = self.force_diagonal
+        else:
+            self.def_maker = DefectRelaxMaker()
+            self.def_maker.supercell_matrix = self.supercell_matrix
+            self.def_maker.max_atoms = self.max_atoms
+            self.def_maker.min_atoms = self.min_atoms
+            self.def_maker.min_length = self.min_length
+            self.def_maker.force_diagonal = self.force_diagonal
 
     def make(
         self, defects: Iterable[Defect], 
@@ -163,7 +175,7 @@ def make(
         if self.run_bulk:
             s = bulk_structure.copy()
             s.make_supercell(sc_mat)
-            bulk_job = self.bulk_maker.make(s, prev_cp2k_dir=prev_cp2k_dir)
+            bulk_job = self.bulk_maker.make(bulk_structure * sc_mat, prev_cp2k_dir=prev_cp2k_dir)
             jobs.append(bulk_job)
 
         for defect in defects:

From 8daf86d9ffc3414b9926fbdc5ff7f97a291f3b07 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 3 Nov 2022 12:55:40 -0700
Subject: [PATCH 09/50] Updates for cluster testing

---
 src/atomate2/cp2k/builders/defect.py |  3 +-
 src/atomate2/cp2k/flows/defect.py    |  2 +-
 src/atomate2/cp2k/jobs/defect.py     | 12 +++----
 src/atomate2/cp2k/schemas/defect.py  | 51 ++++++++++++++++------------
 4 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 354502650d..2c49f1ee74 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -127,7 +127,8 @@ def __init__(
             'output.output.structure',
             'output.input',
             'output.nsites',
-            'output.cp2k_objects.v_hartree'
+            'output.cp2k_objects.v_hartree',
+            'output.additional_json.info.sc_mat' # TODO figure out how to remove this requirement
         ] 
 
         self._required_bulk_properties = [
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index a404701bd5..c780860b81 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -233,7 +233,7 @@ def collect_defect_outputs(
             defect_entry = DefectEntry(
                 defect=defect,
                 charge_state=charge,
-                sc_entry=ComputedStructureEntry(structure=bulk_output.structure, energy=bulk_output.output.energy)
+                sc_entry=ComputedStructureEntry(structure=bulk_output.structure, energy=output_with_charge.output.energy - bulk_output.output.energy)
             )
             defect_entries.append(defect_entry)
             plot_data = defect_entry.get_freysoldt_correction(
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 85f2ca5d6e..c9f21889cd 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -72,7 +72,7 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
 @dataclass
 class DefectStaticMaker(BaseDefectMaker):
 
-    name: str = "defect static"
+    name: str = field(default="defect static")
     input_set_generator: DefectSetGenerator = field(
         default_factory=DefectStaticSetGenerator
         )
@@ -86,7 +86,7 @@ class DefectRelaxMaker(BaseDefectMaker):
     the hartree potential for finite size corrections.
     """
 
-    name: str = "defect relax"
+    name: str = field(default="defect relax")
     input_set_generator: Cp2kInputGenerator = field(default_factory=DefectRelaxSetGenerator)
     transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
     transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
@@ -100,7 +100,7 @@ class DefectCellOptMaker(BaseDefectMaker):
     the hartree potential for finite size corrections.
     """
 
-    name: str = "defect relax"
+    name: str = field(default="defect relax")
     input_set_generator: Cp2kInputGenerator = field(default_factory=DefectCellOptSetGenerator)
     transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
     transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
@@ -108,19 +108,19 @@ class DefectCellOptMaker(BaseDefectMaker):
 @dataclass
 class DefectHybridStaticMaker(DefectStaticMaker, HybridStaticMaker):
     
-    name: str = "defect hybrid static"
+    name: str = field(default="defect hybrid static")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridStaticSetGenerator)
 
 @dataclass
 class DefectHybridRelaxMaker(DefectRelaxMaker, HybridRelaxMaker):
 
-    name: str = "defect hybrid relax"
+    name: str = field(default="defect hybrid relax")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridRelaxSetGenerator)
 
 @dataclass
 class DefectHybridCellOptMaker(DefectCellOptMaker, HybridCellOptMaker):
 
-    name: str = "defect hybrid cell opt"
+    name: str = field(default="defect hybrid cell opt")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridCellOptSetGenerator)
 
 class GhostVacancy(Vacancy):
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 736e592f64..0d04ab0c87 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from tokenize import group
-from typing import ClassVar, Dict, Tuple, Mapping, List
+from typing import ClassVar, TypeVar, Type, Dict, Tuple, Mapping, List
 from pydantic import BaseModel, Field
 from pydantic import validator
 from itertools import groupby
@@ -21,6 +21,10 @@
 from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
 from atomate2.cp2k.schemas.task import TaskDocument
 
+__all__ = ["DefectDoc"]
+
+T = TypeVar("T", bound="DefectDoc")
+
 class DefectDoc(StructureMetadata):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
@@ -29,9 +33,6 @@ class DefectDoc(StructureMetadata):
     calculation of each run_type.
     """
 
-    class Config:
-        arbitrary_types_allowed = True
-
     property_name: ClassVar[str] = "defect"
 
     defect: Defect = Field(None, description="Pymatgen defect object for this defect doc")
@@ -40,8 +41,9 @@ class Config:
 
     material_id: str = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
 
+    # TODO Should it be all (defect + bulk) ids?
     task_ids: List[str] = Field(
-        None, description="All task ids used in creating this defect doc."
+        None, description="All defect task ids used in creating this defect doc."
     )
 
     calc_types: Mapping[str, CalcType] = Field(  # type: ignore
@@ -57,8 +59,12 @@ class Config:
         description="Run types for all the calculations that make up this material",
     )
 
-    tasks: Mapping[RunType, Tuple[TaskDocument, TaskDocument]] = Field(
-        None, description="Task documents (defect task, bulk task) for the defect entry of RunType"
+    best_tasks: Mapping[RunType, Tuple[str, str]] = Field(
+        None, description="Task ids (defect task, bulk task) for all tasks of a RunType"
+    )
+
+    all_tasks: Mapping[RunType, List[Tuple[str, str]]] = Field(
+        None, description="Task ids (defect task, bulk task) for all tasks of a RunType"
     )
 
     entries: Mapping[RunType, DefectEntry] = Field(
@@ -130,7 +136,7 @@ def update_all(self, defect_tasks: List, bulk_tasks: List, dielectrics: List, qu
             self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
 
     @classmethod
-    def from_tasks(cls, defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None):
+    def from_tasks(cls: Type[T], defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None):
         """
         The standard way to create this document.
         Args:
@@ -138,7 +144,8 @@ def from_tasks(cls, defect_tasks: List, bulk_tasks: List, dielectrics: List, que
                 series of DefectEntry objects.
             query: How to retrieve the defect object stored in the task.
         """
-        task_ids = [defect_task[key] for defect_task in defect_tasks]
+        defect_task_ids = [defect_task[key] for defect_task in defect_tasks]
+        bulk_task_ids = [bulk_task[key] for bulk_task in bulk_tasks]
         bulk_tasks= [TaskDocument(**bulk_task['output']) for bulk_task in bulk_tasks]
         defects = [cls.get_defect_from_task(query=query, task=defect_task) for defect_task in defect_tasks]
         defect_tasks = [TaskDocument(**defect_task['output']) for defect_task in defect_tasks]
@@ -151,9 +158,9 @@ def from_tasks(cls, defect_tasks: List, bulk_tasks: List, dielectrics: List, que
         #    {task.task_id for task in task_group if not task.is_valid}
         #)
 
-        run_types = {id: task.calcs_reversed[0].run_type for id, task in zip(task_ids, defect_tasks)}
-        task_types = {id: task.calcs_reversed[0].task_type for id, task in zip(task_ids, defect_tasks)}
-        calc_types = {id: task.calcs_reversed[0].calc_type for id, task in zip(task_ids, defect_tasks)}
+        run_types = {id: task.calcs_reversed[0].run_type for id, task in zip(defect_task_ids, defect_tasks)}
+        task_types = {id: task.calcs_reversed[0].task_type for id, task in zip(defect_task_ids, defect_tasks)}
+        calc_types = {id: task.calcs_reversed[0].calc_type for id, task in zip(defect_task_ids, defect_tasks)}
 
         def _run_type(x):
             return x[0].calcs_reversed[0].run_type.value
@@ -163,20 +170,21 @@ def _sort(x):
             return -x[0].nsites, x[0].output.energy
 
         entries = {}
-        final_tasks = {}
+        all_tasks = {}
+        best_tasks = {}
         metadata = {}
-        for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics), key=_run_type), key=_run_type):
+        for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics, defect_task_ids, bulk_task_ids), key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
             ents = [
                 cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric) 
-                for defect_task, bulk_task, defect, dielectric in sorted_tasks
+                for defect_task, bulk_task, defect, dielectric, did, bid in sorted_tasks
                 ]
+            rt = run_types[sorted_tasks[0][-2]]
             best_entry = ents[0]
-            best_defect_task  = sorted_tasks[0][0]
-            best_bulk_task = sorted_tasks[0][1]
+            best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1]) 
+            all_tasks[rt] = [ (s[-2], s[-1]) for s in sorted_tasks ]
             metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i].corrected_energy) for i in range(len(ents))]}
-            entries[best_defect_task.calcs_reversed[0].run_type] = best_entry
-            final_tasks[best_defect_task.calcs_reversed[0].run_type] = (best_defect_task, best_bulk_task)
+            entries[rt] = ents[0]
 
         data = {
                 'entries': entries,
@@ -185,9 +193,10 @@ def _sort(x):
                 'calc_types': calc_types,
                 'last_updated': last_updated,
                 'created_at': created_at,
-                'task_ids': task_ids,
+                'task_ids': defect_task_ids,
                 #'deprecated_tasks': deprecated_tasks,
-                'tasks': final_tasks,
+                'all_tasks': all_tasks,
+                'best_tasks': best_tasks,
                 'material_id': material_id if material_id else best_entry.parameters['material_id'],
                 'defect': best_entry.defect,
                 'metadata': metadata,

From 804f4bbf301dca36f2b7670a7c665e66ab1488d3 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 8 Nov 2022 11:05:41 -0800
Subject: [PATCH 10/50] copy info

---
 src/atomate2/cp2k/flows/defect.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index c780860b81..5098d16dfa 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -44,19 +44,25 @@
 class DefectHybridStaticFlowMaker(HybridStaticFlowMaker):
 
     initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridStaticMaker)
+    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridStaticMaker(
+        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+        )
 
 @dataclass 
 class DefectHybridRelaxFlowMaker(HybridRelaxFlowMaker):
 
     initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridRelaxMaker)
+    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridRelaxMaker(
+        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+        )
 
 @dataclass 
 class DefectHybridCellOptFlowMaker(HybridCellOptFlowMaker):
 
     initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default_factory=DefectHybridCellOptMaker)
+    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridCellOptMaker(
+        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+        )
 
 # TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
 @dataclass

From 41a2504c113937fe7f0e474b16fa8380b6581011 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Sat, 26 Nov 2022 17:30:32 -0800
Subject: [PATCH 11/50] Defects

---
 src/atomate2/cp2k/builders/defect.py |  4 ++--
 src/atomate2/cp2k/jobs/defect.py     |  6 +++---
 src/atomate2/cp2k/schemas/defect.py  | 20 +++++++++++---------
 src/atomate2/cp2k/schemas/task.py    |  4 ++++
 4 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 2c49f1ee74..b6047e3b3a 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -543,8 +543,8 @@ def __get_item_bundle(self, task_ids):
         """
         return [
             (
-                self.tasks.query_one(criteria={self.tasks.key: defect_tasks_id}),
-                self.tasks.query_one(criteria={self.tasks.key: bulk_tasks_id}),
+                self.tasks.query_one(criteria={self.tasks.key: defect_tasks_id}, load=True),
+                self.tasks.query_one(criteria={self.tasks.key: bulk_tasks_id}, load=True), # load all for now
                 self.__get_dielectric(self._mpid_map[bulk_tasks_id]),
             )
             for defect_tasks_id, bulk_tasks_id in task_ids
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index c9f21889cd..e1b4515b74 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -11,7 +11,7 @@
 
 from pymatgen.core import Structure
 from pymatgen.analysis.defects.core import Defect, Vacancy
-from atomate2.cp2k.sets.base import Cp2kInputGenerator
+from atomate2.cp2k.sets.base import Cp2kInputGenerator, recursive_update
 from atomate2.cp2k.sets.defect import (
     DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator, 
     DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
@@ -55,8 +55,7 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
             )
 
             # provenance stuff
-            self.write_additional_data.update(
-            {
+            recursive_update(self.write_additional_data, {
                 "info.json": {
                     "defect": deepcopy(defect), 
                     "defect_charge": charge, 
@@ -64,6 +63,7 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
                     }
                 }
             )
+            
         else:
             structure = deepcopy(defect)
         structure.set_charge(charge)
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 0d04ab0c87..ece81f2959 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -270,15 +270,15 @@ def get_freysoldt2d_correction(cls, parameters):
             dielectric = (eps_parallel - 1) / (1 - 1/eps_perp)
             with ScratchDir('.'):
                 
-                lref = VaspVolumetricData(structure=parameters['bulk_locpot'].structure, data=parameters['bulk_locpot'].data)
-                ldef = VaspVolumetricData(structure=parameters['defect_locpot'].structure, data=parameters['defect_locpot'].data)
+                lref = VaspVolumetricData(structure=parameters['bulk_v_hartree'].structure, data=parameters['bulk_v_hartree'].data)
+                ldef = VaspVolumetricData(structure=parameters['defect_v_hartree'].structure, data=parameters['defect_v_hartree'].data)
                 lref.write_file("LOCPOT.ref")
                 ldef.write_file("LOCPOT.def")
 
                 return get_freysoldt2d_correction(
-                    q=parameters['charge_state'], dielectric=dielectric, defect_locpot="LOCPOT.def", 
-                    bulk_locpot="LOCPOT.ref", defect_frac_coords=parameters['defect_frac_sc_coords'], 
-                    energy_cutoff=250, slab_buffer=2
+                    q=parameters['charge_state'], dielectric=dielectric, defect_locpot=ldef, 
+                    bulk_locpot=lref, defect_frac_coords=parameters['defect_frac_sc_coords'], 
+                    energy_cutoff=520, slab_buffer=2
                     )
         return {}, {}
 
@@ -304,20 +304,22 @@ def get_parameters_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDoc
 
         ghost = [index for index, prop in enumerate(final_defect_structure.site_properties.get("ghost")) if prop]
         if ghost:
-            defect_frac_sc_coords = final_defect_structure[ghost[0]]
+            defect_frac_sc_coords = final_defect_structure[ghost[0]].frac_coords
         else:
             defect_frac_sc_coords = DefectSiteFinder(SETTINGS.SYMPREC).get_defect_fpos(defect_structure=final_defect_structure, base_structure=final_bulk_structure)
-
         parameters = {
             'defect_energy': defect_task.output.energy,
             'bulk_energy': bulk_task.output.energy,
             'final_defect_structure': final_defect_structure,
             'charge_state': defect_task.output.structure.charge,
             'defect_frac_sc_coords': defect_frac_sc_coords,
-            'defect_v_hartree': defect_task.cp2k_objects['v_hartree'], # TODO CP2K spec name
-            'bulk_v_hartree': bulk_task.cp2k_objects['v_hartree'], # TODO CP2K spec name
+            'defect_v_hartree': MontyDecoder().process_decoded(defect_task.cp2k_objects['v_hartree']), # TODO CP2K spec name
+            'bulk_v_hartree': MontyDecoder().process_decoded(bulk_task.cp2k_objects['v_hartree']), # TODO CP2K spec name
         }
 
+        if defect_task.tags and "2d" in defect_task.tags:
+            parameters['2d'] = True
+
         return parameters
 
 def unpack(query, d):
diff --git a/src/atomate2/cp2k/schemas/task.py b/src/atomate2/cp2k/schemas/task.py
index 72905ddd40..c1e3a088ed 100644
--- a/src/atomate2/cp2k/schemas/task.py
+++ b/src/atomate2/cp2k/schemas/task.py
@@ -336,6 +336,10 @@ def from_directory(
 
         analysis = AnalysisSummary.from_cp2k_calc_docs(calcs_reversed)
         transformations, icsd_id, tags, author = _parse_transformations(dir_name)
+        if tags:
+            tags.extend(additional_fields.get("tags", []))
+        else:
+            tags = additional_fields.get('tags')
         custodian = _parse_custodian(dir_name)
         orig_inputs = _parse_orig_inputs(dir_name)
 

From a9fcc37c23c48fadcb6cbb4ccd422a4a1ae116d6 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 28 Nov 2022 14:40:50 -0800
Subject: [PATCH 12/50] Charge

If a Structure is passed instead of a Defect and no nonzero charge is given,
use the charge stored on the structure.
---
 src/atomate2/cp2k/jobs/defect.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index e1b4515b74..a5c849cf87 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -65,7 +65,9 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
             )
             
         else:
+            charge = charge if charge else defect.charge
             structure = deepcopy(defect)
+
         structure.set_charge(charge)
         return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
 

From a0725aa2eb742d1b3e02a6edde673c9b5fe33c20 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 28 Nov 2022 16:17:43 -0800
Subject: [PATCH 13/50] Def Builder

---
 src/atomate2/cp2k/builders/defect.py | 112 +++++++++++++++++++--------
 1 file changed, 79 insertions(+), 33 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index b6047e3b3a..fd90e5b8ae 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -5,6 +5,7 @@
 from typing import Dict, Iterator, List, Literal, Optional
 from copy import deepcopy
 from math import ceil
+import numpy as np
 from monty.json import MontyDecoder, jsanitize
 
 from maggma.builders import Builder
@@ -13,6 +14,7 @@
 
 from pymatgen.core import Structure
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
+from pymatgen.electronic_structure.dos import CompleteDos
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 
 from atomate.utils.utils import load_class
@@ -24,7 +26,7 @@
 
 from atomate2.settings import Atomate2Settings
 from atomate2.cp2k.schemas.task import TaskDocument
-from atomate2.cp2k.schemas.defect import DefectDoc
+from atomate2.cp2k.schemas.defect import DefectDoc, DefectiveMaterialDoc
 from atomate2.cp2k.schemas.calc_types import TaskType
 from atomate2.cp2k.schemas.calc_types.utils import run_type
 
@@ -33,8 +35,6 @@
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 
 
-# TODO this builder is very close to being code agnostic. We need only resolve the standard key names and
-# how they are fed to the DefectDoc class. e.g. VASP calcs store "locpot", but CP2K store "v_hartree"
 class DefectBuilder(Builder):
     """
     The DefectBuilder collects task documents performed on structures containing a single point defect.
@@ -60,7 +60,11 @@ class DefectBuilder(Builder):
     """
 
     #TODO how to incorporate into settings?
-    DEFAULT_ALLOWED_TASKS = [
+    DEFAULT_ALLOWED_DFCT_TASKS = [
+            TaskType.Structure_Optimization.value, 
+            ]
+
+    DEFAULT_ALLOWED_BULK_TASKS = [
             TaskType.Structure_Optimization.value, 
             TaskType.Static.value
             ]
@@ -76,7 +80,8 @@ def __init__(
         task_validation: Optional[Store] = None,
         query: Optional[Dict] = None,
         bulk_query: Optional[Dict] = None,
-        allowed_task_types: Optional[List[str]] = DEFAULT_ALLOWED_TASKS,
+        allowed_dfct_types: Optional[List[str]] = DEFAULT_ALLOWED_DFCT_TASKS,
+        allowed_bulk_types: Optional[List[str]] = DEFAULT_ALLOWED_BULK_TASKS, 
         task_schema: Literal["cp2k"] = "cp2k", # TODO cp2k specific right now, but this will go in common eventually
         settings: Dict | None = None,
         **kwargs,
@@ -103,9 +108,9 @@ def __init__(
         self.electronic_structure = electronic_structure
         self.electrostatic_potentials = electrostatic_potentials
         self.task_validation = task_validation
-        self.allowed_task_types = allowed_task_types #TODO How to incorporate into getitems?
+        self._allowed_dfct_types = allowed_dfct_types #TODO How to incorporate into getitems?
+        self._allowed_bulk_types = allowed_bulk_types #TODO How to incorporate into getitems?
 
-        self._allowed_task_types = {TaskType(t) for t in self.allowed_task_types}
         settings = settings if settings else {}
         self.settings = Atomate2Settings(**settings) # TODO don't think this is right 
         self.query = query if query else {}
@@ -128,7 +133,6 @@ def __init__(
             'output.input',
             'output.nsites',
             'output.cp2k_objects.v_hartree',
-            'output.additional_json.info.sc_mat' # TODO figure out how to remove this requirement
         ] 
 
         self._required_bulk_properties = [
@@ -187,6 +191,14 @@ def optional_bulk_properties(self) -> List:
     def mpid_map(self) -> Dict:
         return self._mpid_map
 
+    @property
+    def allowed_dfct_types(self) -> set:
+        return {TaskType(t) for t in self._allowed_dfct_types}
+
+    @property
+    def allowed_bulk_types(self) -> set:
+        return {TaskType(t) for t in self._allowed_bulk_types}
+    
     def ensure_indexes(self):
         """
         Ensures indicies on the tasks and materials collections
@@ -276,7 +288,10 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         self.logger.info("Defect builder started")
         self.logger.info(
-            f"Allowed task types: {[task_type.value for task_type in self._allowed_task_types]}"
+            f"Allowed defect types: {[task_type.value for task_type in self.allowed_dfct_types]}"
+        )
+        self.logger.info(
+            f"Allowed bulk types: {[task_type.value for task_type in self.allowed_bulk_types]}"
         )
 
         self.logger.info("Setting indexes")
@@ -287,7 +302,7 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         self.logger.info("Finding tasks to process")
 
-        # Get defect tasks
+        ##### Get defect tasks #####
         temp_query = self.query.copy()
         temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
         temp_query.update({self.defect_query: {'$exists': True}, "output.state": "successful"})
@@ -296,7 +311,17 @@ def get_items(self) -> Iterator[List[Dict]]:
             for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
         }
 
-        # Get bulk tasks
+        # TODO Seems slow
+        not_allowed = {
+            doc[self.tasks.key] 
+            for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(defect_tasks)}})
+            if TaskDocument(**doc['output']).calcs_reversed[0].task_type not in self.allowed_dfct_types
+        }
+        if not_allowed:
+            self.logger.debug(f"{len(not_allowed)} defect tasks dropped. Not allowed TaskType")
+        defect_tasks = defect_tasks - not_allowed
+
+        ##### Get bulk tasks #####
         temp_query = self.bulk_query.copy()
         temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
         temp_query.update({self.defect_query: {'$exists': False}, "output.state": "successful"})
@@ -304,6 +329,16 @@ def get_items(self) -> Iterator[List[Dict]]:
             doc[self.tasks.key]
             for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
         }
+        
+        # TODO seems slow
+        not_allowed = {
+            doc[self.tasks.key] 
+            for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(bulk_tasks)}})
+            if TaskDocument(**doc['output']).calcs_reversed[0].task_type not in self.allowed_bulk_types
+        }
+        if not_allowed:
+            self.logger.debug(f"{len(not_allowed)} bulk tasks dropped. Not allowed TaskType")
+        bulk_tasks = bulk_tasks - not_allowed
 
         # TODO Not the same validation behavior as material builders?
         # If validation store exists, find tasks that are invalid and remove them
@@ -393,7 +428,7 @@ def process_item(self, items):
                     defect_tasks=defect_tasks, bulk_tasks=bulk_tasks, dielectrics=dielectrics,
                     query=self.defect_query, key=self.tasks.key, material_id=material_id
                     )
-            return defect_doc.dict()
+            return jsanitize(defect_doc.dict(), allow_bson=True, enum_values=True, strict=True)
         return {}
 
     def update_targets(self, items):
@@ -412,10 +447,7 @@ def update_targets(self, items):
                        "task_ids": item['task_ids'],
                     }
                 )
-            self.defects.update(
-                docs=jsanitize(items, allow_bson=True),
-                key='task_ids',
-            )
+            self.defects.update(items, key='task_ids')
         else:
             self.logger.info("No items to update")
 
@@ -462,19 +494,25 @@ def are_equal(x, y):
             To decide if defects are equal. Either the defect objects are
             equal, OR two different defect objects relaxed to the same final structure
             (common with interstitials).
-:w
 
             TODO Need a way to do the output structure comparison for a X atom defect cell
             TODO which can be embedded in a Y atom defect cell up to tolerance.
             """
+
+            # Defects with diff charges return true for the native __eq__
+            if x['structure'].charge != y['structure'].charge:
+                return False
+
+            # Are the defect objects eq.
             if x['defect'] == y['defect']:
                 return True
 
-            # TODO This is needed for ghost vacancy unfortunately, since sm.fit can't distinguish ghosts
-            if x['defect'].defect_composition == y['defect'].defect_composition and \
-                    x['defect'].charge == y['defect'].charge and \
+            # Are the final structures equal
+            # element-changes needed for ghost vacancies, since sm.fit can't distinguish them
+            if x['defect'].element_changes == y['defect'].element_changes and \
                     sm.fit(x['structure'], y['structure']):
                 return True
+
             return False
 
         sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
@@ -682,21 +720,28 @@ def __get_pristine_supercell(self, task):
             - If no follow up transform exists, the calculation input will be returned
 
         If defect cannot be found in task, return the input structure.
+
+        scale_matrix = np.array(scaling_matrix, int)
+        if scale_matrix.shape != (3, 3):
+            scale_matrix = np.array(scale_matrix * np.eye(3), int)
+        new_lattice = Lattice(np.dot(scale_matrix, self._lattice.matrix))
         """
         d = unpack(query=self.defect_query, d=task)
+        out_structure = MontyDecoder().process_decoded(task['output']['output']['structure'])
         if d:
             defect = MontyDecoder().process_decoded(d)
-            sc_mat = task.get('output', {}).get('additional_json', {}).get("info", {}).get('sc_mat')
             s = defect.structure.copy()
+            sc_mat = out_structure.lattice.matrix.dot(np.linalg.inv(s.lattice.matrix))
             s.make_supercell(sc_mat)
             return s
         else:
-            return MontyDecoder().process_decoded(task['output']['output']['structure'])
+            return out_structure
 
 #TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id
 #TODO This is a problem because the material id in vasp store is not synced to cp2k store
 #TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others
-class DefectThermoBuilder(Builder):
+#TODO Thermo store is being replaced with a manual definition of chempots until further notice
+class DefectiveMaterialBuilder(Builder):
 
     """
     This builder creates collections of the DefectThermoDoc object.
@@ -712,9 +757,9 @@ def __init__(
             defects: Store,
             defect_thermos: Store,
             materials: Store,
-            thermo: Store,
             electronic_structures: Store,
             dos: Store,
+            thermo: Dict,
             query: Optional[Dict] = None,
             **kwargs,
     ):
@@ -731,14 +776,14 @@ def __init__(
         self.defect_thermos = defect_thermos
         self.materials = materials
         self.thermo = thermo
-        self.dos = dos
         self.electronic_structures = electronic_structures
+        self.dos = dos
 
         self.query = query if query else {}
         self.timestamp = None
         self.kwargs = kwargs
 
-        super().__init__(sources=[defects, materials, thermo, electronic_structures, dos], targets=[defect_thermos], **kwargs)
+        super().__init__(sources=[defects, materials, electronic_structures, dos], targets=[defect_thermos], **kwargs)
 
     def ensure_indexes(self):
         """
@@ -785,12 +830,13 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         def filterfunc(x):
             # material for defect x exists
-            if not list(self.materials.query(criteria={'material_id': x['material_id']}, properties=None)):
+            if not self.materials.query_one(criteria={'material_id': x['material_id']}, properties=None):
                 self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
                 return False
 
-            for el in load_class(x['defect']['@module'], x['defect']['@class']).from_dict(x['defect']).defect_composition:
-                if not list(self.thermo.query(criteria={'chemsys': str(el)}, properties=None)):
+            defect = MontyDecoder().process_decoded(x['defect'])
+            for el in defect.element_changes: 
+                if el not in self.thermo:
                     self.logger.debug(f"No entry for {el} in Thermo Store")
                     return False
 
@@ -805,7 +851,7 @@ def filterfunc(x):
             group = [g for g in group]
             try:
                 mat = self.__get_materials(key)
-                thermo = self.__get_thermos(mat.composition)
+                thermo = self.thermo #self.__get_thermos(mat.composition)
                 elec = self.__get_electronic_structure(group[0]['material_id'])
                 yield (group, mat, thermo, elec)
             except LookupError as exception:
@@ -816,10 +862,10 @@ def process_item(self, docs):
         Process a group of defects belonging to the same material into a defect thermo doc
         """
         self.logger.info(f"Processing defects")
-        defects, material, thermos, elec_struc = docs
+        defects, material, thermo, dos = docs
         defects = [DefectDoc(**d) for d in defects]
-        thermos = [ThermoDoc(**t) for t in thermos]
-        defect_thermo_doc = DefectThermoDoc.from_docs(defects, thermos=thermos, electronic_structure=elec_struc)
+        dos = CompleteDos.from_dict(dos)
+        defect_thermo_doc = DefectiveMaterialDoc.from_docs(defects, thermo=thermo, dos=dos)
         return defect_thermo_doc.dict()
 
     def update_targets(self, items):

From 725f7cae97634c6ce9169d313d7831d673b2150f Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 28 Nov 2022 16:18:25 -0800
Subject: [PATCH 14/50] DefectDoc

---
 src/atomate2/cp2k/schemas/defect.py | 53 +++++++++++++++++------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index ece81f2959..d436d8e80d 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -9,10 +9,10 @@
 from monty.tempfile import ScratchDir
 
 from pymatgen.core import Structure
-from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.analysis.defects.core import Defect
 from pymatgen.analysis.defects.corrections import get_freysoldt_correction, get_freysoldt2d_correction
-from pymatgen.analysis.defects.thermo import DefectEntry, DefectSiteFinder
+from pymatgen.analysis.defects.thermo import DefectEntry, DefectSiteFinder, FormationEnergyDiagram
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from atomate2 import SETTINGS
 
@@ -67,9 +67,13 @@ class DefectDoc(StructureMetadata):
         None, description="Task ids (defect task, bulk task) for all tasks of a RunType"
     )
 
-    entries: Mapping[RunType, DefectEntry] = Field(
+    defect_entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
+    
+    bulk_entries: Mapping[RunType, ComputedStructureEntry] = Field(
+        None, description="Computed structure entry for the bulk calc."
+    )
 
     last_updated: datetime = Field(
         description="Timestamp for when this document was last updated",
@@ -83,14 +87,6 @@ class DefectDoc(StructureMetadata):
 
     metadata: Dict = Field(description="Metadata for this defect")
 
-    # TODO How can monty serialization incorporate into pydantic? It seems like VASP MatDocs dont need this
-    @validator("entries", pre=True)
-    def decode(cls, entries):
-        for e in entries:
-            if isinstance(entries[e], dict):
-                entries[e] = MontyDecoder().process_decoded({k: v for k, v in entries[e].items()})
-        return entries
-
     def update(self, defect_task, bulk_task, dielectric, query='defect'):
 
         defect_task_doc = TaskDocument(**defect_task)
@@ -136,7 +132,7 @@ def update_all(self, defect_tasks: List, bulk_tasks: List, dielectrics: List, qu
             self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
 
     @classmethod
-    def from_tasks(cls: Type[T], defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None):
+    def from_tasks(cls: Type[T], defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None) -> T:
         """
         The standard way to create this document.
         Args:
@@ -169,25 +165,30 @@ def _sort(x):
             # TODO return kpoint density, currently just does supercell size
             return -x[0].nsites, x[0].output.energy
 
-        entries = {}
+        defect_entries = {}
+        bulk_entries = {}
         all_tasks = {}
         best_tasks = {}
         metadata = {}
         for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics, defect_task_ids, bulk_task_ids), key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
             ents = [
-                cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric) 
+                (
+                    cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric),
+                    cls.get_bulk_entry_from_task(bulk_task)
+                )
                 for defect_task, bulk_task, defect, dielectric, did, bid in sorted_tasks
                 ]
             rt = run_types[sorted_tasks[0][-2]]
-            best_entry = ents[0]
             best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1]) 
             all_tasks[rt] = [ (s[-2], s[-1]) for s in sorted_tasks ]
-            metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i].corrected_energy) for i in range(len(ents))]}
-            entries[rt] = ents[0]
+            metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i][0].corrected_energy) for i in range(len(ents))]}
+            defect_entries[rt], bulk_entries[rt] = ents[0]
 
+        v = next(iter(defect_entries.values())) 
         data = {
-                'entries': entries,
+                'defect_entries': defect_entries,
+                "bulk_entries": bulk_entries,
                 'run_types': run_types,
                 'task_types': task_types,
                 'calc_types': calc_types,
@@ -197,11 +198,12 @@ def _sort(x):
                 #'deprecated_tasks': deprecated_tasks,
                 'all_tasks': all_tasks,
                 'best_tasks': best_tasks,
-                'material_id': material_id if material_id else best_entry.parameters['material_id'],
-                'defect': best_entry.defect,
+                'material_id': material_id if material_id else v.parameters['material_id'],
+                'defect': v.defect, 
+                "name": v.defect.name,
                 'metadata': metadata,
         }
-        prim = SpacegroupAnalyzer(best_entry.defect.structure).get_primitive_standard_structure()
+        prim = SpacegroupAnalyzer(v.defect.structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
         return cls(**data)
 
@@ -225,7 +227,7 @@ def get_defect_entry_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskD
 
         sc_entry = ComputedStructureEntry(
             structure=parameters['final_defect_structure'], 
-            energy=parameters['defect_energy'] - parameters['bulk_energy']
+            energy=parameters['defect_energy']
             )
 
         defect_entry = DefectEntry(
@@ -238,6 +240,13 @@ def get_defect_entry_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskD
 
         return defect_entry
 
+    @classmethod
+    def get_bulk_entry_from_task(cls, bulk_task: TaskDocument):
+        return ComputedStructureEntry(
+            structure=bulk_task.structure,
+            energy=bulk_task.output.energy,
+        )
+
     @classmethod
     def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
         corrections = {}

From 441c4f87c1aadd3235926c261f19ec492af67c1e Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 28 Nov 2022 16:38:33 -0800
Subject: [PATCH 15/50] DefectiveMat

---
 src/atomate2/cp2k/builders/defect.py |  3 --
 src/atomate2/cp2k/schemas/defect.py  | 64 ++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index fd90e5b8ae..128eb72817 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -19,11 +19,8 @@
 
 from atomate.utils.utils import load_class
 
-from emmet.core.thermo import ThermoDoc
 from emmet.core.material import MaterialsDoc
 
-from emmet.builders.settings import EmmetBuildSettings
-
 from atomate2.settings import Atomate2Settings
 from atomate2.cp2k.schemas.task import TaskDocument
 from atomate2.cp2k.schemas.defect import DefectDoc, DefectiveMaterialDoc
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index d436d8e80d..302dcbd1ee 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -24,6 +24,7 @@
 __all__ = ["DefectDoc"]
 
 T = TypeVar("T", bound="DefectDoc")
+S = TypeVar("S", bound="DefectiveMaterialDoc")
 
 class DefectDoc(StructureMetadata):
     """
@@ -331,6 +332,69 @@ def get_parameters_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDoc
 
         return parameters
 
+class DefectiveMaterialDoc(StructureMetadata):
+    """Document containing all / many defect tasks for a single material ID"""
+
+    property_name: ClassVar[str] = "defective material"
+
+    material_id: str = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
+
+    formation_energy_diagrams: Mapping[RunType, FormationEnergyDiagram] = Field(None, description="")
+
+    last_updated: datetime = Field(
+        description="Timestamp for when this document was last updated",
+        default_factory=datetime.utcnow,
+    )
+
+    created_at: datetime = Field(
+        description="Timestamp for when this material document was first created",
+        default_factory=datetime.utcnow,
+    )
+
+    metadata: Dict = Field(None, description="Metadata for this object")
+
+    @classmethod
+    def from_docs(cls: Type["S"], defect_docs: DefectDoc, thermo: Dict, dos) -> S:
+        """
+        # Metadata
+        metadata = {}
+        last_updated = datetime.now() 
+        created_at = datetime.now() 
+
+        bulk_ents = {}
+        dfct_ents = {}
+        formation_energy_diagrams = {}
+        els = set()
+        for doc in defect_docs:
+            els = els | set(doc.defect.element_changes.keys())
+            for rt, defect_entry in doc.defect_entries.items():
+                if rt not in dfct_ents:
+                    dfct_ents[rt] = []
+                dfct_ents[rt].append(defect_entry)
+            bulk_ents[rt] = doc.bulk_entries[rt]
+
+        atomic_entries = [ComputedEntry(composition=str(el), energy=thermo[el]) for el in els]
+
+        for rt in dfct_ents:
+
+            pd = PhaseDiagram(mp_entries)
+            cbm, vbm = dos.get_cbm_vbm()
+            
+            adjusted_entries = _get_adjusted_pd_entries(
+                    phase_diagram=pd, atomic_entries=atomic_entries
+                )
+
+            formation_energy_diagrams[rt] = FormationEnergyDiagram.with_atomic_entries(
+                                                bulk_entry=bulk_ents[rt], defect_entries=dfct_ents[rt],
+                                                atomic_entries=atomic_entries, phase_diagram=pd, vbm=vbm,
+                                                band_gap=cbm-vbm,
+                                            )
+        """
+
+        raise NotImplementedError
+        
+       
+
 def unpack(query, d):
     if not query:
         return d

From 1ddeec51d63aed13d92537fd640324fd1a91274c Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 28 Nov 2022 18:05:54 -0800
Subject: [PATCH 16/50] Print PDOS

---
 src/atomate2/cp2k/jobs/defect.py | 3 ++-
 src/atomate2/cp2k/sets/defect.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index a5c849cf87..43ce4717d7 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -23,7 +23,8 @@
 
 DEFECT_TASK_DOC = {
     "average_v_hartree": True,
-    "store_volumetric_data": ("v_hartree",)
+    "store_volumetric_data": ("v_hartree",),
+    "print_pdos": True,
 }
 
 @dataclass 
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 2e454af5dc..20c0fbbf12 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -22,7 +22,7 @@ class DefectSetGenerator(Cp2kInputGenerator):
     def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:
         """
         """
-        return {'print_v_hartree': True}
+        return {'print_v_hartree': True, "print_pdos": True}
 
 @dataclass
 @multiple_input_updators()

From 9f2f7ecdb80bf3d480ac31f07c71b6f35b56e364 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 29 Nov 2022 08:41:21 -0800
Subject: [PATCH 17/50] Defects

---
 src/atomate2/cp2k/builders/defect.py | 7 ++++---
 src/atomate2/cp2k/jobs/defect.py     | 3 +--
 src/atomate2/cp2k/schemas/defect.py  | 7 ++++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 128eb72817..2f381c3d4c 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -137,7 +137,8 @@ def __init__(
             'output.output.energy',
             'output.output.structure',
             'output.input',
-            'output.cp2k_objects.v_hartree'
+            'output.cp2k_objects.v_hartree',
+            'output.vbm',
         ] 
 
         self._optional_defect_properties = []
@@ -281,7 +282,7 @@ def get_items(self) -> Iterator[List[Dict]]:
             
                 task bundles bundle are all the tasks that correspond to the same defect and all possible
                 bulk tasks that could be matched to them.
-d        """
+        """
 
         self.logger.info("Defect builder started")
         self.logger.info(
@@ -938,4 +939,4 @@ def get_sg(struc, symprec=.01) -> int:
     try:
         return struc.get_space_group_info(symprec=symprec)[1]
     except Exception:
-        return -1
\ No newline at end of file
+        return -1
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 43ce4717d7..275db0ef73 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -24,7 +24,6 @@
 DEFECT_TASK_DOC = {
     "average_v_hartree": True,
     "store_volumetric_data": ("v_hartree",),
-    "print_pdos": True,
 }
 
 @dataclass 
@@ -134,4 +133,4 @@ def defect_structure(self):
         """Returns the defect structure with the proper oxidation state"""
         struct = self.structure.copy()
         struct.add_site_property("ghost", [i == self.defect_site_index for i in range(len(struct))])
-        return struct
\ No newline at end of file
+        return struct
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 302dcbd1ee..9732f63c8b 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -76,6 +76,8 @@ class DefectDoc(StructureMetadata):
         None, description="Computed structure entry for the bulk calc."
     )
 
+    vbm: Mapping[RunType, float] = Field(None, description="VBM for bulk task of each run type. Used for aligning potential")
+
     last_updated: datetime = Field(
         description="Timestamp for when this document was last updated",
         default_factory=datetime.utcnow,
@@ -170,6 +172,7 @@ def _sort(x):
         bulk_entries = {}
         all_tasks = {}
         best_tasks = {}
+        vbm = {}
         metadata = {}
         for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics, defect_task_ids, bulk_task_ids), key=_run_type), key=_run_type):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
@@ -181,6 +184,7 @@ def _sort(x):
                 for defect_task, bulk_task, defect, dielectric, did, bid in sorted_tasks
                 ]
             rt = run_types[sorted_tasks[0][-2]]
+            vbm[rt] = sorted_tasks[0][1].output.vbm
             best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1]) 
             all_tasks[rt] = [ (s[-2], s[-1]) for s in sorted_tasks ]
             metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i][0].corrected_energy) for i in range(len(ents))]}
@@ -202,6 +206,7 @@ def _sort(x):
                 'material_id': material_id if material_id else v.parameters['material_id'],
                 'defect': v.defect, 
                 "name": v.defect.name,
+                "vbm": vbm,
                 'metadata': metadata,
         }
         prim = SpacegroupAnalyzer(v.defect.structure).get_primitive_standard_structure()
@@ -400,4 +405,4 @@ def unpack(query, d):
         return d
     if isinstance(d, List):
         return unpack(query[1:], d.__getitem__(int(query.pop(0))))
-    return unpack(query[1:], d.__getitem__(query.pop(0)))
\ No newline at end of file
+    return unpack(query[1:], d.__getitem__(query.pop(0)))

From a4e3b01c411a1f520068a43c2fb4da495ee27de4 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 30 Nov 2022 16:26:39 -0800
Subject: [PATCH 18/50] Builder

---
 src/atomate2/cp2k/builders/defect.py | 65 +++++++++++++++++-----------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 2f381c3d4c..a62ad8a8f7 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -16,8 +16,7 @@
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
 from pymatgen.electronic_structure.dos import CompleteDos
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-
-from atomate.utils.utils import load_class
+from pymatgen.io.cp2k.inputs import Cp2kInput
 
 from emmet.core.material import MaterialsDoc
 
@@ -92,7 +91,8 @@ def __init__(
             materials: Store of materials documents
             electrostatic_potentials: Store of electrostatic potential data. These
                 are generally stored in seperately from the tasks on GridFS due to their size.
-            task_validation: Store of task validation documents.
+            task_validation: Store of task validation documents. If true, then only tasks that have passed
+                validation will be considered.
             query: dictionary to limit tasks to be analyzed. NOT the same as the defect_query property
             allowed_task_types: list of task_types that can be processed
             settings: EmmetBuildSettings object
@@ -138,7 +138,7 @@ def __init__(
             'output.output.structure',
             'output.input',
             'output.cp2k_objects.v_hartree',
-            'output.vbm',
+            'output.output.vbm',
         ] 
 
         self._optional_defect_properties = []
@@ -321,7 +321,7 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         ##### Get bulk tasks #####
         temp_query = self.bulk_query.copy()
-        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
+        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_bulk_properties})
         temp_query.update({self.defect_query: {'$exists': False}, "output.state": "successful"})
         bulk_tasks = {
             doc[self.tasks.key]
@@ -456,7 +456,7 @@ def __filter_and_group_tasks(self, tasks):
         will be grouped together.
 
         Args:
-            tasks: task_ids for unprocessed defects
+            tasks: task_ids (according to self.tasks.key) for unprocessed defects
 
         returns:
             [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
@@ -631,7 +631,7 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
             'output.input',
             'output.nsites',
             'output.output.structure',
-            "output.additional_json.info.sc_mat" 
+            'output.calcs_reversed' 
         ]
         defects = list(self.tasks.query(criteria={self.tasks.key: {'$in': list(defect_ids)}}, properties=props))
         ps = self.__get_pristine_supercell(defects[0])
@@ -646,33 +646,48 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
             )
         ) 
         
-        # TODO add settings
-        sm = StructureMatcher(
-            primitive_cell=False,
-            scale=True,
-            attempt_supercell=False,
-            allow_subset=False,
-            comparator=ElementComparator(),
-        )
-
-        def _compare(b, d):
-            rtb = b.get('output').get('input').get('xc').split("+U")[0]
-            rtd = d.get('output').get('input').get('xc').split("+U")[0]
-            if rtb == rtd: 
-                if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):
-                       return True
-            return False
-
         pairs = [
             (defect[self.tasks.key], bulk[self.tasks.key])
             for bulk in bulks
             for defect in defects
-            if _compare(bulk, defect)
+            if self.__are_bulk_and_defect_commensurate(bulk, defect)
         ]
 
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
         return pairs
 
+    # TODO Checking for same dft settings (e.g. OT/diag) is a little cumbersome. 
+    # Maybe, in future, task doc can be defined to have OT/diag as part of input summary 
+    # for fast querying
+    def __are_bulk_and_defect_commensurate(self, b, d):
+        """
+        Decide whether bulk task ``b`` is a valid reference for defect task ``d``.
+
+        True only when all hold: the 'xc' labels agree after dropping any "+U"
+        suffix; the pristine supercells match (no primitive-cell or supercell
+        reduction); and both CP2K inputs agree on having force_eval/dft/scf/ot.
+        """
+        # TODO add settings
+        matcher = StructureMatcher(
+            primitive_cell=False, scale=True, attempt_supercell=False,
+            allow_subset=False, comparator=ElementComparator(),
+        )
+        xc_bulk = b.get('output').get('input').get('xc').split("+U")[0]
+        xc_defect = d.get('output').get('input').get('xc').split("+U")[0]
+        if xc_bulk != xc_defect:
+            return False
+
+        pristine_defect = self.__get_pristine_supercell(d)
+        pristine_bulk = self.__get_pristine_supercell(b)
+        if not matcher.fit(pristine_defect, pristine_bulk):
+            return False
+
+        # OT vs. diagonalization: both inputs must make the same choice.
+        ot_path = "force_eval/dft/scf/ot"
+        inputs = [Cp2kInput.from_dict(t['output']['calcs_reversed'][0]['input']['cp2k_input']) for t in (b, d)]
+        bulk_ot, defect_ot = (inp.check(ot_path) for inp in inputs)
+        return bool(bulk_ot) == bool(defect_ot)
+
     def __preprocess_bulk(self, task):
         """
         Given a TaskDoc that could be a bulk for defect analysis, check to see if it can be used. Bulk

From c2064b8239e2e455bc06e9389c7ccc110f82e615 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 5 Dec 2022 16:21:44 -0800
Subject: [PATCH 19/50] Defect

---
 src/atomate2/cp2k/builders/defect.py |  33 +-
 src/atomate2/cp2k/flows/defect.py    |   6 +-
 src/atomate2/cp2k/jobs/defect.py     |  12 +-
 src/atomate2/cp2k/schemas/defect.py  | 452 +++++++++++++++++----------
 src/atomate2/cp2k/sets/defect.py     |   2 +
 5 files changed, 309 insertions(+), 196 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index a62ad8a8f7..ac19b55bf0 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -313,7 +313,7 @@ def get_items(self) -> Iterator[List[Dict]]:
         not_allowed = {
             doc[self.tasks.key] 
             for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(defect_tasks)}})
-            if TaskDocument(**doc['output']).calcs_reversed[0].task_type not in self.allowed_dfct_types
+            if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_dfct_types
         }
         if not_allowed:
             self.logger.debug(f"{len(not_allowed)} defect tasks dropped. Not allowed TaskType")
@@ -332,7 +332,7 @@ def get_items(self) -> Iterator[List[Dict]]:
         not_allowed = {
             doc[self.tasks.key] 
             for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(bulk_tasks)}})
-            if TaskDocument(**doc['output']).calcs_reversed[0].task_type not in self.allowed_bulk_types
+            if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_bulk_types
         }
         if not_allowed:
             self.logger.debug(f"{len(not_allowed)} bulk tasks dropped. Not allowed TaskType")
@@ -770,9 +770,6 @@ def __init__(
             defects: Store,
             defect_thermos: Store,
             materials: Store,
-            electronic_structures: Store,
-            dos: Store,
-            thermo: Dict,
             query: Optional[Dict] = None,
             **kwargs,
     ):
@@ -788,15 +785,12 @@ def __init__(
         self.defects = defects
         self.defect_thermos = defect_thermos
         self.materials = materials
-        self.thermo = thermo
-        self.electronic_structures = electronic_structures
-        self.dos = dos
 
         self.query = query if query else {}
         self.timestamp = None
         self.kwargs = kwargs
 
-        super().__init__(sources=[defects, materials, electronic_structures, dos], targets=[defect_thermos], **kwargs)
+        super().__init__(sources=[defects, materials], targets=[defect_thermos], **kwargs)
 
     def ensure_indexes(self):
         """
@@ -842,11 +836,10 @@ def get_items(self) -> Iterator[List[Dict]]:
         self.logger.debug(f"Found {len(all_docs)} defect docs to process")
 
         def filterfunc(x):
-            # material for defect x exists
             if not self.materials.query_one(criteria={'material_id': x['material_id']}, properties=None):
                 self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
                 return False
-
+            return True
             defect = MontyDecoder().process_decoded(x['defect'])
             for el in defect.element_changes: 
                 if el not in self.thermo:
@@ -861,24 +854,18 @@ def filterfunc(x):
                     sorted(all_docs, key=lambda x: x['material_id'])
                 ), key=lambda x: x['material_id']
         ):
-            group = [g for g in group]
             try:
-                mat = self.__get_materials(key)
-                thermo = self.thermo #self.__get_thermos(mat.composition)
-                elec = self.__get_electronic_structure(group[0]['material_id'])
-                yield (group, mat, thermo, elec)
+                yield list(group)
             except LookupError as exception:
                 raise exception
 
-    def process_item(self, docs):
+    def process_item(self, defects):
         """
         Process a group of defects belonging to the same material into a defect thermo doc
         """
-        self.logger.info(f"Processing defects")
-        defects, material, thermo, dos = docs
-        defects = [DefectDoc(**d) for d in defects]
-        dos = CompleteDos.from_dict(dos)
-        defect_thermo_doc = DefectiveMaterialDoc.from_docs(defects, thermo=thermo, dos=dos)
+        defect_docs = [DefectDoc(**d) for d in defects]
+        self.logger.info(f"Processing {len(defect_docs)} defects")
+        defect_thermo_doc = DefectiveMaterialDoc.from_docs(defect_docs, material_id=defect_docs[0].material_id)
         return defect_thermo_doc.dict()
 
     def update_targets(self, items):
@@ -892,7 +879,7 @@ def update_targets(self, items):
         if len(items) > 0:
             self.logger.info(f"Updating {len(items)} defect thermo docs")
             self.defect_thermos.update(
-                docs=jsanitize(items, allow_bson=True),
+                docs=jsanitize(items, allow_bson=True, enum_values=True, strict=True),
                 key=self.defect_thermos.key,
             )
         else:
diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 5098d16dfa..5567d60502 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -153,7 +153,9 @@ def make(
         self, defects: Iterable[Defect], 
         charges: bool | Iterable[int] = False, 
         dielectric: NDArray | int | float | None = None,
-        prev_cp2k_dir: str | Path | None = None):
+        prev_cp2k_dir: str | Path | None = None,
+        collect_outputs: bool = True,
+        ):
         """Make a flow to run multiple defects in order to calculate their formation 
         energy diagram.
 
@@ -194,7 +196,7 @@ def make(
                 jobs.append(defect_job)
                 defect_outputs[defect.name][int(charge)] = (defect, defect_job.output)
 
-        if self.run_bulk and defects:
+        if self.run_bulk and defects and collect_outputs:
             collect_job = collect_defect_outputs(
                 defect_outputs=defect_outputs,
                 bulk_output=bulk_job.output if self.run_bulk else None,
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 275db0ef73..1b712ed58e 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -39,21 +39,19 @@ class BaseDefectMaker(BaseCp2kMaker):
     @cp2k_job
     def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str | Path | None = None):
         if isinstance(defect, Defect):
-            if isinstance(defect, Vacancy):
-                defect = GhostVacancy(
-                    structure=defect.structure, site=defect.site,
-                    multiplicity=defect.multiplicity, oxi_state=defect.oxi_state,
-                    symprec=defect.symprec, angle_tolerance=defect.angle_tolerance
-                    )
+
             structure = defect.get_supercell_structure(
                 sc_mat=self.supercell_matrix, 
-                dummy_species=None, 
+                dummy_species=defect.site.species if isinstance(defect, Vacancy) else None, 
                 min_atoms=self.min_atoms,
                 max_atoms=self.max_atoms,
                 min_length=self.min_length,
                 force_diagonal=self.force_diagonal,
             )
 
+            if isinstance(defect, Vacancy):
+                structure.sites[-1].properties['ghost'] = True
+
             # provenance stuff
             recursive_update(self.write_additional_data, {
                 "info.json": {
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 9732f63c8b..c606342292 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -5,17 +5,27 @@
 from pydantic import validator
 from itertools import groupby
 
-from monty.json import MontyDecoder 
+from monty.json import MontyDecoder
 from monty.tempfile import ScratchDir
 
-from pymatgen.core import Structure
+from pymatgen.core import Structure, Element
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
-from pymatgen.analysis.defects.core import Defect
-from pymatgen.analysis.defects.corrections import get_freysoldt_correction, get_freysoldt2d_correction
-from pymatgen.analysis.defects.thermo import DefectEntry, DefectSiteFinder, FormationEnergyDiagram
+from pymatgen.analysis.phase_diagram import PhaseDiagram
+from pymatgen.analysis.defects.core import Defect, DefectType
+from pymatgen.analysis.defects.corrections import (
+    get_freysoldt_correction,
+    get_freysoldt2d_correction,
+)
+from pymatgen.analysis.defects.thermo import (
+    DefectEntry,
+    DefectSiteFinder,
+    FormationEnergyDiagram,
+)
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-from atomate2 import SETTINGS
 
+from emmet.core.utils import ValueEnum
+
+from atomate2 import SETTINGS
 from atomate2.common.schemas.structure import StructureMetadata
 from atomate2.cp2k.schemas.calc_types.utils import run_type, task_type, calc_type
 from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
@@ -26,6 +36,7 @@
 T = TypeVar("T", bound="DefectDoc")
 S = TypeVar("S", bound="DefectiveMaterialDoc")
 
+
 class DefectDoc(StructureMetadata):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
@@ -36,11 +47,19 @@ class DefectDoc(StructureMetadata):
 
     property_name: ClassVar[str] = "defect"
 
-    defect: Defect = Field(None, description="Pymatgen defect object for this defect doc")
+    defect: Defect = Field(
+        None, description="Pymatgen defect object for this defect doc"
+    )
+
+    charge: int = Field(None, description="Charge state for this defect")
 
-    name: str = Field(None, description="Name of this defect as generated by the defect object")
+    name: str = Field(
+        None, description="Name of this defect as generated by the defect object"
+    )
 
-    material_id: str = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
+    material_id: str = Field(
+        None, description="Unique material ID for the bulk material"
+    )  # TODO Change to MPID
 
     # TODO Should it be all (defect + bulk) ids?
     task_ids: List[str] = Field(
@@ -71,12 +90,15 @@ class DefectDoc(StructureMetadata):
     defect_entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
-    
+
     bulk_entries: Mapping[RunType, ComputedStructureEntry] = Field(
         None, description="Computed structure entry for the bulk calc."
     )
 
-    vbm: Mapping[RunType, float] = Field(None, description="VBM for bulk task of each run type. Used for aligning potential")
+    vbm: Mapping[RunType, float] = Field(
+        None,
+        description="VBM for bulk task of each run type. Used for aligning potential",
+    )
 
     last_updated: datetime = Field(
         description="Timestamp for when this document was last updated",
@@ -90,52 +112,64 @@ class DefectDoc(StructureMetadata):
 
     metadata: Dict = Field(description="Metadata for this defect")
 
-    def update(self, defect_task, bulk_task, dielectric, query='defect'):
-
-        defect_task_doc = TaskDocument(**defect_task)
-        bulk_task_doc = TaskDocument(**bulk_task)
-
-        rt = defect_task_doc.run_type
-        tt = defect_task_doc.task_type
-        ct = defect_task_doc.calc_type
+    def update(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
 
         # Metadata
-        last_updated = max(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
-        created_at = min(dtsk.last_updated for dtsk, btsk in self.tasks.values()) if self.tasks else datetime.now()
-
-        if defect_task_doc.task_id in self.task_ids:
-            return
-        else:
-            self.last_updated = last_updated
-            self.created_at = created_at
-            self.task_ids.append(defect_task_doc.task_id)
-
-            def _run_type(x):
-                return run_type(x[0]['input']['dft']).value
-
-            def _compare(new, old):
-                # TODO return kpoint density
-                return new['nsites'] > old.nsites
-
-            if defect_task_doc.run_type not in self.tasks or _compare(defect_task, self.tasks[rt][0]):
-                self.run_types.update({defect_task_doc.task_id: rt})
-                self.task_types.update({defect_task_doc.task_id: tt})
-                self.calc_types.update({defect_task_doc.task_id: ct})
-                entry = self.__class__.get_defect_entry_from_tasks(
-                            defect_task=defect_task,
-                            bulk_task=bulk_task,
-                            dielectric=dielectric,
-                            query=query
-                        )
-                self.entries[rt] = entry
-                self.tasks[rt] = (defect_task_doc, bulk_task_doc)
-
-    def update_all(self, defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect'):
-        for defect_task, bulk_task, dielectric in zip(defect_tasks, bulk_tasks, dielectrics):
-            self.update(defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric, query=query)
+        self.last_updated = datetime.now()
+        self.created_at = datetime.now()
+
+        defect = self.get_defect_from_task(query=query, task=defect_task)
+        d_id = defect_task[key]
+        b_id = bulk_task[key]
+        defect_task = TaskDocument(**defect_task)
+        bulk_task = TaskDocument(**bulk_task)
+        defect_entry = self.get_defect_entry_from_tasks(
+            defect_task, bulk_task, defect, dielectric
+        )
+        bulk_entry = self.get_bulk_entry_from_task(bulk_task)
+
+        rt = defect_task.calcs_reversed[0].run_type
+        current_largest_sc = self.defect_entries[rt].sc_entry.composition.num_atoms
+        potential_largest_sc = defect_entry.sc_entry.composition.num_atoms
+        if (
+            rt not in self.defect_entries
+            or potential_largest_sc > current_largest_sc
+            or (
+                potential_largest_sc == current_largest_sc
+                and defect_entry.sc_entry.energy
+                < self.defect_entries[rt].sc_entry.energy
+            )
+        ):
+            self.defect_entries[rt] = defect_entry
+            self.bulk_entries[rt] = bulk_entry
+            self.best_tasks[rt] = (d_id, b_id)
+
+        self.all_tasks[rt].append((d_id, b_id))
+        self.metadata["convergence"].append((current_largest_sc, defect_entry.corrected_energy - bulk_entry.energy))
+
+    def update_all(
+        self, defect_tasks: List, bulk_tasks: List, dielectrics: List, query="defect"
+    ):
+        for defect_task, bulk_task, dielectric in zip(
+            defect_tasks, bulk_tasks, dielectrics
+        ):
+            self.update(
+                defect_task=defect_task,
+                bulk_task=bulk_task,
+                dielectric=dielectric,
+                query=query,
+            )
 
     @classmethod
-    def from_tasks(cls: Type[T], defect_tasks: List, bulk_tasks: List, dielectrics: List, query='defect', key="task_id", material_id=None) -> T:
+    def from_tasks(
+        cls: Type[T],
+        defect_tasks: List,
+        bulk_tasks: List,
+        dielectrics: List,
+        query="defect",
+        key="task_id",
+        material_id=None,
+    ) -> T:
         """
         The standard way to create this document.
         Args:
@@ -145,21 +179,31 @@ def from_tasks(cls: Type[T], defect_tasks: List, bulk_tasks: List, dielectrics:
         """
         defect_task_ids = [defect_task[key] for defect_task in defect_tasks]
         bulk_task_ids = [bulk_task[key] for bulk_task in bulk_tasks]
-        bulk_tasks= [TaskDocument(**bulk_task['output']) for bulk_task in bulk_tasks]
-        defects = [cls.get_defect_from_task(query=query, task=defect_task) for defect_task in defect_tasks]
-        defect_tasks = [TaskDocument(**defect_task['output']) for defect_task in defect_tasks]
-        
+        bulk_tasks = [TaskDocument(**bulk_task["output"]) for bulk_task in bulk_tasks]
+        defects = [
+            cls.get_defect_from_task(query=query, task=defect_task)
+            for defect_task in defect_tasks
+        ]
+        defect_tasks = [
+            TaskDocument(**defect_task["output"]) for defect_task in defect_tasks
+        ]
+
         # Metadata
         last_updated = datetime.now() or max(task.last_updated for task in defect_tasks)
         created_at = datetime.now() or min(task.completed_at for task in defect_tasks)
 
-        #deprecated_tasks = list(
-        #    {task.task_id for task in task_group if not task.is_valid}
-        #)
-
-        run_types = {id: task.calcs_reversed[0].run_type for id, task in zip(defect_task_ids, defect_tasks)}
-        task_types = {id: task.calcs_reversed[0].task_type for id, task in zip(defect_task_ids, defect_tasks)}
-        calc_types = {id: task.calcs_reversed[0].calc_type for id, task in zip(defect_task_ids, defect_tasks)}
+        run_types = {
+            id: task.calcs_reversed[0].run_type
+            for id, task in zip(defect_task_ids, defect_tasks)
+        }
+        task_types = {
+            id: task.calcs_reversed[0].task_type
+            for id, task in zip(defect_task_ids, defect_tasks)
+        }
+        calc_types = {
+            id: task.calcs_reversed[0].calc_type
+            for id, task in zip(defect_task_ids, defect_tasks)
+        }
 
         def _run_type(x):
             return x[0].calcs_reversed[0].run_type.value
@@ -174,47 +218,85 @@ def _sort(x):
         best_tasks = {}
         vbm = {}
         metadata = {}
-        for key, tasks_for_runtype in groupby(sorted(zip(defect_tasks, bulk_tasks, defects, dielectrics, defect_task_ids, bulk_task_ids), key=_run_type), key=_run_type):
+        for key, tasks_for_runtype in groupby(
+            sorted(
+                zip(
+                    defect_tasks,
+                    bulk_tasks,
+                    defects,
+                    dielectrics,
+                    defect_task_ids,
+                    bulk_task_ids,
+                ),
+                key=_run_type,
+            ),
+            key=_run_type,
+        ):
             sorted_tasks = sorted(tasks_for_runtype, key=_sort)
             ents = [
                 (
-                    cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric),
-                    cls.get_bulk_entry_from_task(bulk_task)
+                    cls.get_defect_entry_from_tasks(
+                        defect_task, bulk_task, defect, dielectric
+                    ),
+                    cls.get_bulk_entry_from_task(bulk_task),
                 )
                 for defect_task, bulk_task, defect, dielectric, did, bid in sorted_tasks
-                ]
+            ]
             rt = run_types[sorted_tasks[0][-2]]
             vbm[rt] = sorted_tasks[0][1].output.vbm
-            best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1]) 
-            all_tasks[rt] = [ (s[-2], s[-1]) for s in sorted_tasks ]
-            metadata[key] = {'convergence': [(sorted_tasks[i][0].nsites, ents[i][0].corrected_energy) for i in range(len(ents))]}
+            best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1])
+            all_tasks[rt] = [(s[-2], s[-1]) for s in sorted_tasks]
             defect_entries[rt], bulk_entries[rt] = ents[0]
+            metadata[key] = {
+                "convergence": [
+                    (
+                        sorted_tasks[i][0].nsites,
+                        defect_entries[rt].corrected_energy - bulk_entries[rt].energy,
+                    )
+                    for i in range(len(ents))
+                ]
+            }
+
+        v = next(iter(defect_entries.values()))
+        metadata["defect_origin"] = (
+            "intrinsic"
+            if all(
+                el in v.defect.structure.composition
+                for el in v.defect.element_changes.keys()
+            )
+            else "extrinsic"
+        )
 
-        v = next(iter(defect_entries.values())) 
         data = {
-                'defect_entries': defect_entries,
-                "bulk_entries": bulk_entries,
-                'run_types': run_types,
-                'task_types': task_types,
-                'calc_types': calc_types,
-                'last_updated': last_updated,
-                'created_at': created_at,
-                'task_ids': defect_task_ids,
-                #'deprecated_tasks': deprecated_tasks,
-                'all_tasks': all_tasks,
-                'best_tasks': best_tasks,
-                'material_id': material_id if material_id else v.parameters['material_id'],
-                'defect': v.defect, 
-                "name": v.defect.name,
-                "vbm": vbm,
-                'metadata': metadata,
+            "defect_entries": defect_entries,
+            "bulk_entries": bulk_entries,
+            "run_types": run_types,
+            "task_types": task_types,
+            "calc_types": calc_types,
+            "last_updated": last_updated,
+            "created_at": created_at,
+            "task_ids": defect_task_ids,
+            "all_tasks": all_tasks,
+            "best_tasks": best_tasks,
+            "material_id": material_id if material_id else v.parameters["material_id"],
+            "defect": v.defect,
+            "charge": v.charge_state,
+            "name": v.defect.name,
+            "vbm": vbm,
+            "metadata": metadata,
         }
         prim = SpacegroupAnalyzer(v.defect.structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
         return cls(**data)
 
     @classmethod
-    def get_defect_entry_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDocument, defect: Defect, dielectric=None):
+    def get_defect_entry_from_tasks(
+        cls,
+        defect_task: TaskDocument,
+        bulk_task: TaskDocument,
+        defect: Defect,
+        dielectric=None,
+    ):
         """
         Extract a defect entry from a single pair (defect and bulk) of tasks.
 
@@ -225,22 +307,24 @@ def get_defect_entry_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskD
                 corrections will be performed, even if the defect is charged.
             query: Mongo-style query to retrieve the defect object from the defect task
         """
-        parameters = cls.get_parameters_from_tasks(defect_task=defect_task, bulk_task=bulk_task)
+        parameters = cls.get_parameters_from_tasks(
+            defect_task=defect_task, bulk_task=bulk_task
+        )
         if dielectric:
-            parameters['dielectric'] = dielectric
+            parameters["dielectric"] = dielectric
 
         corrections, metadata = cls.get_correction_from_parameters(parameters)
 
         sc_entry = ComputedStructureEntry(
-            structure=parameters['final_defect_structure'], 
-            energy=parameters['defect_energy']
-            )
+            structure=parameters["final_defect_structure"],
+            energy=parameters["defect_energy"],
+        )
 
         defect_entry = DefectEntry(
             defect=defect,
-            charge_state=parameters['charge_state'],
+            charge_state=parameters["charge_state"],
             sc_entry=sc_entry,
-            sc_defect_frac_coords=parameters['defect_frac_sc_coords'],
+            sc_defect_frac_coords=parameters["defect_frac_sc_coords"],
             corrections=corrections,
         )
 
@@ -265,36 +349,49 @@ def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
 
     @classmethod
     def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
-        if parameters['charge_state'] and not parameters.get("2d"):
+        if parameters["charge_state"] and not parameters.get("2d"):
             return get_freysoldt_correction(
-                q=parameters['charge_state'], dielectric=parameters['dielectric'], 
-                defect_locpot=parameters['defect_v_hartree'], 
-                bulk_locpot=parameters['bulk_v_hartree'], 
-                defect_frac_coords=parameters['defect_frac_sc_coords'],
-                )
+                q=parameters["charge_state"],
+                dielectric=parameters["dielectric"],
+                defect_locpot=parameters["defect_v_hartree"],
+                bulk_locpot=parameters["bulk_v_hartree"],
+                defect_frac_coords=parameters["defect_frac_sc_coords"],
+            )
         return {}, {}
-    
+
     @classmethod
     def get_freysoldt2d_correction(cls, parameters):
 
         from pymatgen.io.vasp.outputs import VolumetricData as VaspVolumetricData
 
-        if parameters['charge_state'] and parameters.get("2d"):
-            eps_parallel = (parameters['dielectric'][0][0] + parameters['dielectric'][1][1]) / 2
-            eps_perp = parameters['dielectric'][2][2]
-            dielectric = (eps_parallel - 1) / (1 - 1/eps_perp)
-            with ScratchDir('.'):
-                
-                lref = VaspVolumetricData(structure=parameters['bulk_v_hartree'].structure, data=parameters['bulk_v_hartree'].data)
-                ldef = VaspVolumetricData(structure=parameters['defect_v_hartree'].structure, data=parameters['defect_v_hartree'].data)
+        if parameters["charge_state"] and parameters.get("2d"):
+            eps_parallel = (
+                parameters["dielectric"][0][0] + parameters["dielectric"][1][1]
+            ) / 2
+            eps_perp = parameters["dielectric"][2][2]
+            dielectric = (eps_parallel - 1) / (1 - 1 / eps_perp)
+            with ScratchDir("."):
+
+                lref = VaspVolumetricData(
+                    structure=parameters["bulk_v_hartree"].structure,
+                    data=parameters["bulk_v_hartree"].data,
+                )
+                ldef = VaspVolumetricData(
+                    structure=parameters["defect_v_hartree"].structure,
+                    data=parameters["defect_v_hartree"].data,
+                )
                 lref.write_file("LOCPOT.ref")
                 ldef.write_file("LOCPOT.def")
 
                 return get_freysoldt2d_correction(
-                    q=parameters['charge_state'], dielectric=dielectric, defect_locpot=ldef, 
-                    bulk_locpot=lref, defect_frac_coords=parameters['defect_frac_sc_coords'], 
-                    energy_cutoff=520, slab_buffer=2
-                    )
+                    q=parameters["charge_state"],
+                    dielectric=dielectric,
+                    defect_locpot=ldef,
+                    bulk_locpot=lref,
+                    defect_frac_coords=parameters["defect_frac_sc_coords"],
+                    energy_cutoff=520,
+                    slab_buffer=2,
+                )
         return {}, {}
 
     @classmethod
@@ -302,11 +399,13 @@ def get_defect_from_task(cls, query, task):
         """
         Unpack a Mongo-style query and retrieve a defect object from a task.
         """
-        defect = unpack(query.split('.'), task)
+        defect = unpack(query.split("."), task)
         return MontyDecoder().process_decoded(defect)
 
     @classmethod
-    def get_parameters_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDocument):
+    def get_parameters_from_tasks(
+        cls, defect_task: TaskDocument, bulk_task: TaskDocument
+    ):
         """
         Get parameters necessary to create a defect entry from defect and bulk task dicts
         Args:
@@ -317,34 +416,50 @@ def get_parameters_from_tasks(cls, defect_task: TaskDocument, bulk_task: TaskDoc
         final_defect_structure = defect_task.structure
         final_bulk_structure = bulk_task.structure
 
-        ghost = [index for index, prop in enumerate(final_defect_structure.site_properties.get("ghost")) if prop]
+        ghost = [
+            index
+            for index, prop in enumerate(
+                final_defect_structure.site_properties.get("ghost")
+            )
+            if prop
+        ]
         if ghost:
             defect_frac_sc_coords = final_defect_structure[ghost[0]].frac_coords
         else:
-            defect_frac_sc_coords = DefectSiteFinder(SETTINGS.SYMPREC).get_defect_fpos(defect_structure=final_defect_structure, base_structure=final_bulk_structure)
+            defect_frac_sc_coords = DefectSiteFinder(SETTINGS.SYMPREC).get_defect_fpos(
+                defect_structure=final_defect_structure,
+                base_structure=final_bulk_structure,
+            )
         parameters = {
-            'defect_energy': defect_task.output.energy,
-            'bulk_energy': bulk_task.output.energy,
-            'final_defect_structure': final_defect_structure,
-            'charge_state': defect_task.output.structure.charge,
-            'defect_frac_sc_coords': defect_frac_sc_coords,
-            'defect_v_hartree': MontyDecoder().process_decoded(defect_task.cp2k_objects['v_hartree']), # TODO CP2K spec name
-            'bulk_v_hartree': MontyDecoder().process_decoded(bulk_task.cp2k_objects['v_hartree']), # TODO CP2K spec name
+            "defect_energy": defect_task.output.energy,
+            "bulk_energy": bulk_task.output.energy,
+            "final_defect_structure": final_defect_structure,
+            "charge_state": defect_task.output.structure.charge,
+            "defect_frac_sc_coords": defect_frac_sc_coords,
+            "defect_v_hartree": MontyDecoder().process_decoded(
+                defect_task.cp2k_objects["v_hartree"]
+            ),  # TODO CP2K spec name
+            "bulk_v_hartree": MontyDecoder().process_decoded(
+                bulk_task.cp2k_objects["v_hartree"]
+            ),  # TODO CP2K spec name
         }
 
         if defect_task.tags and "2d" in defect_task.tags:
-            parameters['2d'] = True
+            parameters["2d"] = True
 
         return parameters
 
+
 class DefectiveMaterialDoc(StructureMetadata):
     """Document containing all / many defect tasks for a single material ID"""
 
     property_name: ClassVar[str] = "defective material"
 
-    material_id: str = Field(None, description="Unique material ID for the bulk material") #TODO Change to MPID
+    material_id: str = Field(
+        None, description="Unique material ID for the bulk material"
+    )  # TODO Change to MPID
 
-    formation_energy_diagrams: Mapping[RunType, FormationEnergyDiagram] = Field(None, description="")
+    defect_docs: List[DefectDoc] = Field(None, description="Defect Docs")
 
     last_updated: datetime = Field(
         description="Timestamp for when this document was last updated",
@@ -359,46 +474,55 @@ class DefectiveMaterialDoc(StructureMetadata):
     metadata: Dict = Field(None, description="Metadata for this object")
 
     @classmethod
-    def from_docs(cls: Type["S"], defect_docs: DefectDoc, thermo: Dict, dos) -> S:
-        """
-        # Metadata
-        metadata = {}
-        last_updated = datetime.now() 
-        created_at = datetime.now() 
-
-        bulk_ents = {}
-        dfct_ents = {}
-        formation_energy_diagrams = {}
-        els = set()
-        for doc in defect_docs:
-            els = els | set(doc.defect.element_changes.keys())
-            for rt, defect_entry in doc.defect_entries.items():
-                if rt not in dfct_ents:
-                    dfct_ents[rt] = []
-                dfct_ents[rt].append(defect_entry)
-            bulk_ents[rt] = doc.bulk_entries[rt]
+    def from_docs(cls: Type["S"], defect_docs: DefectDoc, material_id: str) -> S:
+        return cls(
+            defect_docs=defect_docs,
+            material_id=material_id,
+            last_updated=max(d.last_updated for d in defect_docs),
+            created_at=datetime.now(),
+        )
 
-        atomic_entries = [ComputedEntry(composition=str(el), energy=thermo[el]) for el in els]
+    @property
+    def element_set(self) -> set:
+        els = set(Element(e) for e in self.defect_docs[0].defect.structure.symbol_set)
+        for d in self.defect_docs:
+            els = els | set(d.defect.element_changes.keys())
+        return els
 
-        for rt in dfct_ents:
+    def get_formation_energy_diagram(
+        self,
+        run_type: RunType | str,
+        atomic_entries: List[ComputedEntry],
+        phase_diagram: PhaseDiagram,
+        filters: Dict | None = None,
+    ) -> FormationEnergyDiagram:
 
-            pd = PhaseDiagram(mp_entries)
-            cbm, vbm = dos.get_cbm_vbm()
-            
-            adjusted_entries = _get_adjusted_pd_entries(
-                    phase_diagram=pd, atomic_entries=atomic_entries
-                )
+        filters = filters if filters else {}
 
-            formation_energy_diagrams[rt] = FormationEnergyDiagram.with_atomic_entries(
-                                                bulk_entry=bulk_ents[rt], defect_entries=dfct_ents[rt],
-                                                atomic_entries=atomic_entries, phase_diagram=pd, vbm=vbm,
-                                                band_gap=cbm-vbm,
-                                            )
-        """
+        els = set()
+        defect_entries = []
+        bulk_entries = []
+        vbms = []
+        for doc in self.defect_docs:
+            els = els | set(doc.defect.element_changes.keys())
+            defect_entries.append(doc.defect_entries.get(run_type))
+            bulk_entries.append(doc.bulk_entries.get(run_type))
+            vbms.append(doc.vbm.get(run_type))
+
+        # TODO bulks and vbms
+        # form en diagram takes one bulk entry and one bulk vbm
+        # These, however, can be different for each defect/bulk task pair
+        # Need to convert the differences into energy adjustments so that
+        # form en diagram is consistent with all of them
+
+        return FormationEnergyDiagram.with_atomic_entries(
+            bulk_entry=bulk_entries[0],
+            defect_entries=defect_entries,
+            atomic_entries=atomic_entries,
+            phase_diagram=phase_diagram,
+            vbm=vbms[0],
+        )
 
-        raise NotImplementedError
-        
-       
 
 def unpack(query, d):
     if not query:
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 20c0fbbf12..62e68373dc 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -17,6 +17,8 @@
 @dataclass
 class DefectSetGenerator(Cp2kInputGenerator):
     """
+    Base input set generator for defect calculations. Adds printing of the 
+    partial density of states and the electrostatic potential.
     """
 
     def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:

From 3b7b41cd9d5e1585ae4a2f11556cde289fd6b18b Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 7 Dec 2022 21:43:16 -0800
Subject: [PATCH 20/50] Defects: trim builder task queries, tighten bulk/defect matching

---
 src/atomate2/cp2k/builders/defect.py | 25 +++++++++++++++++++++++--
 src/atomate2/cp2k/jobs/defect.py     |  2 +-
 src/atomate2/cp2k/schemas/defect.py  |  6 +++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index ac19b55bf0..6f27bc0ea8 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -312,7 +312,10 @@ def get_items(self) -> Iterator[List[Dict]]:
         # TODO Seems slow
         not_allowed = {
             doc[self.tasks.key] 
-            for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(defect_tasks)}})
+            for doc in self.tasks.query(
+                criteria={self.tasks.key: {"$in": list(defect_tasks)}}, 
+                properties=['output.calcs_reversed']
+                )
             if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_dfct_types
         }
         if not_allowed:
@@ -331,7 +334,10 @@ def get_items(self) -> Iterator[List[Dict]]:
         # TODO seems slow
         not_allowed = {
             doc[self.tasks.key] 
-            for doc in self.tasks.query(criteria={self.tasks.key: {"$in": list(bulk_tasks)}})
+            for doc in self.tasks.query(
+                criteria={self.tasks.key: {"$in": list(bulk_tasks)}},
+                properties=['output.calcs_reversed']
+                )
             if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_bulk_types
         }
         if not_allowed:
@@ -670,6 +676,9 @@ def __are_bulk_and_defect_commensurate(self, b, d):
         """
         # TODO add settings
         sm = StructureMatcher(
+            ltol = 1e-3,
+            stol = 0.1,
+            angle_tol = 1,
             primitive_cell=False,
             scale=True,
             attempt_supercell=False,
@@ -678,6 +687,15 @@ def __are_bulk_and_defect_commensurate(self, b, d):
         )
         rtb = b.get('output').get('input').get('xc').split("+U")[0]
         rtd = d.get('output').get('input').get('xc').split("+U")[0]
+        baux = {
+            dat['element']: dat.get('auxiliary_basis')
+            for dat in b['output']['input']['atomic_kind_info']['atomic_kinds'].values()
+            }
+        daux = {
+            dat['element']: dat.get('auxiliary_basis')
+            for dat in d['output']['input']['atomic_kind_info']['atomic_kinds'].values()
+            }
+
         if rtb == rtd: 
             if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):
                     cib = Cp2kInput.from_dict(b['output']['calcs_reversed'][0]['input']['cp2k_input'])
@@ -685,6 +703,9 @@ def __are_bulk_and_defect_commensurate(self, b, d):
                     bis_ot = cib.check("force_eval/dft/scf/ot")
                     dis_ot = cid.check("force_eval/dft/scf/ot")
                     if (bis_ot and dis_ot) or (not bis_ot and not dis_ot):
+                        for el in baux:
+                            if baux[el].upper() != daux[el].upper():
+                                return False
                         return True
         return False
 
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 1b712ed58e..3147532f23 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -50,7 +50,7 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
             )
 
             if isinstance(defect, Vacancy):
-                structure.sites[-1].properties['ghost'] = True
+                structure.add_site_property("ghost", [False]*(len(structure.sites)-1) + [True])
 
             # provenance stuff
             recursive_update(self.write_additional_data, {
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index c606342292..0890e71231 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -20,6 +20,7 @@
     DefectEntry,
     DefectSiteFinder,
     FormationEnergyDiagram,
+    MultiFormationEnergyDiagram
 )
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 
@@ -495,7 +496,7 @@ def get_formation_energy_diagram(
         atomic_entries: List[ComputedEntry],
         phase_diagram: PhaseDiagram,
         filters: Dict | None = None,
-    ) -> FormationEnergyDiagram:
+    ) -> MultiFormationEnergyDiagram:
 
         filters = filters if filters else {}
 
@@ -515,7 +516,7 @@ def get_formation_energy_diagram(
         # Need to convert the differences into energy adjustments so that
         # form en diagram is consistent with all of them
 
-        return FormationEnergyDiagram.with_atomic_entries(
+        return MultiFormationEnergyDiagram.with_atomic_entries(
             bulk_entry=bulk_entries[0],
             defect_entries=defect_entries,
             atomic_entries=atomic_entries,
@@ -523,7 +524,6 @@ def get_formation_energy_diagram(
             vbm=vbms[0],
         )
 
-
 def unpack(query, d):
     if not query:
         return d

From 9c8b97776c4fdf6594b836d47a4efb49ae28a524 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 13 Dec 2022 16:09:35 -0800
Subject: [PATCH 21/50] Round sc_matrix

---
 src/atomate2/cp2k/builders/defect.py | 2 +-
 src/atomate2/cp2k/schemas/defect.py  | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 6f27bc0ea8..d5dcee920d 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -766,7 +766,7 @@ def __get_pristine_supercell(self, task):
             defect = MontyDecoder().process_decoded(d)
             s = defect.structure.copy()
             sc_mat = out_structure.lattice.matrix.dot(np.linalg.inv(s.lattice.matrix))
-            s.make_supercell(sc_mat)
+            s.make_supercell(sc_mat.round())
             return s
         else:
             return out_structure
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 0890e71231..0518fb0e2b 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -524,6 +524,7 @@ def get_formation_energy_diagram(
             vbm=vbms[0],
         )
 
+
 def unpack(query, d):
     if not query:
         return d

From 1009f18d41224c7a0be841c81a97b0b693a8679c Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 13 Dec 2022 16:28:14 -0800
Subject: [PATCH 22/50] Import Freysoldt corrections from corrections.freysoldt

---
 src/atomate2/cp2k/schemas/defect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 0518fb0e2b..b9dca76b69 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -12,7 +12,7 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.analysis.phase_diagram import PhaseDiagram
 from pymatgen.analysis.defects.core import Defect, DefectType
-from pymatgen.analysis.defects.corrections import (
+from pymatgen.analysis.defects.corrections.freysoldt import (
     get_freysoldt_correction,
     get_freysoldt2d_correction,
 )

From 4566e9d9426073872f461a6c2cfd137030d525e9 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 16 Dec 2022 11:24:52 -0800
Subject: [PATCH 23/50] Defects: pick best task pair per run type; single-task DefectDoc API

---
 src/atomate2/cp2k/builders/defect.py |  68 ++++++-----
 src/atomate2/cp2k/schemas/defect.py  | 170 ++++++++-------------------
 2 files changed, 86 insertions(+), 152 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index d5dcee920d..939bba1661 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -404,8 +404,9 @@ def get_items(self) -> Iterator[List[Dict]]:
                 continue 
             doc = self.__get_defect_doc(defect)
             item_bundle = self.__get_item_bundle(task_ids)
-            material_id = self.mpid_map[item_bundle[0][1][self.tasks.key]]
-            yield doc, item_bundle, material_id
+            m = next(iter(task_ids.values()))[1]
+            material_id = self.mpid_map[m]
+            yield doc, item_bundle, material_id, task_ids
 
     def process_item(self, items):
         """
@@ -418,20 +419,17 @@ def process_item(self, items):
 
         returns: the defect document as a dictionary
         """
-        defect_doc, item_bundle, material_id = items
+        defect_doc, item_bundle, material_id, task_ids = items
         self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
         if item_bundle:
-            defect_tasks, bulk_tasks, dielectrics = list(zip(*item_bundle))
-            if defect_doc:
-                defect_doc.update_all(
-                    defect_tasks=defect_tasks, bulk_tasks=bulk_tasks, 
-                    dielectrics=dielectrics, query=self.defect_query
-                    )
-            else:
-                defect_doc = DefectDoc.from_tasks(
-                    defect_tasks=defect_tasks, bulk_tasks=bulk_tasks, dielectrics=dielectrics,
-                    query=self.defect_query, key=self.tasks.key, material_id=material_id
-                    )
+            for _, (defect_task, bulk_task, dielectric) in item_bundle.items():
+                if not defect_doc:
+                    defect_doc = DefectDoc.from_tasks(
+                        defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric,
+                        query=self.defect_query, key=self.tasks.key, material_id=material_id
+                        )
+                else:
+                    defect_doc.update_one(defect_task, bulk_task, dielectric, query=self.defect_query, key=self.tasks.key) # TODO Atomate2Store wrapper
             return jsanitize(defect_doc.dict(), allow_bson=True, enum_values=True, strict=True)
         return {}
 
@@ -581,16 +579,15 @@ def __get_item_bundle(self, task_ids):
             bulk_tasks: possible bulk tasks to match to defects
             defect_task_group: group of equivalent defects (defined by PointDefectComparator)
 
-        returns: [(defect task dict, bulk_task_dict, dielectric dict), ...]
+        returns: dict {run type: (defect task dict, bulk_task_dict, dielectric dict)}
         """
-        return [
-            (
-                self.tasks.query_one(criteria={self.tasks.key: defect_tasks_id}, load=True),
-                self.tasks.query_one(criteria={self.tasks.key: bulk_tasks_id}, load=True), # load all for now
-                self.__get_dielectric(self._mpid_map[bulk_tasks_id]),
-            )
-            for defect_tasks_id, bulk_tasks_id in task_ids
-        ]
+        return {
+            rt: (
+                self.tasks.query_one(criteria={self.tasks.key: pairs[0]}, load=True), 
+                self.tasks.query_one(criteria={self.tasks.key: pairs[1]}, load=True), 
+                self.__get_dielectric(self._mpid_map[pairs[1]])
+                ) for rt, pairs in task_ids.items()
+                }
 
     def _get_mpid(self, structure):
         """
@@ -615,17 +612,14 @@ def _get_mpid(self, structure):
                 return m['material_id']
         return None
 
-    def __match_defects_to_bulks(self, bulk_ids, defect_ids):
+    def __match_defects_to_bulks(self, bulk_ids, defect_ids) -> list[tuple]:
         """
         Given task_ids of bulk and defect tasks, match the defects to a bulk task that has
         commensurate:
-
             - Composition
             - Number of sites
             - Symmetry
-
         """
-
         self.logger.debug(f"Finding bulk/defect task combinations.")
         self.logger.debug(f"Bulk tasks: {bulk_ids}")
         self.logger.debug(f"Defect tasks: {defect_ids}")
@@ -637,6 +631,7 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
             'output.input',
             'output.nsites',
             'output.output.structure',
+            'output.output.energy',
             'output.calcs_reversed' 
         ]
         defects = list(self.tasks.query(criteria={self.tasks.key: {'$in': list(defect_ids)}}, properties=props))
@@ -653,14 +648,27 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids):
         ) 
         
         pairs = [
-            (defect[self.tasks.key], bulk[self.tasks.key])
+            (defect, bulk)
             for bulk in bulks
             for defect in defects
             if self.__are_bulk_and_defect_commensurate(bulk, defect)
         ]
-
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
-        return pairs
+
+        def key(x):
+            return -x[0]['output']['nsites'], x[0]['output']['output']['energy']
+        def _run_type(x):
+            return x[0]['output']['calcs_reversed'][0]['run_type']
+
+        rt_pairs = {}
+        for rt, group in groupby(pairs, key=_run_type):
+            rt_pairs[rt] = [
+                (defect[self.tasks.key], bulk[self.tasks.key]) 
+                for defect, bulk in sorted(list(group), key=key)
+                ]
+
+        # Return only the first (best) pair for each rt
+        return {rt: lst[0] for rt, lst in rt_pairs.items()}
 
     # TODO Checking for same dft settings (e.g. OT/diag) is a little cumbersome. 
     # Maybe, in future, task doc can be defined to have OT/diag as part of input summary 
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index b9dca76b69..5202822c80 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -5,6 +5,8 @@
 from pydantic import validator
 from itertools import groupby
 
+import numpy as np
+
 from monty.json import MontyDecoder
 from monty.tempfile import ScratchDir
 
@@ -80,14 +82,6 @@ class DefectDoc(StructureMetadata):
         description="Run types for all the calculations that make up this material",
     )
 
-    best_tasks: Mapping[RunType, Tuple[str, str]] = Field(
-        None, description="Task ids (defect task, bulk task) for all tasks of a RunType"
-    )
-
-    all_tasks: Mapping[RunType, List[Tuple[str, str]]] = Field(
-        None, description="Task ids (defect task, bulk task) for all tasks of a RunType"
-    )
-
     defect_entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
@@ -111,9 +105,10 @@ class DefectDoc(StructureMetadata):
         default_factory=datetime.utcnow,
     )
 
-    metadata: Dict = Field(description="Metadata for this defect")
+    metadata: Dict = Field(None, description="Metadata for this defect")
 
-    def update(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
+    # TODO The sorting here should also maybe be done by builder
+    def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
 
         # Metadata
         self.last_updated = datetime.now()
@@ -122,19 +117,20 @@ def update(self, defect_task, bulk_task, dielectric, query="defect", key="task_i
         defect = self.get_defect_from_task(query=query, task=defect_task)
         d_id = defect_task[key]
         b_id = bulk_task[key]
-        defect_task = TaskDocument(**defect_task)
-        bulk_task = TaskDocument(**bulk_task)
+        defect_task = TaskDocument(**defect_task['output'])
+        bulk_task = TaskDocument(**bulk_task['output']) # TODO Atomate2Store 
         defect_entry = self.get_defect_entry_from_tasks(
             defect_task, bulk_task, defect, dielectric
         )
         bulk_entry = self.get_bulk_entry_from_task(bulk_task)
 
         rt = defect_task.calcs_reversed[0].run_type
-        current_largest_sc = self.defect_entries[rt].sc_entry.composition.num_atoms
+        tt = defect_task.calcs_reversed[0].task_type
+        ct = defect_task.calcs_reversed[0].calc_type
+        current_largest_sc = self.defect_entries[rt].sc_entry.composition.num_atoms if rt in self.defect_entries else 0
         potential_largest_sc = defect_entry.sc_entry.composition.num_atoms
         if (
-            rt not in self.defect_entries
-            or potential_largest_sc > current_largest_sc
+            potential_largest_sc > current_largest_sc
             or (
                 potential_largest_sc == current_largest_sc
                 and defect_entry.sc_entry.energy
@@ -143,18 +139,19 @@ def update(self, defect_task, bulk_task, dielectric, query="defect", key="task_i
         ):
             self.defect_entries[rt] = defect_entry
             self.bulk_entries[rt] = bulk_entry
-            self.best_tasks[rt] = (d_id, b_id)
+            self.run_types[rt] = d_id
+            self.task_types[tt] = d_id
+            self.calc_types[ct] = d_id
 
-        self.all_tasks[rt].append((d_id, b_id))
-        self.metadata["convergence"].append((current_largest_sc, defect_entry.corrected_energy - bulk_entry.energy))
+        self.task_ids = list(set(self.task_ids) | set(d_id))
 
-    def update_all(
+    def update_many(
         self, defect_tasks: List, bulk_tasks: List, dielectrics: List, query="defect"
     ):
         for defect_task, bulk_task, dielectric in zip(
             defect_tasks, bulk_tasks, dielectrics
         ):
-            self.update(
+            self.update_one(
                 defect_task=defect_task,
                 bulk_task=bulk_task,
                 dielectric=dielectric,
@@ -162,15 +159,7 @@ def update_all(
             )
 
     @classmethod
-    def from_tasks(
-        cls: Type[T],
-        defect_tasks: List,
-        bulk_tasks: List,
-        dielectrics: List,
-        query="defect",
-        key="task_id",
-        material_id=None,
-    ) -> T:
+    def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect", key="task_id", material_id=None) -> T:
         """
         The standard way to create this document.
         Args:
@@ -178,92 +167,30 @@ def from_tasks(
                 series of DefectEntry objects.
             query: How to retrieve the defect object stored in the task.
         """
-        defect_task_ids = [defect_task[key] for defect_task in defect_tasks]
-        bulk_task_ids = [bulk_task[key] for bulk_task in bulk_tasks]
-        bulk_tasks = [TaskDocument(**bulk_task["output"]) for bulk_task in bulk_tasks]
-        defects = [
-            cls.get_defect_from_task(query=query, task=defect_task)
-            for defect_task in defect_tasks
-        ]
-        defect_tasks = [
-            TaskDocument(**defect_task["output"]) for defect_task in defect_tasks
-        ]
+        defect_task_id = defect_task[key]
+        defect = cls.get_defect_from_task(query=query, task=defect_task)
+        defect_task = TaskDocument(**defect_task["output"])
+        bulk_task = TaskDocument(**bulk_task['output'])
 
         # Metadata
-        last_updated = datetime.now() or max(task.last_updated for task in defect_tasks)
-        created_at = datetime.now() or min(task.completed_at for task in defect_tasks)
+        last_updated = datetime.now() 
+        created_at = datetime.now() 
 
-        run_types = {
-            id: task.calcs_reversed[0].run_type
-            for id, task in zip(defect_task_ids, defect_tasks)
-        }
-        task_types = {
-            id: task.calcs_reversed[0].task_type
-            for id, task in zip(defect_task_ids, defect_tasks)
-        }
-        calc_types = {
-            id: task.calcs_reversed[0].calc_type
-            for id, task in zip(defect_task_ids, defect_tasks)
-        }
-
-        def _run_type(x):
-            return x[0].calcs_reversed[0].run_type.value
-
-        def _sort(x):
-            # TODO return kpoint density, currently just does supercell size
-            return -x[0].nsites, x[0].output.energy
+        rt = defect_task.calcs_reversed[0].run_type
+        run_types = {defect_task_id: defect_task.calcs_reversed[0].run_type}
+        task_types = {defect_task_id: defect_task.calcs_reversed[0].task_type} 
+        calc_types = {defect_task_id: defect_task.calcs_reversed[0].calc_type}
 
-        defect_entries = {}
-        bulk_entries = {}
-        all_tasks = {}
-        best_tasks = {}
-        vbm = {}
         metadata = {}
-        for key, tasks_for_runtype in groupby(
-            sorted(
-                zip(
-                    defect_tasks,
-                    bulk_tasks,
-                    defects,
-                    dielectrics,
-                    defect_task_ids,
-                    bulk_task_ids,
-                ),
-                key=_run_type,
-            ),
-            key=_run_type,
-        ):
-            sorted_tasks = sorted(tasks_for_runtype, key=_sort)
-            ents = [
-                (
-                    cls.get_defect_entry_from_tasks(
-                        defect_task, bulk_task, defect, dielectric
-                    ),
-                    cls.get_bulk_entry_from_task(bulk_task),
-                )
-                for defect_task, bulk_task, defect, dielectric, did, bid in sorted_tasks
-            ]
-            rt = run_types[sorted_tasks[0][-2]]
-            vbm[rt] = sorted_tasks[0][1].output.vbm
-            best_tasks[rt] = (sorted_tasks[0][-2], sorted_tasks[0][-1])
-            all_tasks[rt] = [(s[-2], s[-1]) for s in sorted_tasks]
-            defect_entries[rt], bulk_entries[rt] = ents[0]
-            metadata[key] = {
-                "convergence": [
-                    (
-                        sorted_tasks[i][0].nsites,
-                        defect_entries[rt].corrected_energy - bulk_entries[rt].energy,
-                    )
-                    for i in range(len(ents))
-                ]
-            }
-
-        v = next(iter(defect_entries.values()))
+        defect_entries = {rt: cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric)}
+        bulk_entries = {rt: cls.get_bulk_entry_from_task(bulk_task)}
+        vbm = {rt: bulk_task.output.vbm}
+
         metadata["defect_origin"] = (
             "intrinsic"
             if all(
-                el in v.defect.structure.composition
-                for el in v.defect.element_changes.keys()
+                el in defect_entries[rt].defect.structure.composition
+                for el in defect_entries[rt].defect.element_changes.keys()
             )
             else "extrinsic"
         )
@@ -276,17 +203,15 @@ def _sort(x):
             "calc_types": calc_types,
             "last_updated": last_updated,
             "created_at": created_at,
-            "task_ids": defect_task_ids,
-            "all_tasks": all_tasks,
-            "best_tasks": best_tasks,
-            "material_id": material_id if material_id else v.parameters["material_id"],
-            "defect": v.defect,
-            "charge": v.charge_state,
-            "name": v.defect.name,
+            "task_ids": list(defect_task_id),
+            "material_id": material_id,
+            "defect": defect_entries[rt].defect,
+            "charge": defect_entries[rt].charge_state,
+            "name": defect_entries[rt].defect.name,
             "vbm": vbm,
             "metadata": metadata,
         }
-        prim = SpacegroupAnalyzer(v.defect.structure).get_primitive_standard_structure()
+        prim = SpacegroupAnalyzer(defect_entries[rt].defect.structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
         return cls(**data)
 
@@ -343,21 +268,22 @@ def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
         corrections = {}
         metadata = {}
         for correction in ["get_freysoldt_correction", "get_freysoldt2d_correction"]:
-            c, m = getattr(cls, correction)(parameters)
-            corrections.update(c)
-            metadata.update(m)
+            corr, met = getattr(cls, correction)(parameters)
+            corrections.update(corr)
+            metadata.update(met)
         return corrections, metadata
 
     @classmethod
     def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
         if parameters["charge_state"] and not parameters.get("2d"):
-            return get_freysoldt_correction(
+            es, pot, met = get_freysoldt_correction(
                 q=parameters["charge_state"],
-                dielectric=parameters["dielectric"],
+                dielectric=np.array(parameters["dielectric"]), # TODO pmg-analysis expects np array here
                 defect_locpot=parameters["defect_v_hartree"],
                 bulk_locpot=parameters["bulk_v_hartree"],
                 defect_frac_coords=parameters["defect_frac_sc_coords"],
             )
+            return {"electrostatic": es, "potential_alignment": pot}, met
         return {}, {}
 
     @classmethod
@@ -384,7 +310,7 @@ def get_freysoldt2d_correction(cls, parameters):
                 lref.write_file("LOCPOT.ref")
                 ldef.write_file("LOCPOT.def")
 
-                return get_freysoldt2d_correction(
+                es, pot, met = get_freysoldt2d_correction(
                     q=parameters["charge_state"],
                     dielectric=dielectric,
                     defect_locpot=ldef,
@@ -393,6 +319,7 @@ def get_freysoldt2d_correction(cls, parameters):
                     energy_cutoff=520,
                     slab_buffer=2,
                 )
+                return {"electrostatic": es, "potential_alignment": pot}, met
         return {}, {}
 
     @classmethod
@@ -413,7 +340,6 @@ def get_parameters_from_tasks(
             defect_task: task dict for the defect calculation
             bulk_task: task dict for the bulk calculation
         """
-
         final_defect_structure = defect_task.structure
         final_bulk_structure = bulk_task.structure
 

From 8a2eb09de632518965569b6946098a942d222041 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 16 Dec 2022 12:07:04 -0800
Subject: [PATCH 24/50] Fix task_ids union in DefectDoc.update_one

---
 src/atomate2/cp2k/schemas/defect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 5202822c80..ba959f9922 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -143,7 +143,7 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
             self.task_types[tt] = d_id
             self.calc_types[ct] = d_id
 
-        self.task_ids = list(set(self.task_ids) | set(d_id))
+        self.task_ids = list(set(self.task_ids) | {d_id})
 
     def update_many(
         self, defect_tasks: List, bulk_tasks: List, dielectrics: List, query="defect"

From c92f11da73e91b988e373a917614f9f8f5f0cdff Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 16 Dec 2022 12:11:15 -0800
Subject: [PATCH 25/50] Fix task_ids list in DefectDoc.from_tasks

---
 src/atomate2/cp2k/schemas/defect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index ba959f9922..097a5ea804 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -203,7 +203,7 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
             "calc_types": calc_types,
             "last_updated": last_updated,
             "created_at": created_at,
-            "task_ids": list(defect_task_id),
+            "task_ids": [defect_task_id],
             "material_id": material_id,
             "defect": defect_entries[rt].defect,
             "charge": defect_entries[rt].charge_state,

From e1d5873725bae8b6b071abb7eb8db2d4bd197d03 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 19 Dec 2022 10:33:23 -0800
Subject: [PATCH 26/50] Fix: key run/task/calc type maps by task id; default aux basis to ""

---
 src/atomate2/cp2k/builders/defect.py | 4 ++--
 src/atomate2/cp2k/schemas/defect.py  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 939bba1661..305c9eb5f6 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -696,11 +696,11 @@ def __are_bulk_and_defect_commensurate(self, b, d):
         rtb = b.get('output').get('input').get('xc').split("+U")[0]
         rtd = d.get('output').get('input').get('xc').split("+U")[0]
         baux = {
-            dat['element']: dat.get('auxiliary_basis')
+            dat['element']: dat.get('auxiliary_basis', "")
             for dat in b['output']['input']['atomic_kind_info']['atomic_kinds'].values()
             }
         daux = {
-            dat['element']: dat.get('auxiliary_basis')
+            dat['element']: dat.get('auxiliary_basis', "")
             for dat in d['output']['input']['atomic_kind_info']['atomic_kinds'].values()
             }
 
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 097a5ea804..6257c71942 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -139,9 +139,9 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
         ):
             self.defect_entries[rt] = defect_entry
             self.bulk_entries[rt] = bulk_entry
-            self.run_types[rt] = d_id
-            self.task_types[tt] = d_id
-            self.calc_types[ct] = d_id
+            self.run_types[d_id] = rt
+            self.task_types[d_id] = tt
+            self.calc_types[d_id] = ct
 
         self.task_ids = list(set(self.task_ids) | {d_id})
 

From 302cff84efe5fc94815fbd174a357f19ac1bab9c Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 19 Dec 2022 11:16:17 -0800
Subject: [PATCH 27/50] Defects: store vbm per run type; skip docs missing the run type

---
 src/atomate2/cp2k/builders/defect.py |  8 +++++---
 src/atomate2/cp2k/schemas/defect.py  | 10 ++++++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 305c9eb5f6..029ab4ded0 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -696,13 +696,15 @@ def __are_bulk_and_defect_commensurate(self, b, d):
         rtb = b.get('output').get('input').get('xc').split("+U")[0]
         rtd = d.get('output').get('input').get('xc').split("+U")[0]
         baux = {
-            dat['element']: dat.get('auxiliary_basis', "")
+            dat['element']: dat.get('auxiliary_basis')
             for dat in b['output']['input']['atomic_kind_info']['atomic_kinds'].values()
             }
         daux = {
-            dat['element']: dat.get('auxiliary_basis', "")
+            dat['element']: dat.get('auxiliary_basis')
             for dat in d['output']['input']['atomic_kind_info']['atomic_kinds'].values()
             }
+        baux = baux.upper() if baux else baux
+        daux = daux.upper() if daux else daux
 
         if rtb == rtd: 
             if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):
@@ -712,7 +714,7 @@ def __are_bulk_and_defect_commensurate(self, b, d):
                     dis_ot = cid.check("force_eval/dft/scf/ot")
                     if (bis_ot and dis_ot) or (not bis_ot and not dis_ot):
                         for el in baux:
-                            if baux[el].upper() != daux[el].upper():
+                            if baux[el] != daux[el]:
                                 return False
                         return True
         return False
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 6257c71942..cd9392fcac 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -142,6 +142,7 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
             self.run_types[d_id] = rt
             self.task_types[d_id] = tt
             self.calc_types[d_id] = ct
+            self.vbm[rt] = bulk_task.output.vbm
 
         self.task_ids = list(set(self.task_ids) | {d_id})
 
@@ -431,10 +432,11 @@ def get_formation_energy_diagram(
         bulk_entries = []
         vbms = []
         for doc in self.defect_docs:
-            els = els | set(doc.defect.element_changes.keys())
-            defect_entries.append(doc.defect_entries.get(run_type))
-            bulk_entries.append(doc.bulk_entries.get(run_type))
-            vbms.append(doc.vbm.get(run_type))
+            if doc.defect_entries.get(run_type):
+                els = els | set(doc.defect.element_changes.keys())
+                defect_entries.append(doc.defect_entries.get(run_type))
+                bulk_entries.append(doc.bulk_entries.get(run_type))
+                vbms.append(doc.vbm.get(run_type))
 
         # TODO bulks and vbms
         # form en diagram takes one bulk entry and one bulk vbm

From 0abd29de3e5db8b5adde0589dcc73b2009f34998 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 19 Dec 2022 11:17:40 -0800
Subject: [PATCH 28/50] Remove .upper() calls on auxiliary-basis dicts

---
 src/atomate2/cp2k/builders/defect.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 029ab4ded0..ef24a33948 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -703,8 +703,6 @@ def __are_bulk_and_defect_commensurate(self, b, d):
             dat['element']: dat.get('auxiliary_basis')
             for dat in d['output']['input']['atomic_kind_info']['atomic_kinds'].values()
             }
-        baux = baux.upper() if baux else baux
-        daux = daux.upper() if daux else daux
 
         if rtb == rtd: 
             if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):

From e4e456a357545d06c9679028a8094f63d57c4dfe Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 20 Dec 2022 12:21:40 -0800
Subject: [PATCH 29/50] builder test

---
 src/atomate2/cp2k/builders/defect.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index ef24a33948..51be413765 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -486,6 +486,7 @@ def __filter_and_group_tasks(self, tasks):
             # TODO remove oxidation state because spins/oxidation cause errors in comparison.
             #  but they shouldnt if those props are close in value
             d['structure'].remove_oxidation_states()
+            d['defect'].user_charges = [d['structure'].charge]
 
         def key(x):
             s = x['defect'].structure
@@ -505,10 +506,6 @@ def are_equal(x, y):
             if x['structure'].charge != y['structure'].charge:
                 return False
 
-            # Are the defect objects eq.
-            if x['defect'] == y['defect']:
-                return True
-
             # Are the final structures equal
             # element-changes needed for ghost vacancies, since sm.fit can't distinguish them
             if x['defect'].element_changes == y['defect'].element_changes and \
@@ -555,10 +552,24 @@ def __get_defect_doc(self, defect):
             for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)
         ]
         for doc in docs:
-            if defect == doc.defect:
+            if self.__defect_match(defect, doc.defect):
                 return doc
         return None
 
+    def __defect_match(self, x, y):
+
+        sm = StructureMatcher()
+
+        # Defects with diff charges return true for the native __eq__
+        if x.user_charges[0] != y.user_charges[0]:
+            return False
+
+        if x.element_changes == y.element_changes and \
+                sm.fit(x.defect_structure, y.defect_structure):
+            return True
+
+        return False
+
     # TODO should move to returning dielectric doc or continue returning the total diel tensor?
     def __get_dielectric(self, key):
         """

From 66ef79336b86daeb341c98cb2bdfa326ffaf22c2 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 20 Dec 2022 13:54:21 -0800
Subject: [PATCH 30/50] 2d debug

---
 src/atomate2/cp2k/builders/defect.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 51be413765..94a1ea37f8 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -403,6 +403,8 @@ def get_items(self) -> Iterator[List[Dict]]:
             if not task_ids:
                 continue 
             doc = self.__get_defect_doc(defect)
+            if doc:
+                self.logger.info(f"DOC IS {doc.defect.__repr__()}")
             item_bundle = self.__get_item_bundle(task_ids)
             m = next(iter(task_ids.values()))[1]
             material_id = self.mpid_map[m]
@@ -557,13 +559,12 @@ def __get_defect_doc(self, defect):
         return None
 
     def __defect_match(self, x, y):
-
+        """Match two defects, including their charges"""
         sm = StructureMatcher()
-
-        # Defects with diff charges return true for the native __eq__
         if x.user_charges[0] != y.user_charges[0]:
             return False
 
+        # Elem. changes needed to distinguish ghost vacancies
         if x.element_changes == y.element_changes and \
                 sm.fit(x.defect_structure, y.defect_structure):
             return True

From abf89aafbbdae265bc023753377502775f1879c0 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Wed, 21 Dec 2022 16:02:58 -0800
Subject: [PATCH 31/50] task ids

---
 src/atomate2/cp2k/builders/defect.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 94a1ea37f8..d2a135763b 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -408,7 +408,7 @@ def get_items(self) -> Iterator[List[Dict]]:
             item_bundle = self.__get_item_bundle(task_ids)
             m = next(iter(task_ids.values()))[1]
             material_id = self.mpid_map[m]
-            yield doc, item_bundle, material_id, task_ids
+            yield doc, item_bundle, material_id, defect_task_group
 
     def process_item(self, items):
         """
@@ -432,6 +432,7 @@ def process_item(self, items):
                         )
                 else:
                     defect_doc.update_one(defect_task, bulk_task, dielectric, query=self.defect_query, key=self.tasks.key) # TODO Atomate2Store wrapper
+                defect_doc.task_ids = list(set(task_ids + defect_doc.task_ids)) # TODO should I store the bulk id too?
             return jsanitize(defect_doc.dict(), allow_bson=True, enum_values=True, strict=True)
         return {}
 

From a852fcbee12221c74e7df245b07eb30374a7c6ad Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 23 Dec 2022 10:15:44 -0800
Subject: [PATCH 32/50] First pass at defect validation schema

---
 src/atomate2/cp2k/schemas/defect.py | 65 ++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index cd9392fcac..26644009eb 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -11,9 +11,11 @@
 from monty.tempfile import ScratchDir
 
 from pymatgen.core import Structure, Element
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
 from pymatgen.analysis.phase_diagram import PhaseDiagram
-from pymatgen.analysis.defects.core import Defect, DefectType
+from pymatgen.analysis.defects.core import Defect
 from pymatgen.analysis.defects.corrections.freysoldt import (
     get_freysoldt_correction,
     get_freysoldt2d_correction,
@@ -21,16 +23,12 @@
 from pymatgen.analysis.defects.thermo import (
     DefectEntry,
     DefectSiteFinder,
-    FormationEnergyDiagram,
     MultiFormationEnergyDiagram
 )
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-
-from emmet.core.utils import ValueEnum
+from pymatgen.analysis.defects.finder import DefectSiteFinder
 
 from atomate2 import SETTINGS
 from atomate2.common.schemas.structure import StructureMetadata
-from atomate2.cp2k.schemas.calc_types.utils import run_type, task_type, calc_type
 from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
 from atomate2.cp2k.schemas.task import TaskDocument
 
@@ -38,7 +36,7 @@
 
 T = TypeVar("T", bound="DefectDoc")
 S = TypeVar("S", bound="DefectiveMaterialDoc")
-
+V = TypeVar("V", bound="DefectValidation")
 
 class DefectDoc(StructureMetadata):
     """
@@ -49,26 +47,19 @@ class DefectDoc(StructureMetadata):
     """
 
     property_name: ClassVar[str] = "defect"
-
     defect: Defect = Field(
         None, description="Pymatgen defect object for this defect doc"
     )
-
     charge: int = Field(None, description="Charge state for this defect")
-
     name: str = Field(
         None, description="Name of this defect as generated by the defect object"
     )
-
     material_id: str = Field(
         None, description="Unique material ID for the bulk material"
     )  # TODO Change to MPID
-
-    # TODO Should it be all (defect + bulk) ids?
     task_ids: List[str] = Field(
         None, description="All defect task ids used in creating this defect doc."
     )
-
     calc_types: Mapping[str, CalcType] = Field(  # type: ignore
         None,
         description="Calculation types for all the calculations that make up this material",
@@ -81,31 +72,26 @@ class DefectDoc(StructureMetadata):
         None,
         description="Run types for all the calculations that make up this material",
     )
-
     defect_entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
-
     bulk_entries: Mapping[RunType, ComputedStructureEntry] = Field(
         None, description="Computed structure entry for the bulk calc."
     )
-
     vbm: Mapping[RunType, float] = Field(
         None,
         description="VBM for bulk task of each run type. Used for aligning potential",
     )
-
     last_updated: datetime = Field(
         description="Timestamp for when this document was last updated",
         default_factory=datetime.utcnow,
     )
-
     created_at: datetime = Field(
         description="Timestamp for when this material document was first created",
         default_factory=datetime.utcnow,
     )
-
     metadata: Dict = Field(None, description="Metadata for this defect")
+    valid: Mapping[RunType, Dict] = Field(None, description="Whether each run type has a valid entry")
 
     # TODO The sorting here should also maybe be done by builder
     def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
@@ -119,7 +105,7 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
         b_id = bulk_task[key]
         defect_task = TaskDocument(**defect_task['output'])
         bulk_task = TaskDocument(**bulk_task['output']) # TODO Atomate2Store 
-        defect_entry = self.get_defect_entry_from_tasks(
+        defect_entry, valid = self.get_defect_entry_from_tasks(
             defect_task, bulk_task, defect, dielectric
         )
         bulk_entry = self.get_bulk_entry_from_task(bulk_task)
@@ -143,6 +129,7 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
             self.task_types[d_id] = tt
             self.calc_types[d_id] = ct
             self.vbm[rt] = bulk_task.output.vbm
+            self.valid[rt] = valid
 
         self.task_ids = list(set(self.task_ids) | {d_id})
 
@@ -183,7 +170,9 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
         calc_types = {defect_task_id: defect_task.calcs_reversed[0].calc_type}
 
         metadata = {}
-        defect_entries = {rt: cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric)}
+        defect_entry, valid = cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric)
+        valid = {rt: valid}
+        defect_entries = {rt: defect_entry}
         bulk_entries = {rt: cls.get_bulk_entry_from_task(bulk_task)}
         vbm = {rt: bulk_task.output.vbm}
 
@@ -211,6 +200,7 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
             "name": defect_entries[rt].defect.name,
             "vbm": vbm,
             "metadata": metadata,
+            "valid": valid,
         }
         prim = SpacegroupAnalyzer(defect_entries[rt].defect.structure).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
@@ -254,8 +244,8 @@ def get_defect_entry_from_tasks(
             sc_defect_frac_coords=parameters["defect_frac_sc_coords"],
             corrections=corrections,
         )
-
-        return defect_entry
+        valid = DefectValidation().process_entry(parameters)
+        return defect_entry, valid
 
     @classmethod
     def get_bulk_entry_from_task(cls, bulk_task: TaskDocument):
@@ -292,7 +282,7 @@ def get_freysoldt2d_correction(cls, parameters):
 
         from pymatgen.io.vasp.outputs import VolumetricData as VaspVolumetricData
 
-        if parameters["charge_state"] and parameters.get("2d"):
+        if False: #parameters["charge_state"] and parameters.get("2d"):
             eps_parallel = (
                 parameters["dielectric"][0][0] + parameters["dielectric"][1][1]
             ) / 2
@@ -361,6 +351,7 @@ def get_parameters_from_tasks(
         parameters = {
             "defect_energy": defect_task.output.energy,
             "bulk_energy": bulk_task.output.energy,
+            "initial_defect_structure": defect_task.input.structure,
             "final_defect_structure": final_defect_structure,
             "charge_state": defect_task.output.structure.charge,
             "defect_frac_sc_coords": defect_frac_sc_coords,
@@ -377,6 +368,30 @@ def get_parameters_from_tasks(
 
         return parameters
 
+class DefectValidation(BaseModel):
+    """Validate a task document for defect processing"""
+
+    MAX_ATOMIC_RELAXATION: float = Field(
+        0.02, 
+        description="Threshold for the mean absolute displacement of atoms outside a defect's radius of isolution"
+        )
+
+    def process_entry(self, parameters) -> V:
+        v = {} 
+        v.update(self._atomic_relaxation(parameters))
+        return v
+
+    def _atomic_relaxation(self, parameters):
+        in_struc = parameters["initial_defect_structure"]
+        out_struc = parameters["final_defect_structure"]
+        sites = out_struc.get_sites_in_sphere(parameters['defect_frac_sc_coords'], get_truncated_coulomb_cutoff(in_struc), include_index=True)
+        inside_sphere = [site.index for site in sites]
+        outside_sphere = [i for i in range(len(out_struc)) if i not in inside_sphere]
+        distances = np.array([site.distance(in_struc[i]) for i, site in enumerate(out_struc)])
+        distances_outside = distances[outside_sphere]
+        if np.mean(distances_outside) > self.MAX_ATOMIC_RELAXATION:
+            return {"atomic_relaxation": False}
+        return {"atomic_relaxation": True}
 
 class DefectiveMaterialDoc(StructureMetadata):
     """Document containing all / many defect tasks for a single material ID"""

From 32a83e9cb449086dbba82ee7bbddc84ea6ff0b86 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 23 Dec 2022 10:30:03 -0800
Subject: [PATCH 33/50] Store bulk defect pair drop others

---
 src/atomate2/cp2k/schemas/defect.py | 29 +++++++----------------------
 1 file changed, 7 insertions(+), 22 deletions(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 26644009eb..a0835f5e8c 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -57,21 +57,11 @@ class DefectDoc(StructureMetadata):
     material_id: str = Field(
         None, description="Unique material ID for the bulk material"
     )  # TODO Change to MPID
+    defect_ids: Mapping[RunType, str] = Field(None, description="Map run types of defect entry to task id")
+    bulk_ids: Mapping[RunType, str] = Field(None, description="Map run types of bulk entry to task id")
     task_ids: List[str] = Field(
         None, description="All defect task ids used in creating this defect doc."
     )
-    calc_types: Mapping[str, CalcType] = Field(  # type: ignore
-        None,
-        description="Calculation types for all the calculations that make up this material",
-    )
-    task_types: Mapping[str, TaskType] = Field(
-        None,
-        description="Task types for all the calculations that make up this material",
-    )
-    run_types: Mapping[str, RunType] = Field(
-        None,
-        description="Run types for all the calculations that make up this material",
-    )
     defect_entries: Mapping[RunType, DefectEntry] = Field(
         None, description="Dictionary for tracking entries for CP2K calculations"
     )
@@ -93,7 +83,6 @@ class DefectDoc(StructureMetadata):
     metadata: Dict = Field(None, description="Metadata for this defect")
     valid: Mapping[RunType, Dict] = Field(None, description="Whether each run type has a valid entry")
 
-    # TODO The sorting here should also maybe be done by builder
     def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
 
         # Metadata
@@ -124,10 +113,9 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
             )
         ):
             self.defect_entries[rt] = defect_entry
+            self.defect_ids[rt] = d_id
             self.bulk_entries[rt] = bulk_entry
-            self.run_types[d_id] = rt
-            self.task_types[d_id] = tt
-            self.calc_types[d_id] = ct
+            self.bulk_ids[b_id] = b_id
             self.vbm[rt] = bulk_task.output.vbm
             self.valid[rt] = valid
 
@@ -158,6 +146,7 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
         defect_task_id = defect_task[key]
         defect = cls.get_defect_from_task(query=query, task=defect_task)
         defect_task = TaskDocument(**defect_task["output"])
+        bulk_task_id = bulk_task[key]
         bulk_task = TaskDocument(**bulk_task['output'])
 
         # Metadata
@@ -165,9 +154,6 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
         created_at = datetime.now() 
 
         rt = defect_task.calcs_reversed[0].run_type
-        run_types = {defect_task_id: defect_task.calcs_reversed[0].run_type}
-        task_types = {defect_task_id: defect_task.calcs_reversed[0].task_type} 
-        calc_types = {defect_task_id: defect_task.calcs_reversed[0].calc_type}
 
         metadata = {}
         defect_entry, valid = cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric)
@@ -188,9 +174,8 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
         data = {
             "defect_entries": defect_entries,
             "bulk_entries": bulk_entries,
-            "run_types": run_types,
-            "task_types": task_types,
-            "calc_types": calc_types,
+            "defect_ids": {rt: defect_task_id},
+            "bulk_ids": {rt: bulk_task_id},
             "last_updated": last_updated,
             "created_at": created_at,
             "task_ids": [defect_task_id],

From eaa6361ea265cef65049ba61da01dc96fed81b04 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 23 Dec 2022 17:18:58 -0800
Subject: [PATCH 34/50] Defects

---
 src/atomate2/cp2k/builders/defect.py | 81 +++++++++++++++++-----------
 src/atomate2/cp2k/schemas/defect.py  | 32 ++++++-----
 2 files changed, 68 insertions(+), 45 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index d2a135763b..9f2ccce408 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -1,10 +1,7 @@
 from datetime import datetime
-from itertools import chain, groupby, combinations
-from re import A
-from tkinter import W
+from itertools import groupby
 from typing import Dict, Iterator, List, Literal, Optional
-from copy import deepcopy
-from math import ceil
+
 import numpy as np
 from monty.json import MontyDecoder, jsanitize
 
@@ -14,7 +11,6 @@
 
 from pymatgen.core import Structure
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
-from pymatgen.electronic_structure.dos import CompleteDos
 from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 from pymatgen.io.cp2k.inputs import Cp2kInput
 
@@ -24,7 +20,6 @@
 from atomate2.cp2k.schemas.task import TaskDocument
 from atomate2.cp2k.schemas.defect import DefectDoc, DefectiveMaterialDoc
 from atomate2.cp2k.schemas.calc_types import TaskType
-from atomate2.cp2k.schemas.calc_types.utils import run_type
 
 from emmet.core.electronic_structure import ElectronicStructureDoc
 
@@ -64,7 +59,7 @@ class DefectBuilder(Builder):
             TaskType.Structure_Optimization.value, 
             TaskType.Static.value
             ]
-    
+
     def __init__(
         self,
         tasks: Store,
@@ -253,7 +248,7 @@ def prechunk(self, number_splits: int) -> Iterator[Dict]:
             for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
         }
 
-        N = ceil(len(defect_tasks) / number_splits)
+        N = np.ceil(len(defect_tasks) / number_splits)
         for task_chunk in grouper(defect_tasks, N):
             yield {"query": {"task_id": {"$in": task_chunk + list(bulk_tasks)}}}
 
@@ -372,7 +367,6 @@ def get_items(self) -> Iterator[List[Dict]]:
             for d in self.defects.query({}, ["task_ids"])
             for t_id in d.get("task_ids", [])
         }
-
         all_tasks = defect_tasks | bulk_tasks
 
         self.logger.debug("All tasks: {}".format(len(all_tasks)))
@@ -496,25 +490,11 @@ def key(x):
             return get_sg(s), s.composition.reduced_composition
 
         def are_equal(x, y):
-            """
-            To decide if defects are equal. Either the defect objects are
-            equal, OR two different defect objects relaxed to the same final structure
-            (common with interstitials).
-
-            TODO Need a way to do the output structure comparison for a X atom defect cell
-            TODO which can be embedded in a Y atom defect cell up to tolerance.
-            """
-
-            # Defects with diff charges return true for the native __eq__
+            """To decide if defects are equal."""
             if x['structure'].charge != y['structure'].charge:
                 return False
-
-            # Are the final structures equal
-            # element-changes needed for ghost vacancies, since sm.fit can't distinguish them
-            if x['defect'].element_changes == y['defect'].element_changes and \
-                    sm.fit(x['structure'], y['structure']):
+            if x['defect'] == y['defect']:
                 return True
-
             return False
 
         sorted_s_list = sorted(enumerate(defects), key=lambda x: key(x[1]))
@@ -533,7 +513,7 @@ def are_equal(x, y):
                     (defects[i]['defect'], [defects[i][self.tasks.key] for i in matches])
                 )
 
-        self.logger.debug(f"All groups {all_groups}")
+        self.logger.debug(f"{len(all_groups)} groups")
         return all_groups
 
     def __get_defect_from_task(self, task):
@@ -654,7 +634,7 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids) -> list[tuple]:
             self.tasks.query(
                 criteria={
                     self.tasks.key: {'$in': list(bulk_ids)},
-                    'output.composition_reduced': jsanitize(ps.composition.to_reduced_dict),
+                    'output.formula_pretty': jsanitize(ps.composition.reduced_formula),
                 },
                 properties=props
             )
@@ -792,10 +772,7 @@ def __get_pristine_supercell(self, task):
         else:
             return out_structure
 
-#TODO Major problem with this builder. materials store is used to sync the diel, elec, and pd with a single material id
-#TODO This is a problem because the material id in vasp store is not synced to cp2k store
-#TODO Also the chempots needed to adjust entries must come from cp2k, but you need to give vasp to sync the others
-#TODO Thermo store is being replaced with a manual definition of chempots until further notice
+
 class DefectiveMaterialBuilder(Builder):
 
     """
@@ -960,6 +937,46 @@ def __get_thermos(self, composition) -> List:
         return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None))
 
 
+class DefectValidator(Builder):
+
+    def __init__(
+        self, 
+        tasks: Store, 
+        defect_validation: Store, 
+        chunk_size: int = 1000,
+        defect_query = 'output.additional_json.info.defect',
+       ):
+        self.tasks = tasks
+        self.defect_validation = defect_validation
+        self.chunk_size = chunk_size
+        self.defect_query = defect_query
+        super().__init__(sources=tasks, targets=defect_validation, chunk_size=chunk_size)
+
+    def get_items(self):
+        self.logger.info("Getting tasks")
+        tids = list(self.tasks.query(criteria={self.defect_query: {"$exists": True}}, properties=[self.tasks.key]))
+        self.logger.info(f"{len(tids)} to process")
+        for t in self.tasks.query():
+            yield t
+    
+    def process_item(self, item):
+        from atomate2.cp2k.schemas.defect import DefectValidation
+        tid = item[self.tasks.key]
+        return jsanitize(DefectValidation.process_task(item, tid).dict(), allow_bson=True, enum_values=True, strict=True)
+
+    def update_targets(self, items: List):
+        """
+        Inserts the processed validation documents into the defect_validation store
+        """
+        items = [item for item in items if item]
+        if len(items) > 0:
+            self.logger.info(f"Updating {len(items)} defects")
+            self.defect_validation.update(items, key=self.defect_validation.key)
+        else:
+            self.logger.info("No items to update")
+        return super().update_targets(items)
+
+
 def unpack(query, d):
     """
     Unpack a mongo-style query into dictionary retrieval
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index a0835f5e8c..5740f9e6ed 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -1,6 +1,6 @@
 from datetime import datetime
 from tokenize import group
-from typing import ClassVar, TypeVar, Type, Dict, Tuple, Mapping, List
+from typing import ClassVar, TypeVar, Type, Dict, Tuple, Mapping, List, Callable
 from pydantic import BaseModel, Field
 from pydantic import validator
 from itertools import groupby
@@ -15,7 +15,7 @@
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
 from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
 from pymatgen.analysis.phase_diagram import PhaseDiagram
-from pymatgen.analysis.defects.core import Defect
+from pymatgen.analysis.defects.core import Defect, Adsorbate
 from pymatgen.analysis.defects.corrections.freysoldt import (
     get_freysoldt_correction,
     get_freysoldt2d_correction,
@@ -115,7 +115,7 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
             self.defect_entries[rt] = defect_entry
             self.defect_ids[rt] = d_id
             self.bulk_entries[rt] = bulk_entry
-            self.bulk_ids[b_id] = b_id
+            self.bulk_ids[rt] = b_id
             self.vbm[rt] = bulk_task.output.vbm
             self.valid[rt] = valid
 
@@ -229,6 +229,7 @@ def get_defect_entry_from_tasks(
             sc_defect_frac_coords=parameters["defect_frac_sc_coords"],
             corrections=corrections,
         )
+        parameters['defect'] = defect
         valid = DefectValidation().process_entry(parameters)
         return defect_entry, valid
 
@@ -361,9 +362,12 @@ class DefectValidation(BaseModel):
         description="Threshold for the mean absolute displacement of atoms outside a defect's radius of isolution"
         )
 
+    DESORPTION_DISTANCE: float = Field(3, description="Distance to consider adsorbate as desorbed")
+
     def process_entry(self, parameters) -> V:
         v = {} 
         v.update(self._atomic_relaxation(parameters))
+        v.update(self._desorption(parameters))
         return v
 
     def _atomic_relaxation(self, parameters):
@@ -378,6 +382,15 @@ def _atomic_relaxation(self, parameters):
             return {"atomic_relaxation": False}
         return {"atomic_relaxation": True}
 
+    def _desorption(self, parameters):
+        if isinstance(parameters['defect'], Adsorbate):
+            out_struc = parameters["final_defect_structure"]
+            defect_site =  out_struc.get_sites_in_sphere(parameters['defect_frac_sc_coords'], 0.1, include_index=True)[0]
+            distances = [defect_site.distance(site) for site in out_struc]
+            if all(d > self.DESORPTION_DISTANCE for d in distances):
+                return {'desorption': False}
+        return {'desorption': True}
+
 class DefectiveMaterialDoc(StructureMetadata):
     """Document containing all / many defect tasks for a single material ID"""
 
@@ -422,28 +435,21 @@ def get_formation_energy_diagram(
         run_type: RunType | str,
         atomic_entries: List[ComputedEntry],
         phase_diagram: PhaseDiagram,
-        filters: Dict | None = None,
+        filters: List[Callable, None] = None,
     ) -> MultiFormationEnergyDiagram:
 
-        filters = filters if filters else {}
-
+        filters = filters if filters else lambda _: True
         els = set()
         defect_entries = []
         bulk_entries = []
         vbms = []
-        for doc in self.defect_docs:
+        for doc in filter(lambda x: all(f(x) for f in filters), self.defect_docs):
             if doc.defect_entries.get(run_type):
                 els = els | set(doc.defect.element_changes.keys())
                 defect_entries.append(doc.defect_entries.get(run_type))
                 bulk_entries.append(doc.bulk_entries.get(run_type))
                 vbms.append(doc.vbm.get(run_type))
 
-        # TODO bulks and vbms
-        # form en diagram takes one bulk entry and one bulk vbm
-        # These, however, can be different for each defect/bulk task pair
-        # Need to convert the differences into energy adjustments so that
-        # form en diagram is consistent with all of them
-
         return MultiFormationEnergyDiagram.with_atomic_entries(
             bulk_entry=bulk_entries[0],
             defect_entries=defect_entries,

From a9388e8ff7c3068d69c46bc265da469fb48283c1 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 30 Dec 2022 21:44:20 -0800
Subject: [PATCH 35/50] Updates

---
 src/atomate2/cp2k/schemas/defect.py | 12 +++++++++---
 src/atomate2/cp2k/sets/defect.py    |  6 ++----
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 5740f9e6ed..aa946c3d2a 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -365,12 +365,14 @@ class DefectValidation(BaseModel):
     DESORPTION_DISTANCE: float = Field(3, description="Distance to consider adsorbate as desorbed")
 
     def process_entry(self, parameters) -> V:
+        """Gets a dictionary of {validator: result}. Result true for passing, false for failing."""
         v = {} 
         v.update(self._atomic_relaxation(parameters))
         v.update(self._desorption(parameters))
         return v
 
     def _atomic_relaxation(self, parameters):
+        """Returns false if the mean displacement outside the isolation radius is greater than the cutoff"""
         in_struc = parameters["initial_defect_structure"]
         out_struc = parameters["final_defect_structure"]
         sites = out_struc.get_sites_in_sphere(parameters['defect_frac_sc_coords'], get_truncated_coulomb_cutoff(in_struc), include_index=True)
@@ -383,10 +385,14 @@ def _atomic_relaxation(self, parameters):
         return {"atomic_relaxation": True}
 
     def _desorption(self, parameters):
+        """Returns false if the adsorbate site is farther than DESORPTION_DISTANCE from every other atom."""
         if isinstance(parameters['defect'], Adsorbate):
             out_struc = parameters["final_defect_structure"]
-            defect_site =  out_struc.get_sites_in_sphere(parameters['defect_frac_sc_coords'], 0.1, include_index=True)[0]
-            distances = [defect_site.distance(site) for site in out_struc]
+            defect_site =  out_struc.get_sites_in_sphere(
+                out_struc.lattice.get_cartesian_coords(parameters['defect_frac_sc_coords']), 
+                0.1, include_index=True
+                )[0]
+            distances = [defect_site.distance(site) for i, site in enumerate(out_struc) if i != defect_site.index]
             if all(d > self.DESORPTION_DISTANCE for d in distances):
                 return {'desorption': False}
         return {'desorption': True}
@@ -435,7 +441,7 @@ def get_formation_energy_diagram(
         run_type: RunType | str,
         atomic_entries: List[ComputedEntry],
         phase_diagram: PhaseDiagram,
-        filters: List[Callable, None] = None,
+        filters: List[Callable] | None = None,
     ) -> MultiFormationEnergyDiagram:
 
         filters = filters if filters else lambda _: True
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 62e68373dc..0fc8574c86 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -10,7 +10,6 @@
 from atomate2.cp2k.sets.base import Cp2kInputGenerator, multiple_input_updators
 from atomate2.cp2k.sets.core import (
     HybridSetGenerator, StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
-    HybridStaticSetGenerator, HybridRelaxSetGenerator, HybridCellOptSetGenerator
 ) 
 logger = logging.getLogger(__name__)
 
@@ -22,9 +21,8 @@ class DefectSetGenerator(Cp2kInputGenerator):
     """
 
     def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:
-        """
-        """
-        return {'print_v_hartree': True, "print_pdos": True}
+        """Get input updates"""
+        return {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
 
 @dataclass
 @multiple_input_updators()

From 38862bbe05fb670fad833da28033978df1df9caf Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 3 Jan 2023 09:12:56 -0800
Subject: [PATCH 36/50] Make default filters a list

---
 src/atomate2/cp2k/schemas/defect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index aa946c3d2a..4e6120dfab 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -444,7 +444,7 @@ def get_formation_energy_diagram(
         filters: List[Callable] | None = None,
     ) -> MultiFormationEnergyDiagram:
 
-        filters = filters if filters else lambda _: True
+        filters = filters if filters else [lambda _: True]
         els = set()
         defect_entries = []
         bulk_entries = []

From a52a676ec2dbd08be06aded0f399e5b0e4833e35 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Jan 2023 10:30:15 -0800
Subject: [PATCH 37/50] Add basic defect input set generator test

---
 tests/cp2k/sets/test_defect.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 tests/cp2k/sets/test_defect.py

diff --git a/tests/cp2k/sets/test_defect.py b/tests/cp2k/sets/test_defect.py
new file mode 100644
index 0000000000..631c6825f2
--- /dev/null
+++ b/tests/cp2k/sets/test_defect.py
@@ -0,0 +1,18 @@
+import pytest
+
+def test_input_generators(si_structure):
+    from atomate2.cp2k.sets.defect import (
+        DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
+        DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
+    )
+    
+    # check that all generators give the correct printing
+    for gen in [
+        DefectSetGenerator(), DefectStaticSetGenerator(), DefectRelaxSetGenerator(), 
+        DefectCellOptSetGenerator(), DefectHybridStaticSetGenerator(),
+        DefectHybridRelaxSetGenerator(), DefectHybridCellOptSetGenerator()
+        ]:
+        input_set = gen.get_input_set(si_structure)
+        assert input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/PDOS") or input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/DOS")
+        assert input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/V_HARTREE_CUBE")
+    

From 802308eb919f953eb7fddacf8597638dace310af Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 6 Jan 2023 13:42:37 -0800
Subject: [PATCH 38/50] Use __post_init__ instead of multiple_input_updators

---
 src/atomate2/cp2k/sets/defect.py | 53 +++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 0fc8574c86..33f06e5019 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -7,16 +7,19 @@
 
 from pymatgen.core import Structure
 
-from atomate2.cp2k.sets.base import Cp2kInputGenerator, multiple_input_updators
+from atomate2.cp2k.sets.base import Cp2kInputGenerator
 from atomate2.cp2k.sets.core import (
-    HybridSetGenerator, StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
-) 
+    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
+    HybridStaticSetGenerator, HybridRelaxSetGenerator, HybridCellOptSetGenerator
+)
 logger = logging.getLogger(__name__)
 
+DEFECT_SET_UPDATES = {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
+
 @dataclass
 class DefectSetGenerator(Cp2kInputGenerator):
     """
-    Base input set generator for defect calculations. Adds printing of the 
+    Base input set generator for defect calculations. Adds printing of the
     partial density of states and the electrostatic potential.
     """
 
@@ -25,31 +28,37 @@ def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:
         return {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
 
 @dataclass
-@multiple_input_updators()
-class DefectStaticSetGenerator(DefectSetGenerator, StaticSetGenerator):
-    pass    
+class DefectStaticSetGenerator(StaticSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
 
 @dataclass
-@multiple_input_updators()
-class DefectRelaxSetGenerator(DefectSetGenerator, RelaxSetGenerator):
-    pass
+class DefectRelaxSetGenerator(RelaxSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
 
 @dataclass
-@multiple_input_updators()
-class DefectCellOptSetGenerator(DefectSetGenerator, CellOptSetGenerator):
-    pass
+class DefectCellOptSetGenerator(CellOptSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
 
 @dataclass
-@multiple_input_updators()
-class DefectHybridStaticSetGenerator(DefectSetGenerator, StaticSetGenerator, HybridSetGenerator):
-    pass   
+class DefectHybridStaticSetGenerator(HybridStaticSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
 
 @dataclass
-@multiple_input_updators()
-class DefectHybridRelaxSetGenerator(DefectSetGenerator, RelaxSetGenerator, HybridSetGenerator):
-    pass
+class DefectHybridRelaxSetGenerator(HybridRelaxSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
 
 @dataclass
-@multiple_input_updators()
-class DefectHybridCellOptSetGenerator(DefectSetGenerator, CellOptSetGenerator, HybridSetGenerator):
-    pass 
\ No newline at end of file
+class DefectHybridCellOptSetGenerator(HybridCellOptSetGenerator):
+
+    def __post_init__(self):
+        self.user_input_settings.update(DEFECT_SET_UPDATES)
\ No newline at end of file

From 791381c73b846807dbd27f915c467929e6746c94 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 9 Jan 2023 10:54:08 -0800
Subject: [PATCH 39/50] whitespace

---
 tests/cp2k/sets/test_defect.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/cp2k/sets/test_defect.py b/tests/cp2k/sets/test_defect.py
index 631c6825f2..d2f42b505b 100644
--- a/tests/cp2k/sets/test_defect.py
+++ b/tests/cp2k/sets/test_defect.py
@@ -5,14 +5,13 @@ def test_input_generators(si_structure):
         DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
         DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
     )
-    
+
     # check that all generators give the correct printing
     for gen in [
-        DefectSetGenerator(), DefectStaticSetGenerator(), DefectRelaxSetGenerator(), 
+        DefectSetGenerator(), DefectStaticSetGenerator(), DefectRelaxSetGenerator(),
         DefectCellOptSetGenerator(), DefectHybridStaticSetGenerator(),
         DefectHybridRelaxSetGenerator(), DefectHybridCellOptSetGenerator()
         ]:
         input_set = gen.get_input_set(si_structure)
         assert input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/PDOS") or input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/DOS")
         assert input_set.cp2k_input.check("FORCE_EVAL/DFT/PRINT/V_HARTREE_CUBE")
-    

From 2928336c6ac741ea3d2c02d49771f013f1d92b0b Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 10 Jan 2023 11:17:18 -0800
Subject: [PATCH 40/50] Defect jobs: take charge from defect.user_charges

---
 src/atomate2/cp2k/flows/defect.py | 62 +++++++++++++------------------
 src/atomate2/cp2k/jobs/defect.py  | 43 ++++++++++-----------
 2 files changed, 46 insertions(+), 59 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 5567d60502..79e5ae27f3 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -19,20 +19,9 @@
 from pymatgen.analysis.defects.thermo import DefectEntry
 from pymatgen.analysis.defects.supercells import get_sc_fromstruct
 
-from atomate2.cp2k.jobs.base import BaseCp2kMaker 
-from atomate2.cp2k.jobs.core import StaticMaker, HybridStaticMaker, RelaxMaker, HybridRelaxMaker, CellOptMaker, HybridCellOptMaker
-
-from atomate2.cp2k.schemas.defect import DefectDoc
-from atomate2.cp2k.sets.core import (
-    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator
-)
-
-from atomate2.cp2k.sets.defect import (
-    DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
-    DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
-)
+from atomate2.cp2k.jobs.base import BaseCp2kMaker
 from atomate2.cp2k.jobs.defect import (
-    BaseDefectMaker, DefectStaticMaker, DefectRelaxMaker, DefectCellOptMaker,
+    DefectStaticMaker, DefectRelaxMaker, DefectCellOptMaker,
     DefectHybridStaticMaker, DefectHybridRelaxMaker, DefectHybridCellOptMaker
 )
 
@@ -43,23 +32,23 @@
 @dataclass
 class DefectHybridStaticFlowMaker(HybridStaticFlowMaker):
 
-    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
     hybrid_maker: BaseCp2kMaker = field(default=DefectHybridStaticMaker(
         copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
         )
 
-@dataclass 
+@dataclass
 class DefectHybridRelaxFlowMaker(HybridRelaxFlowMaker):
 
-    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
     hybrid_maker: BaseCp2kMaker = field(default=DefectHybridRelaxMaker(
         copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
         )
 
-@dataclass 
+@dataclass
 class DefectHybridCellOptFlowMaker(HybridCellOptFlowMaker):
 
-    initialize_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
+    pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
     hybrid_maker: BaseCp2kMaker = field(default=DefectHybridCellOptMaker(
         copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
         )
@@ -73,7 +62,7 @@ class FormationEnergyMaker(Maker):
 
     Parameters
     ----------
-    name: This flow's name. i.e. "defect formation energy" 
+    name: This flow's name. i.e. "defect formation energy"
     run_bulk: whether to run the bulk supercell as a static ("static")
         calculation, a full relaxation ("relax"), or to skip it (False)
     hybrid_functional: If provided, this activates hybrid version of the
@@ -91,10 +80,10 @@ class FormationEnergyMaker(Maker):
     """
 
     name: str = "defect formation energy"
-    run_bulk: Literal["static", "relax"] | bool = field(default="static") 
+    run_bulk: Literal["static", "relax"] | bool = field(default="static")
     hybrid_functional: str | None = field(default=None)
     initialize_with_pbe: bool = field(default=True)
-    
+
     supercell_matrix: NDArray = field(default=None)
     min_atoms: int = field(default=80)
     max_atoms: int = field(default=240)
@@ -106,7 +95,7 @@ def __post_init__(self):
             if self.hybrid_functional:
                 self.bulk_maker = DefectHybridCellOptMaker(
                     name="bulk hybrid relax", transformations=None,
-                    initialize_with_pbe=self.initialize_with_pbe, 
+                    initialize_with_pbe=self.initialize_with_pbe,
                     hybrid_functional=self.hybrid_functional
                     )
             else:
@@ -114,7 +103,7 @@ def __post_init__(self):
 
         elif self.run_bulk == "static":
             if self.hybrid_functional:
-                self.bulk_maker = DefectHybridStaticFlowMaker( 
+                self.bulk_maker = DefectHybridStaticFlowMaker(
                     name='bulk hybrid static',
                     initialize_with_pbe=self.initialize_with_pbe,
                     hybrid_functional=self.hybrid_functional,
@@ -127,20 +116,21 @@ def __post_init__(self):
                 hybrid_functional=self.hybrid_functional,
                 initialize_with_pbe=self.initialize_with_pbe,
             )
-            self.def_maker.initialize_maker.supercell_matrix = self.supercell_matrix
+            self.def_maker.pbe_maker.supercell_matrix = self.supercell_matrix
             self.def_maker.hybrid_maker.supercell_matrix = self.supercell_matrix
 
-            self.def_maker.initialize_maker.max_atoms = self.max_atoms
+            self.def_maker.pbe_maker.max_atoms = self.max_atoms
             self.def_maker.hybrid_maker.max_atoms = self.max_atoms
 
-            self.def_maker.initialize_maker.min_atoms = self.min_atoms
+            self.def_maker.pbe_maker.min_atoms = self.min_atoms
             self.def_maker.hybrid_maker.min_atoms = self.min_atoms
 
-            self.def_maker.initialize_maker.min_length = self.min_length
+            self.def_maker.pbe_maker.min_length = self.min_length
             self.def_maker.hybrid_maker.min_length = self.min_length
 
-            self.def_maker.initialize_maker.force_diagonal = self.force_diagonal
+            self.def_maker.pbe_maker.force_diagonal = self.force_diagonal
             self.def_maker.hybrid_maker.force_diagonal = self.force_diagonal
+
         else:
             self.def_maker = DefectRelaxMaker()
             self.def_maker.supercell_matrix = self.supercell_matrix
@@ -150,13 +140,13 @@ def __post_init__(self):
             self.def_maker.force_diagonal = self.force_diagonal
 
     def make(
-        self, defects: Iterable[Defect], 
-        charges: bool | Iterable[int] = False, 
+        self, defects: Iterable[Defect],
+        charges: bool | Iterable[int] = False,
         dielectric: NDArray | int | float | None = None,
         prev_cp2k_dir: str | Path | None = None,
         collect_outputs: bool = True,
         ):
-        """Make a flow to run multiple defects in order to calculate their formation 
+        """Make a flow to run multiple defects in order to calculate their formation
         energy diagram.
 
         Parameters
@@ -176,8 +166,8 @@ def make(
 
         sc_mat = self.supercell_matrix if self.supercell_matrix else \
                     get_sc_fromstruct(
-                        bulk_structure, self.min_atoms, 
-                        self.max_atoms, self.min_length, 
+                        bulk_structure, self.min_atoms,
+                        self.max_atoms, self.min_length,
                         self.force_diagonal,)
 
         if self.run_bulk:
@@ -192,7 +182,9 @@ def make(
             else:
                 chgs = charges if charges else [0]
             for charge in chgs:
-                defect_job = self.def_maker.make(deepcopy(defect), charge)
+                dfct = deepcopy(defect)
+                dfct.user_charges = [charge]
+                defect_job = self.def_maker.make(dfct)
                 jobs.append(defect_job)
                 defect_outputs[defect.name][int(charge)] = (defect, defect_job.output)
 
@@ -205,7 +197,6 @@ def make(
             jobs.append(collect_job)
         else:
             collect_job = None
-
         return Flow(
             jobs=jobs,
             name=self.name,
@@ -274,4 +265,3 @@ def ensure_defects_same_structure(defects: Iterable[Defect]):
         elif struct != defect.structure:
             raise ValueError("All defects must have the same host structure.")
     return struct
-
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 3147532f23..188e6ecf50 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -13,7 +13,7 @@
 from pymatgen.analysis.defects.core import Defect, Vacancy
 from atomate2.cp2k.sets.base import Cp2kInputGenerator, recursive_update
 from atomate2.cp2k.sets.defect import (
-    DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator, 
+    DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
     DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
 )
 from atomate2.cp2k.jobs.base import BaseCp2kMaker, cp2k_job
@@ -26,7 +26,7 @@
     "store_volumetric_data": ("v_hartree",),
 }
 
-@dataclass 
+@dataclass
 class BaseDefectMaker(BaseCp2kMaker):
 
     task_document_kwargs: dict = field(default_factory=lambda: DEFECT_TASK_DOC)
@@ -37,12 +37,12 @@ class BaseDefectMaker(BaseCp2kMaker):
     force_diagonal: bool = field(default=False)
 
     @cp2k_job
-    def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str | Path | None = None):
+    def make(self, defect: Defect | Structure, prev_cp2k_dir: str | Path | None = None):
         if isinstance(defect, Defect):
 
             structure = defect.get_supercell_structure(
-                sc_mat=self.supercell_matrix, 
-                dummy_species=defect.site.species if isinstance(defect, Vacancy) else None, 
+                sc_mat=self.supercell_matrix,
+                dummy_species=defect.site.species if isinstance(defect, Vacancy) else None,
                 min_atoms=self.min_atoms,
                 max_atoms=self.max_atoms,
                 min_length=self.min_length,
@@ -52,19 +52,26 @@ def make(self, defect: Defect | Structure, charge: int = 0, prev_cp2k_dir: str |
             if isinstance(defect, Vacancy):
                 structure.add_site_property("ghost", [False]*(len(structure.sites)-1) + [True])
 
+            if defect.user_charges:
+                if len(defect.user_charges) > 1:
+                    raise ValueError("Multiple user charges found. Individual defect jobs can only contain 1.")
+                else:
+                    charge = defect.user_charges[0]
+            else:
+                charge = 0
+
             # provenance stuff
             recursive_update(self.write_additional_data, {
                 "info.json": {
-                    "defect": deepcopy(defect), 
-                    "defect_charge": charge, 
+                    "defect": deepcopy(defect),
                     "sc_mat": self.supercell_matrix
                     }
                 }
             )
-            
+
         else:
-            charge = charge if charge else defect.charge
             structure = deepcopy(defect)
+            charge = structure.charge
 
         structure.set_charge(charge)
         return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
@@ -106,29 +113,19 @@ class DefectCellOptMaker(BaseDefectMaker):
     transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
 
 @dataclass
-class DefectHybridStaticMaker(DefectStaticMaker, HybridStaticMaker):
-    
+class DefectHybridStaticMaker(BaseDefectMaker):
+
     name: str = field(default="defect hybrid static")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridStaticSetGenerator)
 
 @dataclass
-class DefectHybridRelaxMaker(DefectRelaxMaker, HybridRelaxMaker):
+class DefectHybridRelaxMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid relax")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridRelaxSetGenerator)
 
 @dataclass
-class DefectHybridCellOptMaker(DefectCellOptMaker, HybridCellOptMaker):
+class DefectHybridCellOptMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid cell opt")
     input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridCellOptSetGenerator)
-
-class GhostVacancy(Vacancy):
-    """Custom override of vacancy to deal with basis set superposition error."""
-
-    @property
-    def defect_structure(self):
-        """Returns the defect structure with the proper oxidation state"""
-        struct = self.structure.copy()
-        struct.add_site_property("ghost", [i == self.defect_site_index for i in range(len(struct))])
-        return struct

From 4d5109530858faa80714ec2d4396c4eb7050a381 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 10 Jan 2023 11:17:32 -0800
Subject: [PATCH 41/50] lint

---
 src/atomate2/cp2k/flows/defect.py | 133 +++++++++++++++++++-----------
 src/atomate2/cp2k/jobs/defect.py  |  95 ++++++++++++++-------
 2 files changed, 154 insertions(+), 74 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 79e5ae27f3..1faefcf425 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -1,57 +1,71 @@
-
 """Flows used in the calculation of defect properties."""
 
 from __future__ import annotations
-from copy import deepcopy
 
 import logging
+from copy import deepcopy
 from dataclasses import dataclass, field
-from typing import Iterable, Literal, Mapping
 from pathlib import Path
-from numpy.typing import NDArray
-import itertools
+from typing import Iterable, Literal, Mapping
 
-from jobflow import Flow, Job, Maker, OutputReference, job
-from pymatgen.core.structure import Structure
-from pymatgen.io.common import VolumetricData
-from pymatgen.entries.computed_entries import ComputedStructureEntry
+from jobflow import Flow, Maker, OutputReference, job
+from numpy.typing import NDArray
 from pymatgen.analysis.defects.core import Defect
-from pymatgen.analysis.defects.thermo import DefectEntry
 from pymatgen.analysis.defects.supercells import get_sc_fromstruct
+from pymatgen.analysis.defects.thermo import DefectEntry
+from pymatgen.entries.computed_entries import ComputedStructureEntry
+from pymatgen.io.common import VolumetricData
 
+from atomate2.cp2k.flows.core import (
+    HybridCellOptFlowMaker,
+    HybridRelaxFlowMaker,
+    HybridStaticFlowMaker,
+)
 from atomate2.cp2k.jobs.base import BaseCp2kMaker
 from atomate2.cp2k.jobs.defect import (
-    DefectStaticMaker, DefectRelaxMaker, DefectCellOptMaker,
-    DefectHybridStaticMaker, DefectHybridRelaxMaker, DefectHybridCellOptMaker
+    DefectCellOptMaker,
+    DefectHybridCellOptMaker,
+    DefectHybridRelaxMaker,
+    DefectHybridStaticMaker,
+    DefectRelaxMaker,
+    DefectStaticMaker,
 )
 
-from atomate2.cp2k.flows.core import HybridStaticFlowMaker, HybridRelaxFlowMaker, HybridCellOptFlowMaker
-
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class DefectHybridStaticFlowMaker(HybridStaticFlowMaker):
 
     pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridStaticMaker(
-        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+    hybrid_maker: BaseCp2kMaker = field(
+        default=DefectHybridStaticMaker(
+            copy_cp2k_kwargs={"additional_cp2k_files": ("info.json",)}
         )
+    )
+
 
 @dataclass
 class DefectHybridRelaxFlowMaker(HybridRelaxFlowMaker):
 
     pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridRelaxMaker(
-        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+    hybrid_maker: BaseCp2kMaker = field(
+        default=DefectHybridRelaxMaker(
+            copy_cp2k_kwargs={"additional_cp2k_files": ("info.json",)}
         )
+    )
+
 
 @dataclass
 class DefectHybridCellOptFlowMaker(HybridCellOptFlowMaker):
 
     pbe_maker: BaseCp2kMaker = field(default_factory=DefectStaticMaker)
-    hybrid_maker: BaseCp2kMaker = field(default=DefectHybridCellOptMaker(
-        copy_cp2k_kwargs={'additional_cp2k_files': ("info.json",)})
+    hybrid_maker: BaseCp2kMaker = field(
+        default=DefectHybridCellOptMaker(
+            copy_cp2k_kwargs={"additional_cp2k_files": ("info.json",)}
         )
+    )
+
 
 # TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
 @dataclass
@@ -91,23 +105,26 @@ class FormationEnergyMaker(Maker):
     force_diagonal: bool = field(default=False)
 
     def __post_init__(self):
-        if self.run_bulk == 'relax':
+        if self.run_bulk == "relax":
             if self.hybrid_functional:
                 self.bulk_maker = DefectHybridCellOptMaker(
-                    name="bulk hybrid relax", transformations=None,
+                    name="bulk hybrid relax",
+                    transformations=None,
                     initialize_with_pbe=self.initialize_with_pbe,
-                    hybrid_functional=self.hybrid_functional
-                    )
+                    hybrid_functional=self.hybrid_functional,
+                )
             else:
-                self.bulk_maker = DefectCellOptMaker(name="bulk relax", transformations=None)
+                self.bulk_maker = DefectCellOptMaker(
+                    name="bulk relax", transformations=None
+                )
 
         elif self.run_bulk == "static":
             if self.hybrid_functional:
                 self.bulk_maker = DefectHybridStaticFlowMaker(
-                    name='bulk hybrid static',
+                    name="bulk hybrid static",
                     initialize_with_pbe=self.initialize_with_pbe,
                     hybrid_functional=self.hybrid_functional,
-                    )
+                )
             else:
                 self.bulk_maker = DefectStaticMaker(name="bulk static")
 
@@ -140,12 +157,13 @@ def __post_init__(self):
             self.def_maker.force_diagonal = self.force_diagonal
 
     def make(
-        self, defects: Iterable[Defect],
+        self,
+        defects: Iterable[Defect],
         charges: bool | Iterable[int] = False,
         dielectric: NDArray | int | float | None = None,
         prev_cp2k_dir: str | Path | None = None,
         collect_outputs: bool = True,
-        ):
+    ):
         """Make a flow to run multiple defects in order to calculate their formation
         energy diagram.
 
@@ -161,19 +179,29 @@ def make(
             The workflow to calculate the formation energy diagram.
         """
         jobs, defect_outputs = [], {}
-        defect_outputs = {defect.name: {} for defect in defects} # TODO DEFECT NAMES ARE NOT UNIQUE HASHES
+        defect_outputs = {
+            defect.name: {} for defect in defects
+        }  # TODO DEFECT NAMES ARE NOT UNIQUE HASHES
         bulk_structure = ensure_defects_same_structure(defects)
 
-        sc_mat = self.supercell_matrix if self.supercell_matrix else \
-                    get_sc_fromstruct(
-                        bulk_structure, self.min_atoms,
-                        self.max_atoms, self.min_length,
-                        self.force_diagonal,)
+        sc_mat = (
+            self.supercell_matrix
+            if self.supercell_matrix
+            else get_sc_fromstruct(
+                bulk_structure,
+                self.min_atoms,
+                self.max_atoms,
+                self.min_length,
+                self.force_diagonal,
+            )
+        )
 
         if self.run_bulk:
             s = bulk_structure.copy()
             s.make_supercell(sc_mat)
-            bulk_job = self.bulk_maker.make(bulk_structure * sc_mat, prev_cp2k_dir=prev_cp2k_dir)
+            bulk_job = self.bulk_maker.make(
+                bulk_structure * sc_mat, prev_cp2k_dir=prev_cp2k_dir
+            )
             jobs.append(bulk_job)
 
         for defect in defects:
@@ -192,8 +220,8 @@ def make(
             collect_job = collect_defect_outputs(
                 defect_outputs=defect_outputs,
                 bulk_output=bulk_job.output if self.run_bulk else None,
-                dielectric=dielectric
-                )
+                dielectric=dielectric,
+            )
             jobs.append(collect_job)
         else:
             collect_job = None
@@ -203,10 +231,13 @@ def make(
             output=jobs[-1].output if collect_job else None,
         )
 
+
 # TODO this is totally code agnostic and should be in common
 @job
 def collect_defect_outputs(
-    defect_outputs: Mapping[str, Mapping[int, OutputReference]], bulk_output: OutputReference, dielectric: NDArray | int | float | None
+    defect_outputs: Mapping[str, Mapping[int, OutputReference]],
+    bulk_output: OutputReference,
+    dielectric: NDArray | int | float | None,
 ) -> dict:
     """Collect all the outputs from the defect calculations.
     This job will combine the structure and entry fields to create a
@@ -222,7 +253,9 @@ def collect_defect_outputs(
     """
     outputs = {"results": {}}
     if not dielectric:
-        logger.warn("Dielectric constant not provided. Defect formation energies will be uncorrected.")
+        logger.warn(
+            "Dielectric constant not provided. Defect formation energies will be uncorrected."
+        )
     for defect_name, defects_with_charges in defect_outputs.items():
         defect_entries = []
         fnv_plots = {}
@@ -232,21 +265,29 @@ def collect_defect_outputs(
             defect_entry = DefectEntry(
                 defect=defect,
                 charge_state=charge,
-                sc_entry=ComputedStructureEntry(structure=bulk_output.structure, energy=output_with_charge.output.energy - bulk_output.output.energy)
+                sc_entry=ComputedStructureEntry(
+                    structure=bulk_output.structure,
+                    energy=output_with_charge.output.energy - bulk_output.output.energy,
+                ),
             )
             defect_entries.append(defect_entry)
             plot_data = defect_entry.get_freysoldt_correction(
-                defect_locpot=VolumetricData.from_dict(output_with_charge.cp2k_objects['v_hartree']),
-                bulk_locpot=VolumetricData.from_dict(output_with_charge.cp2k_objects['v_hartree']),
-                dielectric=dielectric
-                )
+                defect_locpot=VolumetricData.from_dict(
+                    output_with_charge.cp2k_objects["v_hartree"]
+                ),
+                bulk_locpot=VolumetricData.from_dict(
+                    output_with_charge.cp2k_objects["v_hartree"]
+                ),
+                dielectric=dielectric,
+            )
             fnv_plots[int(charge)] = plot_data
         outputs["results"][defect.name] = dict(
             defect=defect, defect_entries=defect_entries, fnv_plots=fnv_plots
         )
     return outputs
 
-#TODO should be in common
+
+# TODO should be in common
 def ensure_defects_same_structure(defects: Iterable[Defect]):
     """Ensure that the defects are valid.
     Parameters
diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 188e6ecf50..04898f46a0 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -3,21 +3,25 @@
 from __future__ import annotations
 
 import logging
-from pathlib import Path
-from dataclasses import dataclass, field
 from copy import deepcopy
-from tkinter import W
-from numpy.typing import NDArray
+from dataclasses import dataclass, field
+from pathlib import Path
 
-from pymatgen.core import Structure
+from numpy.typing import NDArray
 from pymatgen.analysis.defects.core import Defect, Vacancy
+from pymatgen.core import Structure
+
+from atomate2.cp2k.jobs.base import BaseCp2kMaker, cp2k_job
 from atomate2.cp2k.sets.base import Cp2kInputGenerator, recursive_update
 from atomate2.cp2k.sets.defect import (
-    DefectSetGenerator, DefectStaticSetGenerator, DefectRelaxSetGenerator, DefectCellOptSetGenerator,
-    DefectHybridStaticSetGenerator, DefectHybridRelaxSetGenerator, DefectHybridCellOptSetGenerator
+    DefectCellOptSetGenerator,
+    DefectHybridCellOptSetGenerator,
+    DefectHybridRelaxSetGenerator,
+    DefectHybridStaticSetGenerator,
+    DefectRelaxSetGenerator,
+    DefectSetGenerator,
+    DefectStaticSetGenerator,
 )
-from atomate2.cp2k.jobs.base import BaseCp2kMaker, cp2k_job
-from atomate2.cp2k.jobs.core import HybridStaticMaker, HybridRelaxMaker, HybridCellOptMaker
 
 logger = logging.getLogger(__name__)
 
@@ -26,6 +30,7 @@
     "store_volumetric_data": ("v_hartree",),
 }
 
+
 @dataclass
 class BaseDefectMaker(BaseCp2kMaker):
 
@@ -42,7 +47,9 @@ def make(self, defect: Defect | Structure, prev_cp2k_dir: str | Path | None = No
 
             structure = defect.get_supercell_structure(
                 sc_mat=self.supercell_matrix,
-                dummy_species=defect.site.species if isinstance(defect, Vacancy) else None,
+                dummy_species=defect.site.species
+                if isinstance(defect, Vacancy)
+                else None,
                 min_atoms=self.min_atoms,
                 max_atoms=self.max_atoms,
                 min_length=self.min_length,
@@ -50,23 +57,29 @@ def make(self, defect: Defect | Structure, prev_cp2k_dir: str | Path | None = No
             )
 
             if isinstance(defect, Vacancy):
-                structure.add_site_property("ghost", [False]*(len(structure.sites)-1) + [True])
+                structure.add_site_property(
+                    "ghost", [False] * (len(structure.sites) - 1) + [True]
+                )
 
             if defect.user_charges:
                 if len(defect.user_charges) > 1:
-                    raise ValueError("Multiple user charges found. Individual defect jobs can only contain 1.")
+                    raise ValueError(
+                        "Multiple user charges found. Individual defect jobs can only contain 1."
+                    )
                 else:
                     charge = defect.user_charges[0]
             else:
                 charge = 0
 
             # provenance stuff
-            recursive_update(self.write_additional_data, {
-                "info.json": {
-                    "defect": deepcopy(defect),
-                    "sc_mat": self.supercell_matrix
+            recursive_update(
+                self.write_additional_data,
+                {
+                    "info.json": {
+                        "defect": deepcopy(defect),
+                        "sc_mat": self.supercell_matrix,
                     }
-                }
+                },
             )
 
         else:
@@ -74,7 +87,10 @@ def make(self, defect: Defect | Structure, prev_cp2k_dir: str | Path | None = No
             charge = structure.charge
 
         structure.set_charge(charge)
-        return super().make.original(self, structure=structure, prev_cp2k_dir=prev_cp2k_dir)
+        return super().make.original(
+            self, structure=structure, prev_cp2k_dir=prev_cp2k_dir
+        )
+
 
 @dataclass
 class DefectStaticMaker(BaseDefectMaker):
@@ -82,7 +98,8 @@ class DefectStaticMaker(BaseDefectMaker):
     name: str = field(default="defect static")
     input_set_generator: DefectSetGenerator = field(
         default_factory=DefectStaticSetGenerator
-        )
+    )
+
 
 @dataclass
 class DefectRelaxMaker(BaseDefectMaker):
@@ -94,9 +111,16 @@ class DefectRelaxMaker(BaseDefectMaker):
     """
 
     name: str = field(default="defect relax")
-    input_set_generator: Cp2kInputGenerator = field(default_factory=DefectRelaxSetGenerator)
-    transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
-    transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
+    input_set_generator: Cp2kInputGenerator = field(
+        default_factory=DefectRelaxSetGenerator
+    )
+    transformations: tuple[str, ...] = field(
+        default=("PerturbStructureTransformation",)
+    )
+    transformation_params: tuple[dict, ...] | None = field(
+        default=({"distance": 0.01},)
+    )
+
 
 @dataclass
 class DefectCellOptMaker(BaseDefectMaker):
@@ -108,24 +132,39 @@ class DefectCellOptMaker(BaseDefectMaker):
     """
 
     name: str = field(default="defect relax")
-    input_set_generator: Cp2kInputGenerator = field(default_factory=DefectCellOptSetGenerator)
-    transformations: tuple[str, ...] = field(default=("PerturbStructureTransformation",))
-    transformation_params: tuple[dict, ...] | None = field(default=({"distance": 0.01},))
+    input_set_generator: Cp2kInputGenerator = field(
+        default_factory=DefectCellOptSetGenerator
+    )
+    transformations: tuple[str, ...] = field(
+        default=("PerturbStructureTransformation",)
+    )
+    transformation_params: tuple[dict, ...] | None = field(
+        default=({"distance": 0.01},)
+    )
+
 
 @dataclass
 class DefectHybridStaticMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid static")
-    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridStaticSetGenerator)
+    input_set_generator: DefectSetGenerator = field(
+        default_factory=DefectHybridStaticSetGenerator
+    )
+
 
 @dataclass
 class DefectHybridRelaxMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid relax")
-    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridRelaxSetGenerator)
+    input_set_generator: DefectSetGenerator = field(
+        default_factory=DefectHybridRelaxSetGenerator
+    )
+
 
 @dataclass
 class DefectHybridCellOptMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid cell opt")
-    input_set_generator: DefectSetGenerator = field(default_factory=DefectHybridCellOptSetGenerator)
+    input_set_generator: DefectSetGenerator = field(
+        default_factory=DefectHybridCellOptSetGenerator
+    )

From 7317dab4d93ed52d1f8c784189f0335393bcafc4 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Thu, 12 Jan 2023 10:04:45 -0800
Subject: [PATCH 42/50] Add hybrid_functional option to hybrid defect makers

---
 src/atomate2/cp2k/jobs/defect.py | 33 ++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 04898f46a0..5637f79504 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -147,24 +147,57 @@ class DefectCellOptMaker(BaseDefectMaker):
 class DefectHybridStaticMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid static")
+    hybrid_functional: str = "PBE0"
     input_set_generator: DefectSetGenerator = field(
         default_factory=DefectHybridStaticSetGenerator
     )
 
+    def __post_init__(self):
+        """Update the input settings with hybrid_functional attribute"""
+        self.input_set_generator.user_input_settings.update(
+            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
+        )
+
 
 @dataclass
 class DefectHybridRelaxMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid relax")
+    hybrid_functional: str = "PBE0"
     input_set_generator: DefectSetGenerator = field(
         default_factory=DefectHybridRelaxSetGenerator
     )
+    transformations: tuple[str, ...] = field(
+        default=("PerturbStructureTransformation",)
+    )
+    transformation_params: tuple[dict, ...] | None = field(
+        default=({"distance": 0.01},)
+    )
+
+    def __post_init__(self):
+        """Update the input settings with hybrid_functional attribute"""
+        self.input_set_generator.user_input_settings.update(
+            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
+        )
 
 
 @dataclass
 class DefectHybridCellOptMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid cell opt")
+    hybrid_functional: str = "PBE0"
     input_set_generator: DefectSetGenerator = field(
         default_factory=DefectHybridCellOptSetGenerator
     )
+    transformations: tuple[str, ...] = field(
+        default=("PerturbStructureTransformation",)
+    )
+    transformation_params: tuple[dict, ...] | None = field(
+        default=({"distance": 0.01},)
+    )
+
+    def __post_init__(self):
+        """Update the input settings with hybrid_functional attribute"""
+        self.input_set_generator.user_input_settings.update(
+            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
+        )
\ No newline at end of file

From 7e79c82f5fa4caf5b921dd5e7c3c0dcd65af08c2 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Mon, 16 Jan 2023 18:00:00 -0800
Subject: [PATCH 43/50] Add missing newline at end of file

---
 src/atomate2/cp2k/jobs/defect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 5637f79504..39a4542679 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -200,4 +200,4 @@ def __post_init__(self):
         """Update the input settings with hybrid_functional attribute"""
         self.input_set_generator.user_input_settings.update(
             {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
-        )
\ No newline at end of file
+        )

From 8f8e86fc49808bb448cafde83437e18f749f1c04 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 17 Jan 2023 11:06:59 -0800
Subject: [PATCH 44/50] trim

---
 src/atomate2/cp2k/builders/defect.py | 547 +++++++++++++++++----------
 src/atomate2/cp2k/schemas/defect.py  | 151 +++++---
 2 files changed, 430 insertions(+), 268 deletions(-)

diff --git a/src/atomate2/cp2k/builders/defect.py b/src/atomate2/cp2k/builders/defect.py
index 9f2ccce408..cc829b1d84 100644
--- a/src/atomate2/cp2k/builders/defect.py
+++ b/src/atomate2/cp2k/builders/defect.py
@@ -3,25 +3,20 @@
 from typing import Dict, Iterator, List, Literal, Optional
 
 import numpy as np
-from monty.json import MontyDecoder, jsanitize
-
+from emmet.core.electronic_structure import ElectronicStructureDoc
+from emmet.core.material import MaterialsDoc
 from maggma.builders import Builder
 from maggma.stores import Store
 from maggma.utils import grouper
-
-from pymatgen.core import Structure
+from monty.json import MontyDecoder, jsanitize
 from pymatgen.analysis.structure_matcher import ElementComparator, StructureMatcher
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
+from pymatgen.core import Structure
 from pymatgen.io.cp2k.inputs import Cp2kInput
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 
-from emmet.core.material import MaterialsDoc
-
-from atomate2.settings import Atomate2Settings
-from atomate2.cp2k.schemas.task import TaskDocument
-from atomate2.cp2k.schemas.defect import DefectDoc, DefectiveMaterialDoc
 from atomate2.cp2k.schemas.calc_types import TaskType
-
-from emmet.core.electronic_structure import ElectronicStructureDoc
+from atomate2.cp2k.schemas.defect import DefectDoc, DefectiveMaterialDoc
+from atomate2.settings import Atomate2Settings
 
 __author__ = "Nicholas Winner <nwinner@berkeley.edu>"
 
@@ -50,15 +45,15 @@ class DefectBuilder(Builder):
         6.) Update the defect store
     """
 
-    #TODO how to incorporate into settings?
+    # TODO how to incorporate into settings?
     DEFAULT_ALLOWED_DFCT_TASKS = [
-            TaskType.Structure_Optimization.value, 
-            ]
+        TaskType.Structure_Optimization.value,
+    ]
 
     DEFAULT_ALLOWED_BULK_TASKS = [
-            TaskType.Structure_Optimization.value, 
-            TaskType.Static.value
-            ]
+        TaskType.Structure_Optimization.value,
+        TaskType.Static.value,
+    ]
 
     def __init__(
         self,
@@ -72,8 +67,10 @@ def __init__(
         query: Optional[Dict] = None,
         bulk_query: Optional[Dict] = None,
         allowed_dfct_types: Optional[List[str]] = DEFAULT_ALLOWED_DFCT_TASKS,
-        allowed_bulk_types: Optional[List[str]] = DEFAULT_ALLOWED_BULK_TASKS, 
-        task_schema: Literal["cp2k"] = "cp2k", # TODO cp2k specific right now, but this will go in common eventually
+        allowed_bulk_types: Optional[List[str]] = DEFAULT_ALLOWED_BULK_TASKS,
+        task_schema: Literal[
+            "cp2k"
+        ] = "cp2k",  # TODO cp2k specific right now, but this will go in common eventually
         settings: Dict | None = None,
         **kwargs,
     ):
@@ -100,11 +97,15 @@ def __init__(
         self.electronic_structure = electronic_structure
         self.electrostatic_potentials = electrostatic_potentials
         self.task_validation = task_validation
-        self._allowed_dfct_types = allowed_dfct_types #TODO How to incorporate into getitems?
-        self._allowed_bulk_types = allowed_bulk_types #TODO How to incorporate into getitems?
+        self._allowed_dfct_types = (
+            allowed_dfct_types  # TODO How to incorporate into getitems?
+        )
+        self._allowed_bulk_types = (
+            allowed_bulk_types  # TODO How to incorporate into getitems?
+        )
 
         settings = settings if settings else {}
-        self.settings = Atomate2Settings(**settings) # TODO don't think this is right 
+        self.settings = Atomate2Settings(**settings)  # TODO don't think this is right
         self.query = query if query else {}
         self.bulk_query = bulk_query if bulk_query else {}
         self.timestamp = None
@@ -113,33 +114,49 @@ def __init__(
         self.kwargs = kwargs
 
         # TODO Long term, schemas should be part of the matching and grouping process so that a builder can be run on a mixture
-        self.query.update({'output.@module': f"atomate2.{self.task_schema}.schemas.task", "output.@class": "TaskDocument"})
-        self.bulk_query.update({'output.@module': f"atomate2.{self.task_schema}.schemas.task", "output.@class": "TaskDocument"})
-        self._defect_query = 'output.additional_json.info.defect'
+        self.query.update(
+            {
+                "output.@module": f"atomate2.{self.task_schema}.schemas.task",
+                "output.@class": "TaskDocument",
+            }
+        )
+        self.bulk_query.update(
+            {
+                "output.@module": f"atomate2.{self.task_schema}.schemas.task",
+                "output.@class": "TaskDocument",
+            }
+        )
+        self._defect_query = "output.additional_json.info.defect"
 
         self._required_defect_properties = [
             self._defect_query,
             self.tasks.key,
-            'output.output.energy',
-            'output.output.structure',
-            'output.input',
-            'output.nsites',
-            'output.cp2k_objects.v_hartree',
-        ] 
+            "output.output.energy",
+            "output.output.structure",
+            "output.input",
+            "output.nsites",
+            "output.cp2k_objects.v_hartree",
+        ]
 
         self._required_bulk_properties = [
             self.tasks.key,
-            'output.output.energy',
-            'output.output.structure',
-            'output.input',
-            'output.cp2k_objects.v_hartree',
-            'output.output.vbm',
-        ] 
+            "output.output.energy",
+            "output.output.structure",
+            "output.input",
+            "output.cp2k_objects.v_hartree",
+            "output.output.vbm",
+        ]
 
         self._optional_defect_properties = []
         self._optional_bulk_properties = []
 
-        sources = [tasks, dielectric, electronic_structure, materials, electrostatic_potentials]
+        sources = [
+            tasks,
+            dielectric,
+            electronic_structure,
+            materials,
+            electrostatic_potentials,
+        ]
         if self.task_validation:
             sources.append(self.task_validation)
         super().__init__(sources=sources, targets=[defects], **kwargs)
@@ -151,7 +168,7 @@ def defect_query(self) -> str:
         """
         return self._defect_query
 
-    #TODO Hartree pot should be required but only for charged defects
+    # TODO Hartree pot should be required but only for charged defects
     @property
     def required_defect_properties(self) -> List:
         """
@@ -191,7 +208,7 @@ def allowed_dfct_types(self) -> set:
     @property
     def allowed_bulk_types(self) -> set:
         return {TaskType(t) for t in self._allowed_bulk_types}
-    
+
     def ensure_indexes(self):
         """
         Ensures indicies on the tasks and materials collections
@@ -201,7 +218,7 @@ def ensure_indexes(self):
         self.tasks.ensure_index(self.tasks.key)
         self.tasks.ensure_index("output.last_updated")
         self.tasks.ensure_index("output.state")
-        self.tasks.ensure_index("output.formula_pretty") # TODO is necessary?
+        self.tasks.ensure_index("output.formula_pretty")  # TODO is necessary?
 
         # Search index for materials
         self.materials.ensure_index("material_id")
@@ -219,7 +236,7 @@ def ensure_indexes(self):
 
     def prechunk(self, number_splits: int) -> Iterator[Dict]:
 
-        tag_query = {} 
+        tag_query = {}
         if len(self.settings.BUILD_TAGS) > 0 and len(self.settings.EXCLUDED_TAGS) > 0:
             tag_query["$and"] = [
                 {"tags": {"$in": self.settings.BUILD_TAGS}},
@@ -231,21 +248,29 @@ def prechunk(self, number_splits: int) -> Iterator[Dict]:
         # Get defect tasks
         temp_query = self.query.copy()
         temp_query.update(tag_query)
-        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
-        temp_query.update({self.defect_query: {'$exists': True}, "state": "successful"})
+        temp_query.update(
+            {d: {"$exists": True, "$ne": None} for d in self.required_defect_properties}
+        )
+        temp_query.update({self.defect_query: {"$exists": True}, "state": "successful"})
         defect_tasks = {
             doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+            for doc in self.tasks.query(
+                criteria=temp_query, properties=[self.tasks.key]
+            )
         }
 
         # Get bulk tasks
         temp_query = self.bulk_query.copy()
         temp_query.update(tag_query)
-        temp_query.update({d: {'$exists': True} for d in self.required_bulk_properties})
-        temp_query.update({self.defect_query: {'$exists': False}, "state": "successful"})
+        temp_query.update({d: {"$exists": True} for d in self.required_bulk_properties})
+        temp_query.update(
+            {self.defect_query: {"$exists": False}, "state": "successful"}
+        )
         bulk_tasks = {
             doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+            for doc in self.tasks.query(
+                criteria=temp_query, properties=[self.tasks.key]
+            )
         }
 
         N = np.ceil(len(defect_tasks) / number_splits)
@@ -263,9 +288,9 @@ def get_items(self) -> Iterator[List[Dict]]:
             1. Get all tasks with standard "defect" query tag
             2. Filter all tasks by skipping tasks which are already in the Defect Store
             3. Get all tasks that could be used as bulk
-            4. Filter all bulks which do not have corresponding Dielectric and 
+            4. Filter all bulks which do not have corresponding Dielectric and
                ElectronicStructure data (if a band gap exists for that task).
-            5. Group defect tasks by defect matching 
+            5. Group defect tasks by defect matching
             6. Given defect object in a group, bundle them with bulk tasks
                identified with structure matching
             7. Yield the item bundles
@@ -274,7 +299,7 @@ def get_items(self) -> Iterator[List[Dict]]:
             Iterator of (defect documents, task bundles)
 
                 The defect document is an existing defect doc to be updated with new data, or None
-            
+
                 task bundles bundle are all the tasks that correspond to the same defect and all possible
                 bulk tasks that could be matched to them.
         """
@@ -297,46 +322,64 @@ def get_items(self) -> Iterator[List[Dict]]:
 
         ##### Get defect tasks #####
         temp_query = self.query.copy()
-        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_defect_properties})
-        temp_query.update({self.defect_query: {'$exists': True}, "output.state": "successful"})
+        temp_query.update(
+            {d: {"$exists": True, "$ne": None} for d in self.required_defect_properties}
+        )
+        temp_query.update(
+            {self.defect_query: {"$exists": True}, "output.state": "successful"}
+        )
         defect_tasks = {
             doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+            for doc in self.tasks.query(
+                criteria=temp_query, properties=[self.tasks.key]
+            )
         }
 
         # TODO Seems slow
         not_allowed = {
-            doc[self.tasks.key] 
+            doc[self.tasks.key]
             for doc in self.tasks.query(
-                criteria={self.tasks.key: {"$in": list(defect_tasks)}}, 
-                properties=['output.calcs_reversed']
-                )
-            if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_dfct_types
+                criteria={self.tasks.key: {"$in": list(defect_tasks)}},
+                properties=["output.calcs_reversed"],
+            )
+            if TaskType(doc["output"]["calcs_reversed"][0]["task_type"])
+            not in self.allowed_dfct_types
         }
         if not_allowed:
-            self.logger.debug(f"{len(not_allowed)} defect tasks dropped. Not allowed TaskType")
+            self.logger.debug(
+                f"{len(not_allowed)} defect tasks dropped. Not allowed TaskType"
+            )
         defect_tasks = defect_tasks - not_allowed
 
         ##### Get bulk tasks #####
         temp_query = self.bulk_query.copy()
-        temp_query.update({d: {'$exists': True, "$ne": None} for d in self.required_bulk_properties})
-        temp_query.update({self.defect_query: {'$exists': False}, "output.state": "successful"})
+        temp_query.update(
+            {d: {"$exists": True, "$ne": None} for d in self.required_bulk_properties}
+        )
+        temp_query.update(
+            {self.defect_query: {"$exists": False}, "output.state": "successful"}
+        )
         bulk_tasks = {
             doc[self.tasks.key]
-            for doc in self.tasks.query(criteria=temp_query, properties=[self.tasks.key])
+            for doc in self.tasks.query(
+                criteria=temp_query, properties=[self.tasks.key]
+            )
         }
-        
+
         # TODO seems slow
         not_allowed = {
-            doc[self.tasks.key] 
+            doc[self.tasks.key]
             for doc in self.tasks.query(
                 criteria={self.tasks.key: {"$in": list(bulk_tasks)}},
-                properties=['output.calcs_reversed']
-                )
-            if TaskType(doc['output']['calcs_reversed'][0]['task_type']) not in self.allowed_bulk_types
+                properties=["output.calcs_reversed"],
+            )
+            if TaskType(doc["output"]["calcs_reversed"][0]["task_type"])
+            not in self.allowed_bulk_types
         }
         if not_allowed:
-            self.logger.debug(f"{len(not_allowed)} bulk tasks dropped. Not allowed TaskType")
+            self.logger.debug(
+                f"{len(not_allowed)} bulk tasks dropped. Not allowed TaskType"
+            )
         bulk_tasks = bulk_tasks - not_allowed
 
         # TODO Not the same validation behavior as material builders?
@@ -344,9 +387,7 @@ def get_items(self) -> Iterator[List[Dict]]:
         if self.task_validation:
             validated = {
                 doc[self.tasks.key]
-                for doc in self.task_validation.query(
-                    {}, [self.task_validation.key]        
-                )
+                for doc in self.task_validation.query({}, [self.task_validation.key])
             }
 
             defect_tasks = defect_tasks.intersection(validated)
@@ -358,7 +399,7 @@ def get_items(self) -> Iterator[List[Dict]]:
                     {"is_valid": False}, [self.task_validation.key]
                 )
             }
-            self.logger.info("Removing {} invalid tasks".format(len(invalid_ids)))
+            self.logger.info(f"Removing {len(invalid_ids)} invalid tasks")
             defect_tasks = defect_tasks - invalid_ids
             bulk_tasks = bulk_tasks - invalid_ids
 
@@ -369,11 +410,11 @@ def get_items(self) -> Iterator[List[Dict]]:
         }
         all_tasks = defect_tasks | bulk_tasks
 
-        self.logger.debug("All tasks: {}".format(len(all_tasks)))
-        self.logger.debug("Bulk tasks before filter: {}".format(len(bulk_tasks)))
+        self.logger.debug(f"All tasks: {len(all_tasks)}")
+        self.logger.debug(f"Bulk tasks before filter: {len(bulk_tasks)}")
         bulk_tasks = set(filter(self.__preprocess_bulk, bulk_tasks))
-        self.logger.debug("Bulk tasks after filter: {}".format(len(bulk_tasks)))
-        self.logger.debug("All defect tasks: {}".format(len(defect_tasks)))
+        self.logger.debug(f"Bulk tasks after filter: {len(bulk_tasks)}")
+        self.logger.debug(f"All defect tasks: {len(defect_tasks)}")
         unprocessed_defect_tasks = defect_tasks - processed_defect_tasks
 
         if not unprocessed_defect_tasks:
@@ -383,8 +424,12 @@ def get_items(self) -> Iterator[List[Dict]]:
             self.logger.info("No compatible bulk calculations. Exiting.")
             return
 
-        self.logger.info(f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks")
-        self.logger.info(f"Found {len(bulk_tasks)} bulk tasks with dielectric properties")
+        self.logger.info(
+            f"Found {len(unprocessed_defect_tasks)} unprocessed defect tasks"
+        )
+        self.logger.info(
+            f"Found {len(bulk_tasks)} bulk tasks with dielectric properties"
+        )
 
         # Set total for builder bars to have a total
         self.total = len(unprocessed_defect_tasks)
@@ -392,10 +437,12 @@ def get_items(self) -> Iterator[List[Dict]]:
         # yield list of defects that are of the same type, matched to an appropriate bulk calc
         self.logger.info(f"Starting defect matching.")
 
-        for defect, defect_task_group in self.__filter_and_group_tasks(unprocessed_defect_tasks):
+        for defect, defect_task_group in self.__filter_and_group_tasks(
+            unprocessed_defect_tasks
+        ):
             task_ids = self.__match_defects_to_bulks(bulk_tasks, defect_task_group)
             if not task_ids:
-                continue 
+                continue
             doc = self.__get_defect_doc(defect)
             if doc:
                 self.logger.info(f"DOC IS {doc.defect.__repr__()}")
@@ -416,18 +463,34 @@ def process_item(self, items):
         returns: the defect document as a dictionary
         """
         defect_doc, item_bundle, material_id, task_ids = items
-        self.logger.info(f"Processing group of {len(item_bundle)} defects into DefectDoc")
+        self.logger.info(
+            f"Processing group of {len(item_bundle)} defects into DefectDoc"
+        )
         if item_bundle:
             for _, (defect_task, bulk_task, dielectric) in item_bundle.items():
                 if not defect_doc:
                     defect_doc = DefectDoc.from_tasks(
-                        defect_task=defect_task, bulk_task=bulk_task, dielectric=dielectric,
-                        query=self.defect_query, key=self.tasks.key, material_id=material_id
-                        )
+                        defect_task=defect_task,
+                        bulk_task=bulk_task,
+                        dielectric=dielectric,
+                        query=self.defect_query,
+                        key=self.tasks.key,
+                        material_id=material_id,
+                    )
                 else:
-                    defect_doc.update_one(defect_task, bulk_task, dielectric, query=self.defect_query, key=self.tasks.key) # TODO Atomate2Store wrapper
-                defect_doc.task_ids = list(set(task_ids + defect_doc.task_ids)) # TODO should I store the bulk id too?
-            return jsanitize(defect_doc.dict(), allow_bson=True, enum_values=True, strict=True)
+                    defect_doc.update_one(
+                        defect_task,
+                        bulk_task,
+                        dielectric,
+                        query=self.defect_query,
+                        key=self.tasks.key,
+                    )  # TODO Atomate2Store wrapper
+                defect_doc.task_ids = list(
+                    set(task_ids + defect_doc.task_ids)
+                )  # TODO should I store the bulk id too?
+            return jsanitize(
+                defect_doc.dict(), allow_bson=True, enum_values=True, strict=True
+            )
         return {}
 
     def update_targets(self, items):
@@ -443,10 +506,10 @@ def update_targets(self, items):
                 item.update({"_bt": self.timestamp})
                 self.defects.remove_docs(
                     {
-                       "task_ids": item['task_ids'],
+                        "task_ids": item["task_ids"],
                     }
                 )
-            self.defects.update(items, key='task_ids')
+            self.defects.update(items, key="task_ids")
         else:
             self.logger.info("No items to update")
 
@@ -463,37 +526,36 @@ def __filter_and_group_tasks(self, tasks):
             [ (defect, [task_ids] ), ...] where task_ids correspond to the same defect
         """
 
-        props = [
-            self.defect_query,
-            self.tasks.key,
-            'output.structure'
-        ]
+        props = [self.defect_query, self.tasks.key, "output.structure"]
 
         self.logger.debug(f"Finding equivalent tasks for {len(tasks)} defects")
 
-        sm = StructureMatcher(allow_subset=False) #TODO build settings
+        sm = StructureMatcher(allow_subset=False)  # TODO build settings
         defects = [
             {
-                self.tasks.key: t[self.tasks.key], 'defect': self.__get_defect_from_task(t),
-                'structure': Structure.from_dict(t['output']['structure'])
+                self.tasks.key: t[self.tasks.key],
+                "defect": self.__get_defect_from_task(t),
+                "structure": Structure.from_dict(t["output"]["structure"]),
             }
-            for t in self.tasks.query(criteria={self.tasks.key: {'$in': list(tasks)}}, properties=props)
+            for t in self.tasks.query(
+                criteria={self.tasks.key: {"$in": list(tasks)}}, properties=props
+            )
         ]
         for d in defects:
             # TODO remove oxidation state because spins/oxidation cause errors in comparison.
             #  but they shouldnt if those props are close in value
-            d['structure'].remove_oxidation_states()
-            d['defect'].user_charges = [d['structure'].charge]
+            d["structure"].remove_oxidation_states()
+            d["defect"].user_charges = [d["structure"].charge]
 
         def key(x):
-            s = x['defect'].structure
+            s = x["defect"].structure
             return get_sg(s), s.composition.reduced_composition
 
         def are_equal(x, y):
             """To decide if defects are equal."""
-            if x['structure'].charge != y['structure'].charge:
+            if x["structure"].charge != y["structure"].charge:
                 return False
-            if x['defect'] == y['defect']:
+            if x["defect"] == y["defect"]:
                 return True
             return False
 
@@ -506,11 +568,21 @@ def are_equal(x, y):
             while len(unmatched) > 0:
                 i, refs = unmatched.pop(0)
                 matches = [i]
-                inds = list(filter(lambda j: are_equal(refs, unmatched[j][1]), list(range(len(unmatched)))))
+                inds = list(
+                    filter(
+                        lambda j: are_equal(refs, unmatched[j][1]),
+                        list(range(len(unmatched))),
+                    )
+                )
                 matches.extend([unmatched[i][0] for i in inds])
-                unmatched = [unmatched[i] for i in range(len(unmatched)) if i not in inds]
+                unmatched = [
+                    unmatched[i] for i in range(len(unmatched)) if i not in inds
+                ]
                 all_groups.append(
-                    (defects[i]['defect'], [defects[i][self.tasks.key] for i in matches])
+                    (
+                        defects[i]["defect"],
+                        [defects[i][self.tasks.key] for i in matches],
+                    )
                 )
 
         self.logger.debug(f"{len(all_groups)} groups")
@@ -532,7 +604,9 @@ def __get_defect_doc(self, defect):
         material_id = self._get_mpid(defect.structure)
         docs = [
             DefectDoc(**doc)
-            for doc in self.defects.query(criteria={'material_id': material_id}, properties=None)
+            for doc in self.defects.query(
+                criteria={"material_id": material_id}, properties=None
+            )
         ]
         for doc in docs:
             if self.__defect_match(defect, doc.defect):
@@ -546,8 +620,9 @@ def __defect_match(self, x, y):
             return False
 
         # Elem. changes needed to distinguish ghost vacancies
-        if x.element_changes == y.element_changes and \
-                sm.fit(x.defect_structure, y.defect_structure):
+        if x.element_changes == y.element_changes and sm.fit(
+            x.defect_structure, y.defect_structure
+        ):
             return True
 
         return False
@@ -559,11 +634,13 @@ def __get_dielectric(self, key):
         and retrieve the total dielectric tensor for defect analysis. If no dielectric exists, as would
         be the case for metallic systems, return None.
         """
-        for diel in self.dielectric.query(criteria={"material_id": key}, properties=['total']):
-            return diel['total']
+        for diel in self.dielectric.query(
+            criteria={"material_id": key}, properties=["total"]
+        ):
+            return diel["total"]
         return None
 
-    #TODO retrieving the electrostatic potential is by far the most expesive part of the builder. Any way to reduce?
+    # TODO retrieving the electrostatic potential is by far the most expesive part of the builder. Any way to reduce?
     def __get_item_bundle(self, task_ids):
         """
         Gets a group of items that can be processed together into a defect document.
@@ -576,11 +653,12 @@ def __get_item_bundle(self, task_ids):
         """
         return {
             rt: (
-                self.tasks.query_one(criteria={self.tasks.key: pairs[0]}, load=True), 
-                self.tasks.query_one(criteria={self.tasks.key: pairs[1]}, load=True), 
-                self.__get_dielectric(self._mpid_map[pairs[1]])
-                ) for rt, pairs in task_ids.items()
-                }
+                self.tasks.query_one(criteria={self.tasks.key: pairs[0]}, load=True),
+                self.tasks.query_one(criteria={self.tasks.key: pairs[1]}, load=True),
+                self.__get_dielectric(self._mpid_map[pairs[1]]),
+            )
+            for rt, pairs in task_ids.items()
+        }
 
     def _get_mpid(self, structure):
         """
@@ -592,17 +670,22 @@ def _get_mpid(self, structure):
 
         returns: material_id, if one exists, else None
         """
-        sga = SpacegroupAnalyzer(structure, symprec=self.settings.SYMPREC) # TODO Add angle tolerance
+        sga = SpacegroupAnalyzer(
+            structure, symprec=self.settings.SYMPREC
+        )  # TODO Add angle tolerance
         mats = self.materials.query(
             criteria={
-                'chemsys': structure.composition.chemical_system,
-            }, properties=['structure', 'material_id']
+                "chemsys": structure.composition.chemical_system,
+            },
+            properties=["structure", "material_id"],
         )
         # TODO coudl more than one material match true?
-        sm = StructureMatcher() # TODO add tolerances
+        sm = StructureMatcher(
+            primitive_cell=True, comparator=ElementComparator()
+        )  # TODO add tolerances
         for m in mats:
-            if sm.fit(structure, Structure.from_dict(m['structure'])):
-                return m['material_id']
+            if sm.fit(structure, Structure.from_dict(m["structure"])):
+                return m["material_id"]
         return None
 
     def __match_defects_to_bulks(self, bulk_ids, defect_ids) -> list[tuple]:
@@ -620,26 +703,30 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids) -> list[tuple]:
         # TODO mongo projection on array doesn't work (see above)
         props = [
             self.tasks.key,
-            self.defect_query, 
-            'output.input',
-            'output.nsites',
-            'output.output.structure',
-            'output.output.energy',
-            'output.calcs_reversed' 
+            self.defect_query,
+            "output.input",
+            "output.nsites",
+            "output.output.structure",
+            "output.output.energy",
+            "output.calcs_reversed",
         ]
-        defects = list(self.tasks.query(criteria={self.tasks.key: {'$in': list(defect_ids)}}, properties=props))
+        defects = list(
+            self.tasks.query(
+                criteria={self.tasks.key: {"$in": list(defect_ids)}}, properties=props
+            )
+        )
         ps = self.__get_pristine_supercell(defects[0])
-        ps.remove_oxidation_states() # TODO might cause problems
+        ps.remove_oxidation_states()  # TODO might cause problems
         bulks = list(
             self.tasks.query(
                 criteria={
-                    self.tasks.key: {'$in': list(bulk_ids)},
-                    'output.formula_pretty': jsanitize(ps.composition.reduced_formula),
+                    self.tasks.key: {"$in": list(bulk_ids)},
+                    "output.formula_pretty": jsanitize(ps.composition.reduced_formula),
                 },
-                properties=props
+                properties=props,
             )
-        ) 
-        
+        )
+
         pairs = [
             (defect, bulk)
             for bulk in bulks
@@ -649,22 +736,23 @@ def __match_defects_to_bulks(self, bulk_ids, defect_ids) -> list[tuple]:
         self.logger.debug(f"Found {len(pairs)} commensurate bulk/defect pairs")
 
         def key(x):
-            return -x[0]['output']['nsites'], x[0]['output']['output']['energy']
+            return -x[0]["output"]["nsites"], x[0]["output"]["output"]["energy"]
+
         def _run_type(x):
-            return x[0]['output']['calcs_reversed'][0]['run_type']
+            return x[0]["output"]["calcs_reversed"][0]["run_type"]
 
         rt_pairs = {}
         for rt, group in groupby(pairs, key=_run_type):
             rt_pairs[rt] = [
-                (defect[self.tasks.key], bulk[self.tasks.key]) 
+                (defect[self.tasks.key], bulk[self.tasks.key])
                 for defect, bulk in sorted(list(group), key=key)
-                ]
+            ]
 
         # Return only the first (best) pair for each rt
         return {rt: lst[0] for rt, lst in rt_pairs.items()}
 
-    # TODO Checking for same dft settings (e.g. OT/diag) is a little cumbersome. 
-    # Maybe, in future, task doc can be defined to have OT/diag as part of input summary 
+    # TODO Checking for same dft settings (e.g. OT/diag) is a little cumbersome.
+    # Maybe, in future, task doc can be defined to have OT/diag as part of input summary
     # for fast querying
     def __are_bulk_and_defect_commensurate(self, b, d):
         """
@@ -673,41 +761,47 @@ def __are_bulk_and_defect_commensurate(self, b, d):
         Checks for:
             1. Same run type.
             2. Same pristine structures with no supercell reduction
-            3. Compatible DFT settings 
+            3. Compatible DFT settings
         """
         # TODO add settings
         sm = StructureMatcher(
-            ltol = 1e-3,
-            stol = 0.1,
-            angle_tol = 1,
+            ltol=1e-3,
+            stol=0.1,
+            angle_tol=1,
             primitive_cell=False,
             scale=True,
             attempt_supercell=False,
             allow_subset=False,
             comparator=ElementComparator(),
         )
-        rtb = b.get('output').get('input').get('xc').split("+U")[0]
-        rtd = d.get('output').get('input').get('xc').split("+U")[0]
+        rtb = b.get("output").get("input").get("xc").split("+U")[0]
+        rtd = d.get("output").get("input").get("xc").split("+U")[0]
         baux = {
-            dat['element']: dat.get('auxiliary_basis')
-            for dat in b['output']['input']['atomic_kind_info']['atomic_kinds'].values()
-            }
+            dat["element"]: dat.get("auxiliary_basis")
+            for dat in b["output"]["input"]["atomic_kind_info"]["atomic_kinds"].values()
+        }
         daux = {
-            dat['element']: dat.get('auxiliary_basis')
-            for dat in d['output']['input']['atomic_kind_info']['atomic_kinds'].values()
-            }
+            dat["element"]: dat.get("auxiliary_basis")
+            for dat in d["output"]["input"]["atomic_kind_info"]["atomic_kinds"].values()
+        }
 
-        if rtb == rtd: 
-            if sm.fit(self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)):
-                    cib = Cp2kInput.from_dict(b['output']['calcs_reversed'][0]['input']['cp2k_input'])
-                    cid = Cp2kInput.from_dict(d['output']['calcs_reversed'][0]['input']['cp2k_input'])
-                    bis_ot = cib.check("force_eval/dft/scf/ot")
-                    dis_ot = cid.check("force_eval/dft/scf/ot")
-                    if (bis_ot and dis_ot) or (not bis_ot and not dis_ot):
-                        for el in baux:
-                            if baux[el] != daux[el]:
-                                return False
-                        return True
+        if rtb == rtd:
+            if sm.fit(
+                self.__get_pristine_supercell(d), self.__get_pristine_supercell(b)
+            ):
+                cib = Cp2kInput.from_dict(
+                    b["output"]["calcs_reversed"][0]["input"]["cp2k_input"]
+                )
+                cid = Cp2kInput.from_dict(
+                    d["output"]["calcs_reversed"][0]["input"]["cp2k_input"]
+                )
+                bis_ot = cib.check("force_eval/dft/scf/ot")
+                dis_ot = cid.check("force_eval/dft/scf/ot")
+                if (bis_ot and dis_ot) or (not bis_ot and not dis_ot):
+                    for el in baux:
+                        if baux[el] != daux[el]:
+                            return False
+                    return True
         return False
 
     def __preprocess_bulk(self, task):
@@ -720,10 +814,17 @@ def __preprocess_bulk(self, task):
             (3) If bulk is not a metal, electronic structure document must exist in the store
 
         """
-        self.logger.debug("Preprocessing bulk task {}".format(task))
-        t = next(self.tasks.query(criteria={self.tasks.key: task}, properties=['output.output.structure', 'mpid']))
+        self.logger.debug(f"Preprocessing bulk task {task}")
+        t = next(
+            self.tasks.query(
+                criteria={self.tasks.key: task},
+                properties=["output.output.structure", "mpid"],
+            )
+        )
 
-        struc = Structure.from_dict(t.get('output').get('output').get('structure')) # TODO specific to atomate2
+        struc = Structure.from_dict(
+            t.get("output").get("output").get("structure")
+        )  # TODO specific to atomate2
         mpid = self._get_mpid(struc)
         if not mpid:
             self.logger.debug(f"No material id found for bulk task {task}")
@@ -732,18 +833,20 @@ def __preprocess_bulk(self, task):
         self.logger.debug(f"Material ID: {mpid}")
 
         elec = self.electronic_structure.query_one(
-            properties=['band_gap'], criteria={self.electronic_structure.key: mpid}
-            )
+            properties=["band_gap"], criteria={self.electronic_structure.key: mpid}
+        )
         if not elec:
             self.logger.debug(f"Electronic structure data not found for {mpid}")
             return False
 
         # TODO right now pulling dos from electronic structure, should just pull summary document
-        if elec['band_gap'] > 0:
+        if elec["band_gap"] > 0:
             diel = self.__get_dielectric(mpid)
             if not diel:
-                self.logger.info(f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
-                                 f"dielectric properties, but none found in dielectric store")
+                self.logger.info(
+                    f"Task {task} for {mpid} ({struc.composition.reduced_formula}) requires "
+                    f"dielectric properties, but none found in dielectric store"
+                )
                 return False
 
         return True
@@ -762,7 +865,9 @@ def __get_pristine_supercell(self, task):
         new_lattice = Lattice(np.dot(scale_matrix, self._lattice.matrix))
         """
         d = unpack(query=self.defect_query, d=task)
-        out_structure = MontyDecoder().process_decoded(task['output']['output']['structure'])
+        out_structure = MontyDecoder().process_decoded(
+            task["output"]["output"]["structure"]
+        )
         if d:
             defect = MontyDecoder().process_decoded(d)
             s = defect.structure.copy()
@@ -785,12 +890,12 @@ class DefectiveMaterialBuilder(Builder):
     """
 
     def __init__(
-            self,
-            defects: Store,
-            defect_thermos: Store,
-            materials: Store,
-            query: Optional[Dict] = None,
-            **kwargs,
+        self,
+        defects: Store,
+        defect_thermos: Store,
+        materials: Store,
+        query: Optional[Dict] = None,
+        **kwargs,
     ):
         """
         Args:
@@ -809,7 +914,9 @@ def __init__(
         self.timestamp = None
         self.kwargs = kwargs
 
-        super().__init__(sources=[defects, materials], targets=[defect_thermos], **kwargs)
+        super().__init__(
+            sources=[defects, materials], targets=[defect_thermos], **kwargs
+        )
 
     def ensure_indexes(self):
         """
@@ -848,19 +955,23 @@ def get_items(self) -> Iterator[List[Dict]]:
         temp_query = dict(self.query)
         temp_query["state"] = "successful"
 
-        #unprocessed_defect_tasks = all_tasks - processed_defect_tasks
+        # unprocessed_defect_tasks = all_tasks - processed_defect_tasks
 
         all_docs = [doc for doc in self.defects.query(self.query)]
 
         self.logger.debug(f"Found {len(all_docs)} defect docs to process")
 
         def filterfunc(x):
-            if not self.materials.query_one(criteria={'material_id': x['material_id']}, properties=None):
-                self.logger.debug(f"No material with MPID={x['material_id']} in the material store")
+            if not self.materials.query_one(
+                criteria={"material_id": x["material_id"]}, properties=None
+            ):
+                self.logger.debug(
+                    f"No material with MPID={x['material_id']} in the material store"
+                )
                 return False
             return True
-            defect = MontyDecoder().process_decoded(x['defect'])
-            for el in defect.element_changes: 
+            defect = MontyDecoder().process_decoded(x["defect"])
+            for el in defect.element_changes:
                 if el not in self.thermo:
                     self.logger.debug(f"No entry for {el} in Thermo Store")
                     return False
@@ -868,10 +979,8 @@ def filterfunc(x):
             return True
 
         for key, group in groupby(
-                filter(
-                    filterfunc,
-                    sorted(all_docs, key=lambda x: x['material_id'])
-                ), key=lambda x: x['material_id']
+            filter(filterfunc, sorted(all_docs, key=lambda x: x["material_id"])),
+            key=lambda x: x["material_id"],
         ):
             try:
                 yield list(group)
@@ -884,7 +993,9 @@ def process_item(self, defects):
         """
         defect_docs = [DefectDoc(**d) for d in defects]
         self.logger.info(f"Processing {len(defect_docs)} defects")
-        defect_thermo_doc = DefectiveMaterialDoc.from_docs(defect_docs, material_id=defect_docs[0].material_id)
+        defect_thermo_doc = DefectiveMaterialDoc.from_docs(
+            defect_docs, material_id=defect_docs[0].material_id
+        )
         return defect_thermo_doc.dict()
 
     def update_targets(self, items):
@@ -917,8 +1028,10 @@ def __get_electronic_structure(self, material_id):
             criteria={self.electronic_structures.key: material_id},
             properties=None,
         )
-        t_id = ElectronicStructureDoc(**dosdoc).dos.total['1'].task_id
-        dos = self.dos.query_one(criteria={'task_id': int(t_id)}, properties=None) #TODO MPID str/int issues
+        t_id = ElectronicStructureDoc(**dosdoc).dos.total["1"].task_id
+        dos = self.dos.query_one(
+            criteria={"task_id": int(t_id)}, properties=None
+        )  # TODO MPID str/int issues
         return dos
 
     def __get_materials(self, key) -> List:
@@ -926,7 +1039,7 @@ def __get_materials(self, key) -> List:
         Given a group of DefectDocs, use the bulk material_id to get materials in the chemsys from the
         materials store.
         """
-        bulk = self.materials.query_one(criteria={'material_id': key}, properties=None)
+        bulk = self.materials.query_one(criteria={"material_id": key}, properties=None)
         if not bulk:
             raise LookupError(
                 f"The bulk material ({key}) for these defects cannot be found in the materials store"
@@ -934,35 +1047,48 @@ def __get_materials(self, key) -> List:
         return MaterialsDoc(**bulk)
 
     def __get_thermos(self, composition) -> List:
-        return list(self.thermo.query(criteria={'elements': {"$size": 1}}, properties=None))
+        return list(
+            self.thermo.query(criteria={"elements": {"$size": 1}}, properties=None)
+        )
 
 
 class DefectValidator(Builder):
-
     def __init__(
-        self, 
-        tasks: Store, 
-        defect_validation: Store, 
+        self,
+        tasks: Store,
+        defect_validation: Store,
         chunk_size: int = 1000,
-        defect_query = 'output.additional_json.info.defect',
-       ):
+        defect_query="output.additional_json.info.defect",
+    ):
         self.tasks = tasks
         self.defect_validation = defect_validation
         self.chunk_size = chunk_size
         self.defect_query = defect_query
-        super().__init__(sources=tasks, targets=defect_validation, chunk_size=chunk_size)
+        super().__init__(
+            sources=tasks, targets=defect_validation, chunk_size=chunk_size
+        )
 
     def get_items(self):
         self.logger.info("Getting tasks")
-        tids = list(self.tasks.query(criteria={self.defect_query: {"$exists": True}}, properties=[self.tasks.key]))
+        tids = list(
+            self.tasks.query(
+                criteria={self.defect_query: {"$exists": True}},
+                properties=[self.tasks.key],
+            )
+        )
         self.logger.info(f"{len(tids)} to process")
-        for t in self.tasks.query():
-            yield t
-    
+        yield from self.tasks.query()
+
     def process_item(self, item):
         from atomate2.cp2k.schemas.defect import DefectValidation
+
         tid = item[self.tasks.key]
-        return jsanitize(DefectValidation.process_task(item, tid).dict(), allow_bson=True, enum_values=True, strict=True)
+        return jsanitize(
+            DefectValidation.process_task(item, tid).dict(),
+            allow_bson=True,
+            enum_values=True,
+            strict=True,
+        )
 
     def update_targets(self, items: List):
         """
@@ -994,8 +1120,9 @@ def unpack(query, d):
                 return unpack(query.split("."), d)
     return unpack(query[1:], d.__getitem__(query.pop(0)))
 
+
 # TODO SHOULD GO IN COMMON
-def get_sg(struc, symprec=.01) -> int:
+def get_sg(struc, symprec=0.01) -> int:
     """helper function to get spacegroup with a loose tolerance"""
     try:
         return struc.get_space_group_info(symprec=symprec)[1]
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 4e6120dfab..800f50dbf4 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -1,35 +1,30 @@
 from datetime import datetime
-from tokenize import group
-from typing import ClassVar, TypeVar, Type, Dict, Tuple, Mapping, List, Callable
-from pydantic import BaseModel, Field
-from pydantic import validator
-from itertools import groupby
+from typing import Callable, ClassVar, Dict, List, Mapping, Tuple, Type, TypeVar
 
 import numpy as np
-
 from monty.json import MontyDecoder
 from monty.tempfile import ScratchDir
-
-from pymatgen.core import Structure, Element
-from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
-from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
-from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
-from pymatgen.analysis.phase_diagram import PhaseDiagram
-from pymatgen.analysis.defects.core import Defect, Adsorbate
+from pydantic import BaseModel, Field
+from pymatgen.analysis.defects.core import Adsorbate, Defect
 from pymatgen.analysis.defects.corrections.freysoldt import (
-    get_freysoldt_correction,
     get_freysoldt2d_correction,
+    get_freysoldt_correction,
 )
+from pymatgen.analysis.defects.finder import DefectSiteFinder
 from pymatgen.analysis.defects.thermo import (
     DefectEntry,
     DefectSiteFinder,
-    MultiFormationEnergyDiagram
+    MultiFormationEnergyDiagram,
 )
-from pymatgen.analysis.defects.finder import DefectSiteFinder
+from pymatgen.analysis.phase_diagram import PhaseDiagram
+from pymatgen.core import Element
+from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
+from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
+from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
 
 from atomate2 import SETTINGS
 from atomate2.common.schemas.structure import StructureMetadata
-from atomate2.cp2k.schemas.calc_types.enums import CalcType, TaskType, RunType
+from atomate2.cp2k.schemas.calc_types.enums import RunType
 from atomate2.cp2k.schemas.task import TaskDocument
 
 __all__ = ["DefectDoc"]
@@ -38,6 +33,7 @@
 S = TypeVar("S", bound="DefectiveMaterialDoc")
 V = TypeVar("V", bound="DefectValidation")
 
+
 class DefectDoc(StructureMetadata):
     """
     A document used to represent a single defect. e.g. a O vacancy with a -2 charge.
@@ -57,8 +53,12 @@ class DefectDoc(StructureMetadata):
     material_id: str = Field(
         None, description="Unique material ID for the bulk material"
     )  # TODO Change to MPID
-    defect_ids: Mapping[RunType, str] = Field(None, description="Map run types of defect entry to task id")
-    bulk_ids: Mapping[RunType, str] = Field(None, description="Map run types of bulk entry to task id")
+    defect_ids: Mapping[RunType, str] = Field(
+        None, description="Map run types of defect entry to task id"
+    )
+    bulk_ids: Mapping[RunType, str] = Field(
+        None, description="Map run types of bulk entry to task id"
+    )
     task_ids: List[str] = Field(
         None, description="All defect task ids used in creating this defect doc."
     )
@@ -81,9 +81,13 @@ class DefectDoc(StructureMetadata):
         default_factory=datetime.utcnow,
     )
     metadata: Dict = Field(None, description="Metadata for this defect")
-    valid: Mapping[RunType, Dict] = Field(None, description="Whether each run type has a valid entry")
+    valid: Mapping[RunType, Dict] = Field(
+        None, description="Whether each run type has a valid entry"
+    )
 
-    def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="task_id"):
+    def update_one(
+        self, defect_task, bulk_task, dielectric, query="defect", key="task_id"
+    ):
 
         # Metadata
         self.last_updated = datetime.now()
@@ -92,25 +96,25 @@ def update_one(self, defect_task, bulk_task, dielectric, query="defect", key="ta
         defect = self.get_defect_from_task(query=query, task=defect_task)
         d_id = defect_task[key]
         b_id = bulk_task[key]
-        defect_task = TaskDocument(**defect_task['output'])
-        bulk_task = TaskDocument(**bulk_task['output']) # TODO Atomate2Store 
+        defect_task = TaskDocument(**defect_task["output"])
+        bulk_task = TaskDocument(**bulk_task["output"])  # TODO Atomate2Store
         defect_entry, valid = self.get_defect_entry_from_tasks(
             defect_task, bulk_task, defect, dielectric
         )
         bulk_entry = self.get_bulk_entry_from_task(bulk_task)
 
         rt = defect_task.calcs_reversed[0].run_type
-        tt = defect_task.calcs_reversed[0].task_type
-        ct = defect_task.calcs_reversed[0].calc_type
-        current_largest_sc = self.defect_entries[rt].sc_entry.composition.num_atoms if rt in self.defect_entries else 0
+        defect_task.calcs_reversed[0].task_type
+        defect_task.calcs_reversed[0].calc_type
+        current_largest_sc = (
+            self.defect_entries[rt].sc_entry.composition.num_atoms
+            if rt in self.defect_entries
+            else 0
+        )
         potential_largest_sc = defect_entry.sc_entry.composition.num_atoms
-        if (
-            potential_largest_sc > current_largest_sc
-            or (
-                potential_largest_sc == current_largest_sc
-                and defect_entry.sc_entry.energy
-                < self.defect_entries[rt].sc_entry.energy
-            )
+        if potential_largest_sc > current_largest_sc or (
+            potential_largest_sc == current_largest_sc
+            and defect_entry.sc_entry.energy < self.defect_entries[rt].sc_entry.energy
         ):
             self.defect_entries[rt] = defect_entry
             self.defect_ids[rt] = d_id
@@ -135,7 +139,15 @@ def update_many(
             )
 
     @classmethod
-    def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect", key="task_id", material_id=None) -> T:
+    def from_tasks(
+        cls: Type[T],
+        defect_task,
+        bulk_task,
+        dielectric,
+        query="defect",
+        key="task_id",
+        material_id=None,
+    ) -> T:
         """
         The standard way to create this document.
         Args:
@@ -147,16 +159,18 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
         defect = cls.get_defect_from_task(query=query, task=defect_task)
         defect_task = TaskDocument(**defect_task["output"])
         bulk_task_id = bulk_task[key]
-        bulk_task = TaskDocument(**bulk_task['output'])
+        bulk_task = TaskDocument(**bulk_task["output"])
 
         # Metadata
-        last_updated = datetime.now() 
-        created_at = datetime.now() 
+        last_updated = datetime.now()
+        created_at = datetime.now()
 
         rt = defect_task.calcs_reversed[0].run_type
 
         metadata = {}
-        defect_entry, valid = cls.get_defect_entry_from_tasks(defect_task, bulk_task, defect, dielectric)
+        defect_entry, valid = cls.get_defect_entry_from_tasks(
+            defect_task, bulk_task, defect, dielectric
+        )
         valid = {rt: valid}
         defect_entries = {rt: defect_entry}
         bulk_entries = {rt: cls.get_bulk_entry_from_task(bulk_task)}
@@ -187,7 +201,9 @@ def from_tasks(cls: Type[T], defect_task, bulk_task, dielectric, query="defect",
             "metadata": metadata,
             "valid": valid,
         }
-        prim = SpacegroupAnalyzer(defect_entries[rt].defect.structure).get_primitive_standard_structure()
+        prim = SpacegroupAnalyzer(
+            defect_entries[rt].defect.structure
+        ).get_primitive_standard_structure()
         data.update(StructureMetadata.from_structure(prim).dict())
         return cls(**data)
 
@@ -229,7 +245,7 @@ def get_defect_entry_from_tasks(
             sc_defect_frac_coords=parameters["defect_frac_sc_coords"],
             corrections=corrections,
         )
-        parameters['defect'] = defect
+        parameters["defect"] = defect
         valid = DefectValidation().process_entry(parameters)
         return defect_entry, valid
 
@@ -255,7 +271,9 @@ def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
         if parameters["charge_state"] and not parameters.get("2d"):
             es, pot, met = get_freysoldt_correction(
                 q=parameters["charge_state"],
-                dielectric=np.array(parameters["dielectric"]), # TODO pmg-analysis expects np array here
+                dielectric=np.array(
+                    parameters["dielectric"]
+                ),  # TODO pmg-analysis expects np array here
                 defect_locpot=parameters["defect_v_hartree"],
                 bulk_locpot=parameters["bulk_v_hartree"],
                 defect_frac_coords=parameters["defect_frac_sc_coords"],
@@ -268,7 +286,7 @@ def get_freysoldt2d_correction(cls, parameters):
 
         from pymatgen.io.vasp.outputs import VolumetricData as VaspVolumetricData
 
-        if False: #parameters["charge_state"] and parameters.get("2d"):
+        if parameters["charge_state"] and parameters.get("2d"):
             eps_parallel = (
                 parameters["dielectric"][0][0] + parameters["dielectric"][1][1]
             ) / 2
@@ -354,19 +372,22 @@ def get_parameters_from_tasks(
 
         return parameters
 
+
 class DefectValidation(BaseModel):
     """Validate a task document for defect processing"""
 
     MAX_ATOMIC_RELAXATION: float = Field(
-        0.02, 
-        description="Threshold for the mean absolute displacement of atoms outside a defect's radius of isolution"
-        )
+        0.02,
+        description="Threshold for the mean absolute displacement of atoms outside a defect's radius of isolution",
+    )
 
-    DESORPTION_DISTANCE: float = Field(3, description="Distance to consider adsorbate as desorbed")
+    DESORPTION_DISTANCE: float = Field(
+        3, description="Distance to consider adsorbate as desorbed"
+    )
 
     def process_entry(self, parameters) -> V:
         """Gets a dictionary of {validator: result}. Result true for passing, false for failing."""
-        v = {} 
+        v = {}
         v.update(self._atomic_relaxation(parameters))
         v.update(self._desorption(parameters))
         return v
@@ -375,10 +396,16 @@ def _atomic_relaxation(self, parameters):
         """Returns false if the mean displacement outside the isolation radius is greater than the cutoff"""
         in_struc = parameters["initial_defect_structure"]
         out_struc = parameters["final_defect_structure"]
-        sites = out_struc.get_sites_in_sphere(parameters['defect_frac_sc_coords'], get_truncated_coulomb_cutoff(in_struc), include_index=True)
+        sites = out_struc.get_sites_in_sphere(
+            parameters["defect_frac_sc_coords"],
+            get_truncated_coulomb_cutoff(in_struc),
+            include_index=True,
+        )
         inside_sphere = [site.index for site in sites]
         outside_sphere = [i for i in range(len(out_struc)) if i not in inside_sphere]
-        distances = np.array([site.distance(in_struc[i]) for i, site in enumerate(out_struc)])
+        distances = np.array(
+            [site.distance(in_struc[i]) for i, site in enumerate(out_struc)]
+        )
         distances_outside = distances[outside_sphere]
         if np.mean(distances_outside) > self.MAX_ATOMIC_RELAXATION:
             return {"atomic_relaxation": False}
@@ -386,16 +413,24 @@ def _atomic_relaxation(self, parameters):
 
     def _desorption(self, parameters):
         """Returns false if any atom is too far from all other atoms."""
-        if isinstance(parameters['defect'], Adsorbate):
+        if isinstance(parameters["defect"], Adsorbate):
             out_struc = parameters["final_defect_structure"]
-            defect_site =  out_struc.get_sites_in_sphere(
-                out_struc.lattice.get_cartesian_coords(parameters['defect_frac_sc_coords']), 
-                0.1, include_index=True
-                )[0]
-            distances = [defect_site.distance(site) for i, site in enumerate(out_struc) if i != defect_site.index]
+            defect_site = out_struc.get_sites_in_sphere(
+                out_struc.lattice.get_cartesian_coords(
+                    parameters["defect_frac_sc_coords"]
+                ),
+                0.1,
+                include_index=True,
+            )[0]
+            distances = [
+                defect_site.distance(site)
+                for i, site in enumerate(out_struc)
+                if i != defect_site.index
+            ]
             if all(d > self.DESORPTION_DISTANCE for d in distances):
-                return {'desorption': False}
-        return {'desorption': True}
+                return {"desorption": False}
+        return {"desorption": True}
+
 
 class DefectiveMaterialDoc(StructureMetadata):
     """Document containing all / many defect tasks for a single material ID"""
@@ -431,7 +466,7 @@ def from_docs(cls: Type["S"], defect_docs: DefectDoc, material_id: str) -> S:
 
     @property
     def element_set(self) -> set:
-        els = set(Element(e) for e in self.defect_docs[0].defect.structure.symbol_set)
+        els = {Element(e) for e in self.defect_docs[0].defect.structure.symbol_set}
         for d in self.defect_docs:
             els = els | set(d.defect.element_changes.keys())
         return els

From bc3e3775cb4f9a96e07d387ef3bddeacfd8c729d Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 20 Jan 2023 11:48:03 -0800
Subject: [PATCH 45/50] Temporary v_hartree solution for 2d

---
 src/atomate2/cp2k/schemas/defect.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 800f50dbf4..e2a2466a36 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -294,6 +294,11 @@ def get_freysoldt2d_correction(cls, parameters):
             dielectric = (eps_parallel - 1) / (1 - 1 / eps_perp)
             with ScratchDir("."):
 
+                # TODO builder ensure structures are commensurate, but the sxdefectalign2d requires exact match
+                # between structures (to about 6 digits of precision). No good solution right now,
+                # Just setting def lattice with bulk lattice, which will shift the locpot data
+                parameters["defect_v_hartree"].structure.lattice = parameters["bulk_v_hartree"].structure
+
                 lref = VaspVolumetricData(
                     structure=parameters["bulk_v_hartree"].structure,
                     data=parameters["bulk_v_hartree"].data,

From e41ee3df267b33b510a7a549bf4f32a6fa515b72 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 20 Jan 2023 11:48:22 -0800
Subject: [PATCH 46/50] lint

---
 src/atomate2/cp2k/schemas/defect.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index e2a2466a36..8312a1c9dc 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -297,7 +297,9 @@ def get_freysoldt2d_correction(cls, parameters):
                 # TODO builder ensure structures are commensurate, but the sxdefectalign2d requires exact match
                 # between structures (to about 6 digits of precision). No good solution right now,
                 # Just setting def lattice with bulk lattice, which will shift the locpot data
-                parameters["defect_v_hartree"].structure.lattice = parameters["bulk_v_hartree"].structure
+                parameters["defect_v_hartree"].structure.lattice = parameters[
+                    "bulk_v_hartree"
+                ].structure
 
                 lref = VaspVolumetricData(
                     structure=parameters["bulk_v_hartree"].structure,

From 7a392347da5c1d772f307e8f3bc366359336a38e Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 20 Jan 2023 15:12:18 -0800
Subject: [PATCH 47/50] updates

---
 src/atomate2/cp2k/jobs/defect.py | 30 ++++----------------
 src/atomate2/cp2k/sets/defect.py | 48 +++++++++++++++++---------------
 2 files changed, 30 insertions(+), 48 deletions(-)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 39a4542679..37af4f658c 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -19,7 +19,6 @@
     DefectHybridRelaxSetGenerator,
     DefectHybridStaticSetGenerator,
     DefectRelaxSetGenerator,
-    DefectSetGenerator,
     DefectStaticSetGenerator,
 )
 
@@ -96,7 +95,7 @@ def make(self, defect: Defect | Structure, prev_cp2k_dir: str | Path | None = No
 class DefectStaticMaker(BaseDefectMaker):
 
     name: str = field(default="defect static")
-    input_set_generator: DefectSetGenerator = field(
+    input_set_generator: Cp2kInputGenerator = field(
         default_factory=DefectStaticSetGenerator
     )
 
@@ -148,23 +147,16 @@ class DefectHybridStaticMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid static")
     hybrid_functional: str = "PBE0"
-    input_set_generator: DefectSetGenerator = field(
+    input_set_generator: Cp2kInputGenerator = field(
         default_factory=DefectHybridStaticSetGenerator
     )
 
-    def __post_init__(self):
-        """Update the input settings with hybrid_functional attribute"""
-        self.input_set_generator.user_input_settings.update(
-            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
-        )
-
-
 @dataclass
 class DefectHybridRelaxMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid relax")
     hybrid_functional: str = "PBE0"
-    input_set_generator: DefectSetGenerator = field(
+    input_set_generator: Cp2kInputGenerator = field(
         default_factory=DefectHybridRelaxSetGenerator
     )
     transformations: tuple[str, ...] = field(
@@ -174,19 +166,13 @@ class DefectHybridRelaxMaker(BaseDefectMaker):
         default=({"distance": 0.01},)
     )
 
-    def __post_init__(self):
-        """Update the input settings with hybrid_functional attribute"""
-        self.input_set_generator.user_input_settings.update(
-            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
-        )
-
 
 @dataclass
 class DefectHybridCellOptMaker(BaseDefectMaker):
 
     name: str = field(default="defect hybrid cell opt")
     hybrid_functional: str = "PBE0"
-    input_set_generator: DefectSetGenerator = field(
+    input_set_generator: Cp2kInputGenerator = field(
         default_factory=DefectHybridCellOptSetGenerator
     )
     transformations: tuple[str, ...] = field(
@@ -194,10 +180,4 @@ class DefectHybridCellOptMaker(BaseDefectMaker):
     )
     transformation_params: tuple[dict, ...] | None = field(
         default=({"distance": 0.01},)
-    )
-
-    def __post_init__(self):
-        """Update the input settings with hybrid_functional attribute"""
-        self.input_set_generator.user_input_settings.update(
-            {"activate_hybrid": {"hybrid_functional": self.hybrid_functional}}
-        )
+    )
\ No newline at end of file
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 33f06e5019..3aee7ed977 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -6,6 +6,7 @@
 from dataclasses import dataclass
 
 from pymatgen.core import Structure
+from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
 
 from atomate2.cp2k.sets.base import Cp2kInputGenerator
 from atomate2.cp2k.sets.core import (
@@ -16,49 +17,50 @@
 
 DEFECT_SET_UPDATES = {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
 
-@dataclass
-class DefectSetGenerator(Cp2kInputGenerator):
-    """
-    Base input set generator for defect calculations. Adds printing of the
-    partial density of states and the electrostatic potential.
-    """
-
-    def get_input_updates(self, structure: Structure, *args, **kwargs) -> dict:
-        """Get input updates"""
-        return {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
-
 @dataclass
 class DefectStaticSetGenerator(StaticSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
 
 @dataclass
 class DefectRelaxSetGenerator(RelaxSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
 
 @dataclass
 class DefectCellOptSetGenerator(CellOptSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
 
 @dataclass
 class DefectHybridStaticSetGenerator(HybridStaticSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
 
 @dataclass
 class DefectHybridRelaxSetGenerator(HybridRelaxSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
 
 @dataclass
 class DefectHybridCellOptSetGenerator(HybridCellOptSetGenerator):
 
-    def __post_init__(self):
-        self.user_input_settings.update(DEFECT_SET_UPDATES)
\ No newline at end of file
+    def get_input_updates(self, *args, **kwargs) -> dict:
+        updates = super().get_input_updates(*args, **kwargs)
+        updates.update(DEFECT_SET_UPDATES)
+        return updates
\ No newline at end of file

From df484551ae8e9934133db32b4cf49c88c71e8162 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Fri, 20 Jan 2023 15:12:32 -0800
Subject: [PATCH 48/50] lint

---
 src/atomate2/cp2k/jobs/defect.py |  3 ++-
 src/atomate2/cp2k/sets/defect.py | 29 +++++++++++++++--------------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/atomate2/cp2k/jobs/defect.py b/src/atomate2/cp2k/jobs/defect.py
index 37af4f658c..fcaeec9817 100644
--- a/src/atomate2/cp2k/jobs/defect.py
+++ b/src/atomate2/cp2k/jobs/defect.py
@@ -151,6 +151,7 @@ class DefectHybridStaticMaker(BaseDefectMaker):
         default_factory=DefectHybridStaticSetGenerator
     )
 
+
 @dataclass
 class DefectHybridRelaxMaker(BaseDefectMaker):
 
@@ -180,4 +181,4 @@ class DefectHybridCellOptMaker(BaseDefectMaker):
     )
     transformation_params: tuple[dict, ...] | None = field(
         default=({"distance": 0.01},)
-    )
\ No newline at end of file
+    )
diff --git a/src/atomate2/cp2k/sets/defect.py b/src/atomate2/cp2k/sets/defect.py
index 3aee7ed977..0ac4200512 100644
--- a/src/atomate2/cp2k/sets/defect.py
+++ b/src/atomate2/cp2k/sets/defect.py
@@ -5,62 +5,63 @@
 import logging
 from dataclasses import dataclass
 
-from pymatgen.core import Structure
-from pymatgen.io.cp2k.utils import get_truncated_coulomb_cutoff
-
-from atomate2.cp2k.sets.base import Cp2kInputGenerator
 from atomate2.cp2k.sets.core import (
-    StaticSetGenerator, RelaxSetGenerator, CellOptSetGenerator,
-    HybridStaticSetGenerator, HybridRelaxSetGenerator, HybridCellOptSetGenerator
+    CellOptSetGenerator,
+    HybridCellOptSetGenerator,
+    HybridRelaxSetGenerator,
+    HybridStaticSetGenerator,
+    RelaxSetGenerator,
+    StaticSetGenerator,
 )
+
 logger = logging.getLogger(__name__)
 
-DEFECT_SET_UPDATES = {'print_v_hartree': True, "print_pdos": True, "print_dos": True}
+DEFECT_SET_UPDATES = {"print_v_hartree": True, "print_pdos": True, "print_dos": True}
+
 
 @dataclass
 class DefectStaticSetGenerator(StaticSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
         return updates
 
+
 @dataclass
 class DefectRelaxSetGenerator(RelaxSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
         return updates
 
+
 @dataclass
 class DefectCellOptSetGenerator(CellOptSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
         return updates
 
+
 @dataclass
 class DefectHybridStaticSetGenerator(HybridStaticSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
         return updates
 
+
 @dataclass
 class DefectHybridRelaxSetGenerator(HybridRelaxSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
         return updates
 
+
 @dataclass
 class DefectHybridCellOptSetGenerator(HybridCellOptSetGenerator):
-
     def get_input_updates(self, *args, **kwargs) -> dict:
         updates = super().get_input_updates(*args, **kwargs)
         updates.update(DEFECT_SET_UPDATES)
-        return updates
\ No newline at end of file
+        return updates

From d17ec26425c239335f4230856eaf0df26b770d32 Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Tue, 7 Mar 2023 10:24:41 -0800
Subject: [PATCH 49/50] defects

---
 src/atomate2/cp2k/flows/defect.py   | 25 ++++++++-------
 src/atomate2/cp2k/schemas/defect.py | 47 +++++++++++++++++------------
 src/atomate2/cp2k/sets/base.py      |  2 +-
 3 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/src/atomate2/cp2k/flows/defect.py b/src/atomate2/cp2k/flows/defect.py
index 1faefcf425..adc705037f 100644
--- a/src/atomate2/cp2k/flows/defect.py
+++ b/src/atomate2/cp2k/flows/defect.py
@@ -67,7 +67,8 @@ class DefectHybridCellOptFlowMaker(HybridCellOptFlowMaker):
     )
 
 
-# TODO close to being able to put this in common. Just need a switch that decides which core flow/job to use based on software
+# TODO close to being able to put this in common. Just need a switch that decides
+# which core flow/job to use based on software
 @dataclass
 class FormationEnergyMaker(Maker):
     """
@@ -85,12 +86,14 @@ class FormationEnergyMaker(Maker):
     initialize_with_pbe: If hybrid functional is provided, this enables
         the use of a static PBE run before the hybrid calc to provide a
         starting guess for CP2K HF module.
-    supercell_matrix: If provided, the defect supercell wil lbe created
+    supercell_matrix: If provided, the defect supercell will be created
         by this 3x3 matrix. Else other parameters will be used.
     max_atoms: Maximum number of atoms allowed in the supercell.
     min_atoms: Minimum number of atoms allowed in the supercell.
-    min_length: Minimum length of the smallest supercell lattice vector.
-    force_diagonal: If True, return a transformation with a diagonal transformation matrix.
+    min_length: Minimum length of the smallest supercell lattice
+        vector.
+    force_diagonal: If True, return a transformation with a
+        diagonal transformation matrix.
     """
 
     name: str = "defect formation energy"
@@ -178,8 +181,8 @@ def make(
         flow: Flow
             The workflow to calculate the formation energy diagram.
         """
-        jobs, defect_outputs = [], {}
-        defect_outputs = {
+        jobs = []
+        defect_outputs: dict[str, dict[int, tuple[Defect, OutputReference]]] = {
             defect.name: {} for defect in defects
         }  # TODO DEFECT NAMES ARE NOT UNIQUE HASHES
         bulk_structure = ensure_defects_same_structure(defects)
@@ -205,8 +208,8 @@ def make(
             jobs.append(bulk_job)
 
         for defect in defects:
-            if charges == True:
-                chgs = defect.get_charge_states() if charges else [0]
+            if charges is True:
+                chgs = defect.get_charge_states()
             else:
                 chgs = charges if charges else [0]
             for charge in chgs:
@@ -251,7 +254,7 @@ def collect_defect_outputs(
     dielectric:
         The dielectric constant used to construct the formation energy diagram.
     """
-    outputs = {"results": {}}
+    outputs: dict[str, dict[str, dict]] = {"results": {}}
     if not dielectric:
         logger.warn(
             "Dielectric constant not provided. Defect formation energies will be uncorrected."
@@ -261,7 +264,7 @@ def collect_defect_outputs(
         fnv_plots = {}
         for charge, defect_and_output in defects_with_charges.items():
             defect, output_with_charge = defect_and_output
-            logger.info(f"Processing {defect.name} with charge state={charge}")
+            logger.info(f"Processing {defect_name} with charge state={charge}")
             defect_entry = DefectEntry(
                 defect=defect,
                 charge_state=charge,
@@ -281,7 +284,7 @@ def collect_defect_outputs(
                 dielectric=dielectric,
             )
             fnv_plots[int(charge)] = plot_data
-        outputs["results"][defect.name] = dict(
+        outputs["results"][defect_name] = dict(
             defect=defect, defect_entries=defect_entries, fnv_plots=fnv_plots
         )
     return outputs
diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 8312a1c9dc..112341dc60 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from typing import Callable, ClassVar, Dict, List, Mapping, Tuple, Type, TypeVar
+from typing import Callable, ClassVar, Dict, List, Mapping, Set, Tuple, Type, TypeVar
 
 import numpy as np
 from monty.json import MontyDecoder
@@ -11,11 +11,7 @@
     get_freysoldt_correction,
 )
 from pymatgen.analysis.defects.finder import DefectSiteFinder
-from pymatgen.analysis.defects.thermo import (
-    DefectEntry,
-    DefectSiteFinder,
-    MultiFormationEnergyDiagram,
-)
+from pymatgen.analysis.defects.thermo import DefectEntry, MultiFormationEnergyDiagram
 from pymatgen.analysis.phase_diagram import PhaseDiagram
 from pymatgen.core import Element
 from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
@@ -25,7 +21,7 @@
 from atomate2 import SETTINGS
 from atomate2.common.schemas.structure import StructureMetadata
 from atomate2.cp2k.schemas.calc_types.enums import RunType
-from atomate2.cp2k.schemas.task import TaskDocument
+from atomate2.cp2k.schemas.task import Cp2kObject, TaskDocument
 
 __all__ = ["DefectDoc"]
 
@@ -221,8 +217,8 @@ def get_defect_entry_from_tasks(
         Args:
             defect_task: task dict for the defect calculation
             bulk_task: task dict for the bulk calculation
-            dielectric: Dielectric doc if the defect is charged. If not present, no dielectric
-                corrections will be performed, even if the defect is charged.
+            dielectric: Dielectric doc if the defect is charged. If not present, no
+                dielectric corrections will be performed, even if the defect is charged.
             query: Mongo-style query to retrieve the defect object from the defect task
         """
         parameters = cls.get_parameters_from_tasks(
@@ -294,12 +290,14 @@ def get_freysoldt2d_correction(cls, parameters):
             dielectric = (eps_parallel - 1) / (1 - 1 / eps_perp)
             with ScratchDir("."):
 
-                # TODO builder ensure structures are commensurate, but the sxdefectalign2d requires exact match
-                # between structures (to about 6 digits of precision). No good solution right now,
-                # Just setting def lattice with bulk lattice, which will shift the locpot data
+                # TODO: the builder ensures the structures are commensurate, but
+                # sxdefectalign2d requires an exact match between structures
+                # (to about 6 digits of precision). No good solution right now;
+                # just overwrite the defect lattice with the bulk lattice, which
+                # will shift the locpot data.
                 parameters["defect_v_hartree"].structure.lattice = parameters[
                     "bulk_v_hartree"
-                ].structure
+                ].structure.lattice
 
                 lref = VaspVolumetricData(
                     structure=parameters["bulk_v_hartree"].structure,
@@ -337,7 +335,8 @@ def get_parameters_from_tasks(
         cls, defect_task: TaskDocument, bulk_task: TaskDocument
     ):
         """
-        Get parameters necessary to create a defect entry from defect and bulk task dicts
+        Get parameters necessary to create a defect entry from defect and bulk
+        task dicts
         Args:
             defect_task: task dict for the defect calculation
             bulk_task: task dict for the bulk calculation
@@ -367,10 +366,10 @@ def get_parameters_from_tasks(
             "charge_state": defect_task.output.structure.charge,
             "defect_frac_sc_coords": defect_frac_sc_coords,
             "defect_v_hartree": MontyDecoder().process_decoded(
-                defect_task.cp2k_objects["v_hartree"]
+                defect_task.cp2k_objects[Cp2kObject.v_hartree]  # type: ignore
             ),  # TODO CP2K spec name
             "bulk_v_hartree": MontyDecoder().process_decoded(
-                bulk_task.cp2k_objects["v_hartree"]
+                bulk_task.cp2k_objects[Cp2kObject.v_hartree]  # type: ignore
             ),  # TODO CP2K spec name
         }
 
@@ -392,15 +391,21 @@ class DefectValidation(BaseModel):
         3, description="Distance to consider adsorbate as desorbed"
     )
 
-    def process_entry(self, parameters) -> V:
-        """Gets a dictionary of {validator: result}. Result true for passing, false for failing."""
+    def process_entry(self, parameters) -> Dict:
+        """
+        Gets a dictionary of {validator: result}. Result true for passing,
+        false for failing.
+        """
         v = {}
         v.update(self._atomic_relaxation(parameters))
         v.update(self._desorption(parameters))
         return v
 
     def _atomic_relaxation(self, parameters):
-        """Returns false if the mean displacement outside the isolation radius is greater than the cutoff"""
+        """
+        Returns false if the mean displacement outside the isolation radius is greater
+        than the cutoff.
+        """
         in_struc = parameters["initial_defect_structure"]
         out_struc = parameters["final_defect_structure"]
         sites = out_struc.get_sites_in_sphere(
@@ -487,10 +492,12 @@ def get_formation_energy_diagram(
     ) -> MultiFormationEnergyDiagram:
 
         filters = filters if filters else [lambda _: True]
-        els = set()
+        els: Set[Element] = set()
         defect_entries = []
         bulk_entries = []
         vbms = []
+        if isinstance(run_type, str):
+            run_type = RunType(run_type)
         for doc in filter(lambda x: all(f(x) for f in filters), self.defect_docs):
             if doc.defect_entries.get(run_type):
                 els = els | set(doc.defect.element_changes.keys())
diff --git a/src/atomate2/cp2k/sets/base.py b/src/atomate2/cp2k/sets/base.py
index e7a685dbb2..8ac5d83f75 100644
--- a/src/atomate2/cp2k/sets/base.py
+++ b/src/atomate2/cp2k/sets/base.py
@@ -189,7 +189,7 @@ class Cp2kInputGenerator(InputGenerator):
     user_input_settings: dict = field(default_factory=dict)
     user_kpoints_settings: dict | Kpoints = field(default_factory=dict)
     auto_kspacing: bool = True
-    use_structure_charge: bool = False
+    use_structure_charge: bool = True
     sort_structure: bool = True
     symprec: float = SETTINGS.SYMPREC
     force_gamma: bool = False

From e856206f393849205e578c40cafcf29844029bba Mon Sep 17 00:00:00 2001
From: nwinner <nwinner@berkeley.edu>
Date: Sun, 26 Mar 2023 16:15:07 -0700
Subject: [PATCH 50/50] freysoldt

---
 src/atomate2/cp2k/schemas/defect.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/atomate2/cp2k/schemas/defect.py b/src/atomate2/cp2k/schemas/defect.py
index 112341dc60..e78d01da96 100644
--- a/src/atomate2/cp2k/schemas/defect.py
+++ b/src/atomate2/cp2k/schemas/defect.py
@@ -176,7 +176,7 @@ def from_tasks(
             "intrinsic"
             if all(
                 el in defect_entries[rt].defect.structure.composition
-                for el in defect_entries[rt].defect.element_changes.keys()
+                for el in defect_entries[rt].defect.element_changes
             )
             else "extrinsic"
         )
@@ -265,7 +265,7 @@ def get_correction_from_parameters(cls, parameters) -> Tuple[Dict, Dict]:
     @classmethod
     def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
         if parameters["charge_state"] and not parameters.get("2d"):
-            es, pot, met = get_freysoldt_correction(
+            result = get_freysoldt_correction(
                 q=parameters["charge_state"],
                 dielectric=np.array(
                     parameters["dielectric"]
@@ -274,7 +274,7 @@ def get_freysoldt_correction(cls, parameters) -> Tuple[Dict, Dict]:
                 bulk_locpot=parameters["bulk_v_hartree"],
                 defect_frac_coords=parameters["defect_frac_sc_coords"],
             )
-            return {"electrostatic": es, "potential_alignment": pot}, met
+            return {"freysoldt": result.correction_energy}, result.metadata
         return {}, {}
 
     @classmethod
@@ -310,7 +310,7 @@ def get_freysoldt2d_correction(cls, parameters):
                 lref.write_file("LOCPOT.ref")
                 ldef.write_file("LOCPOT.def")
 
-                es, pot, met = get_freysoldt2d_correction(
+                result = get_freysoldt2d_correction(
                     q=parameters["charge_state"],
                     dielectric=dielectric,
                     defect_locpot=ldef,
@@ -319,7 +319,7 @@ def get_freysoldt2d_correction(cls, parameters):
                     energy_cutoff=520,
                     slab_buffer=2,
                 )
-                return {"electrostatic": es, "potential_alignment": pot}, met
+                return {"freysoldt": result.correction_energy}, result.metadata
         return {}, {}
 
     @classmethod
@@ -339,7 +339,7 @@ def get_parameters_from_tasks(
         task dicts
         Args:
             defect_task: task dict for the defect calculation
-            bulk_task: task dict for the bulk calculation
+            bulk_task: task dict for the bulk calculation.
         """
         final_defect_structure = defect_task.structure
         final_bulk_structure = bulk_task.structure
@@ -380,7 +380,7 @@ def get_parameters_from_tasks(
 
 
 class DefectValidation(BaseModel):
-    """Validate a task document for defect processing"""
+    """Validate a task document for defect processing."""
 
     MAX_ATOMIC_RELAXATION: float = Field(
         0.02,
@@ -445,7 +445,7 @@ def _desorption(self, parameters):
 
 
 class DefectiveMaterialDoc(StructureMetadata):
-    """Document containing all / many defect tasks for a single material ID"""
+    """Document containing all / many defect tasks for a single material ID."""
 
     property_name: ClassVar[str] = "defective material"