From a4b78ebd16eb92f856e9fd2bc3ddc54e7c1d46b6 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sat, 9 Mar 2024 15:47:04 -0800
Subject: [PATCH 1/7] Add support for cheffing QTI IMSCP files.

---
 ricecooker/classes/files.py | 151 +++++++++++++++++++++++++++++++++++-
 ricecooker/classes/nodes.py |  75 ++++++++++++++++++
 setup.py                    |   1 +
 3 files changed, 225 insertions(+), 2 deletions(-)

diff --git a/ricecooker/classes/files.py b/ricecooker/classes/files.py
index f6451533..b5a997d8 100644
--- a/ricecooker/classes/files.py
+++ b/ricecooker/classes/files.py
@@ -2,8 +2,10 @@
 from __future__ import unicode_literals
 
 import hashlib
+import io
 import json
 import os
+import re
 import shutil
 import tempfile
 import zipfile
@@ -12,13 +14,16 @@
 from urllib.parse import urlparse
 from xml.etree import ElementTree
 
+import chardet
 import filetype
+import xmltodict
 import yt_dlp
 from cachecontrol.caches.file_cache import FileCache
 from le_utils.constants import exercises
 from le_utils.constants import file_formats
 from le_utils.constants import format_presets
 from le_utils.constants import languages
+from lxml import etree
 from PIL import Image
 from PIL import UnidentifiedImageError
 from requests.exceptions import ConnectionError
@@ -745,6 +750,9 @@ def get_preset(self):
         return self.preset or format_presets.EPUB
 
 
+no_index_zips = set([format_presets.HTML5_DEPENDENCY_ZIP, format_presets.QTI_ZIP])
+
+
 class HTMLZipFile(DownloadFile):
     default_ext = file_formats.HTML5
     allowed_formats = [file_formats.HTML5]
@@ -757,8 +765,8 @@ def process_file(self):
         self.filename = super(HTMLZipFile, self).process_file()
         if self.filename:
             try:
-                # make sure index.html exists unless this is a dependency (i.e. shared resources) zip
-                if not self.get_preset() == format_presets.HTML5_DEPENDENCY_ZIP:
+                # make sure index.html exists unless this is a dependency file or IMS content package
+                if self.get_preset() not in no_index_zips:
                     with zipfile.ZipFile(config.get_storage_path(self.filename)) as zf:
                         _ = zf.getinfo("index.html")
             except KeyError as err:
@@ -768,6 +776,145 @@ def process_file(self):
         return self.filename
 
 
+class IMSCPZipFile(HTMLZipFile):
+    def strip_ns_prefix(self, tree):
+        """Strip namespace prefixes from an LXML tree.
+        From https://stackoverflow.com/a/30233635
+        """
+        for element in tree.xpath("descendant-or-self::*[namespace-uri()!='']"):
+            element.tag = etree.QName(element).localname
+
+    def strip_langstring(self, tree):
+        """Replace all langstring elements with their text value."""
+        for ls in tree.xpath(".//langstring"):
+            ls.tail = ls.text + ls.tail if ls.tail else ls.text
+        etree.strip_elements(tree, "langstring", with_tail=False)
+
+    def collect_metadata(self, metadata_elem):
+        self.strip_ns_prefix(metadata_elem)
+        self.strip_langstring(metadata_elem)
+        metadata_dict = {}
+
+        for tag in ("general", "rights", "educational", "lifecycle"):
+            elem = metadata_elem.find("lom/%s" % tag)
+            if elem:
+                metadata_dict.update(xmltodict.parse(etree.tostring(elem)))
+
+        return metadata_dict
+
+    def get_manifest(self):
+        with zipfile.ZipFile(config.get_storage_path(self.filename)) as zf:
+            manifest_file = zf.open("imsmanifest.xml")
+            try:
+                return etree.parse(manifest_file).getroot()
+            except etree.XMLSyntaxError:
+                # we've run across XML files that are marked as UTF-8 encoded but which have non-UTF-8 characters in them
+                # for this case, detect the 'real' encoding and decode it as unicode, then make it actual UTF-8 and parse.
+                f = zf.open("imsmanifest.xml", "rb")
+                data = f.read()
+                f.close()
+
+                info = chardet.detect(data)
+                data = data.decode(info["encoding"])
+                return etree.parse(io.BytesIO(data.encode("utf-8"))).getroot()
+
+    def walk_items(self, root):
+        root_dict = dict(root.items())
+
+        title_elem = root.find("title", root.nsmap)
+        if title_elem is not None:
+            # title_elem.text has issues when there are BR tags. Instead get ALL text, ignoring BR tags.
+            # As BR tags do not make sense in metadata, we can assume it's an editor glitch causing it.
+            text = ""
+            for child in title_elem.iter():
+                if child.text:
+                    text += child.text
+                if child.tail:
+                    text += child.tail
+            assert text.strip(), "Title element has no title: {}".format(
+                etree.tostring(title_elem, pretty_print=True)
+            )
+            root_dict["title"] = text.strip()
+
+        metadata_elem = root.find("metadata", root.nsmap)
+        if metadata_elem is not None:
+            root_dict["metadata"] = self.collect_metadata(metadata_elem)
+
+        children = []
+        for item in root.findall("item", root.nsmap):
+            children.append(self.walk_items(item))
+
+        if children:
+            root_dict["children"] = children
+
+        return root_dict
+
+    def derive_content_files_dict(self, resource_elem, resources_dict):
+        nsmap = resource_elem.nsmap
+        file_elements = resource_elem.findall("file", nsmap)
+        base = "./" + (
+            resource_elem.get("{http://www.w3.org/XML/1998/namespace}base") or ""
+        )
+        file_paths = [base + fe.get("href") for fe in file_elements]
+        dep_elements = resource_elem.findall("dependency", nsmap)
+        dep_paths = []
+        for de in dep_elements:
+            dre = resources_dict[de.get("identifierref")]
+            dep_paths.extend(self.derive_content_files_dict(dre, resources_dict))
+        return file_paths + dep_paths
+
+    def collect_resources(self, item, resources_dict):
+        if item.get("children"):
+            for child in item["children"]:
+                self.collect_resources(child, resources_dict)
+        elif item.get("identifierref"):
+            resource_elem = resources_dict[item["identifierref"]]
+
+            # Add all resource attrs to item dict
+            for key, value in resource_elem.items():
+                key_stripped = re.sub("^{.*}", "", key)  # Strip any namespace prefix
+                item[key_stripped] = value
+
+            if resource_elem.get("type") == "webcontent":
+                item["index_file"] = resource_elem.get("href")
+                item["files"] = self.derive_content_files_dict(
+                    resource_elem, resources_dict
+                )
+
+    def extract_metadata(self):
+        """Extract metadata and topic tree info from an IMSCP file.
+        Return a dict {'metadata': {...}, 'organizations': [list of topic dicts]}
+        """
+        manifest = self.get_manifest()
+
+        nsmap = manifest.nsmap
+
+        metadata_elem = manifest.find("metadata", nsmap)
+        metadata = {}
+        if metadata_elem is not None:
+            metadata = self.collect_metadata(metadata_elem)
+
+        resources_elem = manifest.find("resources", nsmap)
+        resources_dict = dict((r.get("identifier"), r) for r in resources_elem)
+
+        organizations = []
+        for org_elem in manifest.findall("organizations/organization", nsmap):
+            item_tree = self.walk_items(org_elem)
+            self.collect_resources(license, item_tree, resources_dict)
+            organizations.append(item_tree)
+
+        return {
+            "identifier": manifest.get("identifier"),
+            "metadata": metadata,
+            "organizations": organizations,
+        }
+
+
+class QTIZipFile(IMSCPZipFile):
+    def get_preset(self):
+        return format_presets.QTI_ZIP
+
+
 class H5PFile(DownloadFile):
     default_ext = file_formats.H5P
     allowed_formats = [file_formats.H5P]
diff --git a/ricecooker/classes/nodes.py b/ricecooker/classes/nodes.py
index efdd012c..acf4f721 100644
--- a/ricecooker/classes/nodes.py
+++ b/ricecooker/classes/nodes.py
@@ -1647,3 +1647,78 @@ def to_dict(self):
 
 # add alias for back-compatibility
 RemoteContentNode = StudioContentNode
+
+
+def get_by_path(obj, *args):
+    for arg in args:
+        obj = obj.get(arg, {})
+    if obj == {}:
+        return None
+    return obj
+
+
+QTI_PLACEHOLDER_SOURCE_ID = "A SOURCE ID THAT WILL BE REMOVED BEFORE UPLOAD"
+
+
+class QTINode(ContentNode):
+    """
+    Node representing QTI exercise
+    """
+
+    kind = content_kinds.EXERCISE
+
+    def __init__(
+        self, source_id=QTI_PLACEHOLDER_SOURCE_ID, title=None, license=None, **kwargs
+    ):
+        super(QTINode, self).__init__(source_id, title, license, **kwargs)
+        from .files import QTIZipFile
+
+        qti_files = [f for f in self.files if isinstance(f, QTIZipFile)]
+        qti_file = qti_files[0]
+        qti_file.process_file()
+        metadata = qti_file.extract_metadata()
+        self.source_id = (
+            metadata.get("identifer")
+            if self.source_id == QTI_PLACEHOLDER_SOURCE_ID
+            else self.source_id
+        )
+        if self.source_id is None:
+            raise InvalidNodeException(
+                "No source_id was provided and the QTI file {} does not have an identifier".format(
+                    qti_file.path
+                )
+            )
+        self.title = self.title or get_by_path(
+            metadata, "metadata", "general", "title", "string"
+        )
+        if self.title is None:
+            raise InvalidNodeException(
+                "No title was provided and the QTI file {} does not have a title".format(
+                    qti_file.path
+                )
+            )
+
+    def validate(self):
+        """validate: Makes sure QTI is valid
+        Args: None
+        Returns: boolean indicating if QTI is valid
+        """
+        from .files import QTIZipFile
+
+        try:
+            assert (
+                self.kind == content_kinds.EXERCISE
+            ), "Assumption Failed: Node should be an Exercise"
+            assert (
+                self.questions == []
+            ), "Assumption Failed: QTI should not have questions"
+            assert [
+                f for f in self.files if isinstance(f, QTIZipFile)
+            ], "Assumption Failed: QTI should have at least one QTI file"
+            return super(QTINode, self).validate()
+        except AssertionError as ae:
+            raise InvalidNodeException(
+                "Invalid node ({}): {} - {}".format(
+                    ae.args[0], self.title, self.__dict__
+                )
+            )
diff --git a/setup.py b/setup.py
index 1b84ac24..c6c05f5f 100644
--- a/setup.py
+++ b/setup.py
@@ -53,6 +53,7 @@
         "EbookLib>=0.17.1",
         "filetype>=1.1.0",
         "urllib3==2.2.1",
+        "xmltodict==0.13.0",
     ],
     python_requires=">=3.8, <3.11",
     license="MIT license",

From 58fb6fed225e94ed264ca0509f3c73abd2c056d2 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sun, 18 Feb 2024 16:49:12 -0800
Subject: [PATCH 2/7] Add utilities to map SCORM metadata to our own metadata
 schema where possible.

---
 ricecooker/utils/SCORM_metadata.py | 158 +++++++++++++++++++++++++++++
 tests/test_SCORM_metadata.py       |  88 ++++++++++++++++
 2 files changed, 246 insertions(+)
 create mode 100644 ricecooker/utils/SCORM_metadata.py
 create mode 100644 tests/test_SCORM_metadata.py

diff --git a/ricecooker/utils/SCORM_metadata.py b/ricecooker/utils/SCORM_metadata.py
new file mode 100644
index 00000000..bd1db071
--- /dev/null
+++ b/ricecooker/utils/SCORM_metadata.py
@@ -0,0 +1,158 @@
+"""
+Utilities for mapping from SCORM metadata to LE Utils metadata.
+"""
+from le_utils.constants.labels import learning_activities
+from le_utils.constants.labels import needs
+from le_utils.constants.labels import resource_type
+
+
+imscp_metadata_keys = {
+    "general": ["title", "description", "language", "keyword"],
+    "rights": [],
+    "educational": [
+        "interactivityType",
+        "interactivityLevel",
+        "learningResourceType",
+        "intendedEndUserRole",
+    ],
+    "lifecycle": [],
+}
+
+
+# Define mappings from SCORM educational types to LE Utils activity types
+SCORM_to_learning_activities_mappings = {
+    "exercise": learning_activities.PRACTICE,
+    "simulation": learning_activities.EXPLORE,
+    "questionnaire": learning_activities.PRACTICE,
+    "diagram": learning_activities.EXPLORE,
+    "figure": learning_activities.EXPLORE,
+    "graph": learning_activities.EXPLORE,
+    "index": learning_activities.READ,
+    "slide": learning_activities.READ,
+    "table": learning_activities.READ,
+    "narrative text": learning_activities.READ,
+    "exam": learning_activities.PRACTICE,
+    "experiment": learning_activities.EXPLORE,
+    "problem statement": learning_activities.REFLECT,
+    "self assessment": learning_activities.REFLECT,
+    "lecture": learning_activities.WATCH,
+}
+
+
+def map_scorm_to_le_utils_activities(scorm_dict):
+    le_utils_activities = []
+
+    # Adjustments based on interactivity
+    interactive_adjustments = {
+        learning_activities.EXPLORE: (
+            learning_activities.READ,
+            learning_activities.WATCH,
+        )
+    }
+
+    # Determine the interactivity level and type
+    interactivity_type = scorm_dict.get("educational", {}).get("interactivityType")
+    interactivity_level = scorm_dict.get("educational", {}).get("interactivityLevel")
+
+    is_interactive = (
+        interactivity_type
+        in [
+            "active",
+            "mixed",
+        ]
+        or interactivity_level in ["medium", "high"]
+    )
+
+    # Extract the learning resource types from the SCORM data
+    learning_resource_types = scorm_dict.get("educational", {}).get(
+        "learningResourceType", []
+    )
+
+    # Map each SCORM type to an LE Utils activity type
+    for learning_resource_type in learning_resource_types:
+        le_utils_type = SCORM_to_learning_activities_mappings.get(
+            learning_resource_type
+        )
+        # Adjust based on interactivity
+        if not is_interactive and le_utils_type in interactive_adjustments:
+            le_utils_type = (
+                interactive_adjustments[le_utils_type][0]
+                if learning_resource_type == "simulation"
+                else interactive_adjustments[le_utils_type][1]
+            )
+
+        if le_utils_type and le_utils_type not in le_utils_activities:
+            le_utils_activities.append(le_utils_type)
+
+    return le_utils_activities
+
+
+# Define mappings from SCORM educational types to educator-focused resource types
+SCORM_to_resource_type_mappings = {
+    "exercise": resource_type.EXERCISE,
+    "simulation": resource_type.ACTIVITY,
+    "questionnaire": resource_type.ACTIVITY,
+    "diagram": resource_type.MEDIA,
+    "figure": resource_type.MEDIA,
+    "graph": resource_type.MEDIA,
+    "index": resource_type.GUIDE,
+    "slide": resource_type.LESSON,
+    "table": resource_type.TUTORIAL,
+    "narrative text": resource_type.TEXTBOOK,
+    "exam": resource_type.EXERCISE,
+    "experiment": resource_type.ACTIVITY,
+    "problem statement": resource_type.ACTIVITY,
+    "self assessment": resource_type.ACTIVITY,
+    "lecture": resource_type.LESSON,
+}
+
+
+# Mapping for intendedEndUserRole when the resource is for educators
+SCORM_intended_role_to_resource_type_mapping = {
+    "teacher": resource_type.LESSON_PLAN,
+    "author": resource_type.GUIDE,
+    "manager": resource_type.GUIDE,
+}
+
+
+def map_scorm_to_educator_resource_types(scorm_dict):
+    educator_resource_types = []
+
+    # Extract the learning resource types and intended end user role from the SCORM data
+    learning_resource_types = scorm_dict.get("educational", {}).get(
+        "learningResourceType", []
+    )
+    intended_roles = scorm_dict.get("educational", {}).get("intendedEndUserRole", [])
+
+    # Map each SCORM type to an educator-focused resource type
+    for learning_resource_type in learning_resource_types:
+        mapped_type = SCORM_to_resource_type_mappings.get(learning_resource_type)
+        if mapped_type and mapped_type not in educator_resource_types:
+            educator_resource_types.append(mapped_type)
+
+    # Check if the intended end user role matches any educator roles
+    for role in intended_roles:
+        if (
+            role in SCORM_intended_role_to_resource_type_mapping
+            and SCORM_intended_role_to_resource_type_mapping[role]
+            not in educator_resource_types
+        ):
+            educator_resource_types.append(
+                SCORM_intended_role_to_resource_type_mapping[role]
+            )
+
+    return educator_resource_types
+
+
+def infer_beginner_level_from_difficulty(scorm_dict):
+    # Beginner difficulty levels
+    beginner_difficulties = {"very easy", "easy"}
+
+    educational_metadata = scorm_dict.get("educational", {})
+
+    # Check if the difficulty level indicates beginner content
+    difficulty = educational_metadata.get("difficulty")
+    if difficulty in beginner_difficulties:
+        return [needs.FOR_BEGINNERS]
+
+    return []
diff --git a/tests/test_SCORM_metadata.py b/tests/test_SCORM_metadata.py
new file mode 100644
index 00000000..297e6416
--- /dev/null
+++ b/tests/test_SCORM_metadata.py
@@ -0,0 +1,88 @@
+import pytest
+from le_utils.constants.labels import learning_activities
+from le_utils.constants.labels import needs
+from le_utils.constants.labels import resource_type
+
+from ricecooker.utils.SCORM_metadata import infer_beginner_level_from_difficulty
+from ricecooker.utils.SCORM_metadata import map_scorm_to_educator_resource_types
+from ricecooker.utils.SCORM_metadata import map_scorm_to_le_utils_activities
+
+
+@pytest.mark.parametrize(
+    "scorm_dict, expected_result",
+    [
+        (
+            {
+                "educational": {
+                    "interactivityType": "active",
+                    "interactivityLevel": "high",
+                    "learningResourceType": ["exercise", "simulation"],
+                }
+            },
+            [learning_activities.PRACTICE, learning_activities.EXPLORE],
+        ),
+        (
+            {"educational": {"learningResourceType": ["lecture", "self assessment"]}},
+            [learning_activities.REFLECT, learning_activities.WATCH],
+        ),
+        (
+            {
+                "educational": {
+                    "interactivityType": "mixed",
+                    "interactivityLevel": "medium",
+                    "learningResourceType": ["simulation", "graph"],
+                }
+            },
+            [learning_activities.EXPLORE],
+        ),
+        (
+            {
+                "educational": {
+                    "interactivityType": "expositive",
+                    "interactivityLevel": "low",
+                    "learningResourceType": ["simulation", "graph"],
+                }
+            },
+            [learning_activities.READ, learning_activities.WATCH],
+        ),
+    ],
+)
+def test_map_scorm_to_le_utils_activities(scorm_dict, expected_result):
+    assert set(map_scorm_to_le_utils_activities(scorm_dict)) == set(expected_result)
+
+
+@pytest.mark.parametrize(
+    "scorm_dict, expected_result",
+    [
+        (
+            {
+                "educational": {
+                    "learningResourceType": ["exercise", "lecture"],
+                    "intendedEndUserRole": ["teacher"],
+                }
+            },
+            [resource_type.EXERCISE, resource_type.LESSON, resource_type.LESSON_PLAN],
+        ),
+        (
+            {
+                "educational": {
+                    "learningResourceType": ["simulation", "figure"],
+                    "intendedEndUserRole": ["author"],
+                }
+            },
+            [resource_type.ACTIVITY, resource_type.MEDIA, resource_type.GUIDE],
+        ),
+    ],
+)
+def test_map_scorm_to_educator_resource_types(scorm_dict, expected_result):
+    assert set(map_scorm_to_educator_resource_types(scorm_dict)) == set(expected_result)
+
+
+def test_infer_beginner_level_from_difficulty():
+    scorm_dict_easy = {"educational": {"difficulty": "easy"}}
+    assert infer_beginner_level_from_difficulty(scorm_dict_easy) == [
+        needs.FOR_BEGINNERS
+    ]
+
+    scorm_dict_hard = {"educational": {"difficulty": "difficult"}}
+    assert infer_beginner_level_from_difficulty(scorm_dict_hard) == []

From d13181b946c09dc70dec741bce1bda5a083e69c0 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sun, 18 Feb 2024 16:52:30 -0800
Subject: [PATCH 3/7] Update IMSCP file parsing to more fully parse metadata
 and external metadata files.

---
 ricecooker/classes/files.py                   | 135 +++++++---
 tests/test_files.py                           | 206 ++++++++++++++++
 ...mplete_manifest_with_external_metadata.xml | 102 ++++++++
 .../ims_xml/metadata_hummingbirds_course.xml  | 232 ++++++++++++++++++
 .../metadata_hummingbirds_organization.xml    |  14 ++
 .../samples/ims_xml/simple_manifest.xml       |  63 +++++
 6 files changed, 716 insertions(+), 36 deletions(-)
 create mode 100644 tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
 create mode 100644 tests/testcontent/samples/ims_xml/metadata_hummingbirds_course.xml
 create mode 100644 tests/testcontent/samples/ims_xml/metadata_hummingbirds_organization.xml
 create mode 100644 tests/testcontent/samples/ims_xml/simple_manifest.xml

diff --git a/ricecooker/classes/files.py b/ricecooker/classes/files.py
index b5a997d8..2d9d1c33 100644
--- a/ricecooker/classes/files.py
+++ b/ricecooker/classes/files.py
@@ -42,6 +42,7 @@
 from ricecooker.utils.images import create_image_from_zip
 from ricecooker.utils.images import create_tiled_image
 from ricecooker.utils.images import ThumbnailGenerationError
+from ricecooker.utils.SCORM_metadata import imscp_metadata_keys
 from ricecooker.utils.subtitles import build_subtitle_converter_from_file
 from ricecooker.utils.subtitles import InvalidSubtitleFormatError
 from ricecooker.utils.subtitles import InvalidSubtitleLanguageError
@@ -750,9 +751,6 @@ def get_preset(self):
         return self.preset or format_presets.EPUB
 
 
-no_index_zips = set([format_presets.HTML5_DEPENDENCY_ZIP, format_presets.QTI_ZIP])
-
-
 class HTMLZipFile(DownloadFile):
     default_ext = file_formats.HTML5
     allowed_formats = [file_formats.HTML5]
@@ -765,8 +763,8 @@ def process_file(self):
         self.filename = super(HTMLZipFile, self).process_file()
         if self.filename:
             try:
-                # make sure index.html exists unless this is a dependency file or IMS content package
-                if self.get_preset() not in no_index_zips:
+                # make sure index.html exists unless this is a dependency file
+                if self.get_preset() != format_presets.HTML5_DEPENDENCY_ZIP:
                     with zipfile.ZipFile(config.get_storage_path(self.filename)) as zf:
                         _ = zf.getinfo("index.html")
             except KeyError as err:
@@ -776,7 +774,38 @@ def process_file(self):
         return self.filename
 
 
-class IMSCPZipFile(HTMLZipFile):
+def denest_xml_value(value, preferred_language):
+    if isinstance(value, dict):
+        # Handle the 'string' -> '#text' nested structure
+        if "string" in value:
+            return denest_xml_value(value["string"], preferred_language)
+        elif "langstring" in value:
+            return denest_xml_value(value["langstring"], preferred_language)
+        # Handle other simple text and key-value pairs
+        elif "#text" in value:
+            return value["#text"]
+        elif "value" in value:
+            return value["value"]
+    elif isinstance(value, list):
+        try:
+            return next(
+                denest_xml_value(item, preferred_language)
+                for item in value
+                if item.get("@language", "").startswith(preferred_language)
+            )
+        except StopIteration:
+            return [denest_xml_value(item, preferred_language) for item in value]
+    return value
+
+
+class IMSCPZipFile(DownloadFile):
+    default_ext = file_formats.HTML5
+    allowed_formats = [file_formats.HTML5]
+    is_primary = True
+
+    def get_preset(self):
+        return self.preset or format_presets.IMSCP_ZIP
+
     def strip_ns_prefix(self, tree):
         """Strip namespace prefixes from an LXML tree.
         From https://stackoverflow.com/a/30233635
@@ -784,33 +813,70 @@ def strip_ns_prefix(self, tree):
         for element in tree.xpath("descendant-or-self::*[namespace-uri()!='']"):
             element.tag = etree.QName(element).localname
 
-    def strip_langstring(self, tree):
-        """Replace all langstring elements with their text value."""
-        for ls in tree.xpath(".//langstring"):
-            ls.tail = ls.text + ls.tail if ls.tail else ls.text
-        etree.strip_elements(tree, "langstring", with_tail=False)
+    def _get_elem_for_tag(self, root, tag):
+        elem = root.find("lom/%s" % tag)
+        if elem is not None:
+            return elem
+        return root.find(tag)
 
-    def collect_metadata(self, metadata_elem):
-        self.strip_ns_prefix(metadata_elem)
-        self.strip_langstring(metadata_elem)
+    def collect_metadata(self, root):
         metadata_dict = {}
 
-        for tag in ("general", "rights", "educational", "lifecycle"):
-            elem = metadata_elem.find("lom/%s" % tag)
-            if elem:
-                metadata_dict.update(xmltodict.parse(etree.tostring(elem)))
+        metadata_elem = root.find("metadata", root.nsmap)
+
+        if metadata_elem is None:
+            return metadata_dict
+
+        # Check for external metadata reference
+        external_metadata_ref = metadata_elem.find(
+            "adlcp:location",
+            namespaces={"adlcp": "http://www.adlnet.org/xsd/adlcp_v1p3"},
+        )
+        if external_metadata_ref is not None:
+            # External metadata file path
+            external_file_path = external_metadata_ref.text
+            with self.open_zip() as zip_file:
+                with zip_file.open(external_file_path) as external_file:
+                    metadata_elem = etree.parse(external_file).getroot()
+
+        self.strip_ns_prefix(metadata_elem)
+        preferred_language = self.language
+
+        if preferred_language is None:
+            elem = self._get_elem_for_tag(metadata_elem, "general")
+            if elem is not None:
+                values = xmltodict.parse(etree.tostring(elem))
+                if "language" in values["general"]:
+                    preferred_language = denest_xml_value(
+                        values["general"]["language"], None
+                    )
+
+        for tag, fields in imscp_metadata_keys.items():
+            elem = self._get_elem_for_tag(metadata_elem, tag)
+            if elem is not None:
+                values = xmltodict.parse(etree.tostring(elem))
+                for field in fields:
+                    if field in values[tag]:
+                        metadata_dict[field] = denest_xml_value(
+                            values[tag][field], preferred_language
+                        )
 
         return metadata_dict
 
+    @contextmanager
+    def open_zip(self):
+        with zipfile.ZipFile(config.get_storage_path(self.get_filename())) as zf:
+            yield zf
+
     def get_manifest(self):
-        with zipfile.ZipFile(config.get_storage_path(self.filename)) as zf:
-            manifest_file = zf.open("imsmanifest.xml")
+        with self.open_zip() as zf:
             try:
-                return etree.parse(manifest_file).getroot()
+                with zf.open("imsmanifest.xml") as manifest_file:
+                    return etree.parse(manifest_file).getroot()
             except etree.XMLSyntaxError:
                 # we've run across XML files that are marked as UTF-8 encoded but which have non-UTF-8 characters in them
                 # for this case, detect the 'real' encoding and decode it as unicode, then make it actual UTF-8 and parse.
-                f = zf.open("imsmanifest.xml", "rb")
+                f = zf.open("imsmanifest.xml", "r")
                 data = f.read()
                 f.close()
 
@@ -836,9 +902,7 @@ def walk_items(self, root):
             )
             root_dict["title"] = text.strip()
 
-        metadata_elem = root.find("metadata", root.nsmap)
-        if metadata_elem is not None:
-            root_dict["metadata"] = self.collect_metadata(metadata_elem)
+        root_dict["metadata"] = self.collect_metadata(root)
 
         children = []
         for item in root.findall("item", root.nsmap):
@@ -852,9 +916,7 @@ def walk_items(self, root):
     def derive_content_files_dict(self, resource_elem, resources_dict):
         nsmap = resource_elem.nsmap
         file_elements = resource_elem.findall("file", nsmap)
-        base = "./" + (
-            resource_elem.get("{http://www.w3.org/XML/1998/namespace}base") or ""
-        )
+        base = resource_elem.get("{http://www.w3.org/XML/1998/namespace}base") or ""
         file_paths = [base + fe.get("href") for fe in file_elements]
         dep_elements = resource_elem.findall("dependency", nsmap)
         dep_paths = []
@@ -873,10 +935,11 @@ def collect_resources(self, item, resources_dict):
             # Add all resource attrs to item dict
             for key, value in resource_elem.items():
                 key_stripped = re.sub("^{.*}", "", key)  # Strip any namespace prefix
-                item[key_stripped] = value
+                # Don't overwrite existing keys
+                if key_stripped not in item:
+                    item[key_stripped] = value
 
             if resource_elem.get("type") == "webcontent":
-                item["index_file"] = resource_elem.get("href")
                 item["files"] = self.derive_content_files_dict(
                     resource_elem, resources_dict
                 )
@@ -889,10 +952,10 @@ def extract_metadata(self):
 
         nsmap = manifest.nsmap
 
-        metadata_elem = manifest.find("metadata", nsmap)
-        metadata = {}
-        if metadata_elem is not None:
-            metadata = self.collect_metadata(metadata_elem)
+        metadata = self.collect_metadata(manifest)
+
+        if self.language is None and metadata.get("language"):
+            self.set_language(metadata.get("language"))
 
         resources_elem = manifest.find("resources", nsmap)
         resources_dict = dict((r.get("identifier"), r) for r in resources_elem)
@@ -900,7 +963,7 @@ def extract_metadata(self):
         organizations = []
         for org_elem in manifest.findall("organizations/organization", nsmap):
             item_tree = self.walk_items(org_elem)
-            self.collect_resources(license, item_tree, resources_dict)
+            self.collect_resources(item_tree, resources_dict)
             organizations.append(item_tree)
 
         return {
@@ -912,7 +975,7 @@ def extract_metadata(self):
 
 class QTIZipFile(IMSCPZipFile):
     def get_preset(self):
-        return format_presets.QTI_ZIP
+        return self.preset or format_presets.QTI_ZIP
 
 
 class H5PFile(DownloadFile):
diff --git a/tests/test_files.py b/tests/test_files.py
index 9c23da2e..fadbd4f8 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -2,6 +2,8 @@
 import hashlib
 import os.path
 import tempfile
+import zipfile
+from contextlib import contextmanager
 from shutil import copyfile
 
 import pytest
@@ -10,6 +12,7 @@
 
 from ricecooker import config
 from ricecooker.classes.files import _get_language_with_alpha2_fallback
+from ricecooker.classes.files import IMSCPZipFile
 from ricecooker.classes.files import is_youtube_subtitle_file_supported_language
 from ricecooker.classes.files import SubtitleFile
 from ricecooker.classes.files import YouTubeSubtitleFile
@@ -631,3 +634,206 @@ def test_convertible_substitles_weirdext_subtitlesformat():
         assert (
             "El total de los protones y neutrones de un átomo" in filecontents
         ), "missing check words in converted subs"
+
+
+@contextmanager
+def create_zip_with_manifest(manifest_filename, *additional_files):
+    # Create a temporary file for the zipfile
+    with tempfile.NamedTemporaryFile(suffix=".zip", delete=True) as temp_zip:
+        # Define the paths
+        base_path = os.path.dirname(os.path.abspath(__file__))
+        source_folder = os.path.join(base_path, "testcontent", "samples", "ims_xml")
+        manifest_file = os.path.join(source_folder, manifest_filename)
+
+        # Create the zipfile and add the manifest file
+        with zipfile.ZipFile(temp_zip.name, "w") as zf:
+            zf.write(manifest_file, "imsmanifest.xml")
+            for additional_file in additional_files:
+                zf.write(os.path.join(source_folder, additional_file), additional_file)
+
+        yield temp_zip.name
+
+    # Clean up the zipfile
+    try:
+        os.remove(temp_zip.name)
+    except FileNotFoundError:
+        pass
+
+
+def test_extract_metadata_from_simple_manifest():
+    with create_zip_with_manifest("simple_manifest.xml") as zip_path:
+        imscp_file = IMSCPZipFile(zip_path)
+        assert imscp_file.extract_metadata() == {
+            "identifier": None,
+            "metadata": {
+                "description": "Example of test file",
+                "language": "en",
+                "title": "Test File",
+            },
+            "organizations": [
+                {
+                    "children": [
+                        {
+                            "files": [],
+                            "href": "file1.html",
+                            "identifier": "file1Ref",
+                            "identifierref": "file1Ref",
+                            "title": "Test File1",
+                            "type": "webcontent",
+                            "metadata": {},
+                        },
+                        {
+                            "files": [],
+                            "href": "file2.html",
+                            "identifier": "file2Ref",
+                            "identifierref": "file2Ref",
+                            "title": "Test File2",
+                            "type": "webcontent",
+                            "metadata": {},
+                        },
+                        {
+                            "children": [
+                                {
+                                    "files": [],
+                                    "href": "file1.html",
+                                    "identifier": "file1Ref",
+                                    "identifierref": "file1Ref",
+                                    "metadata": {},
+                                    "title": "Test File1 Nested",
+                                    "type": "webcontent",
+                                },
+                                {
+                                    "files": [],
+                                    "href": "file2.html",
+                                    "identifier": "file2Ref",
+                                    "identifierref": "file2Ref",
+                                    "metadata": {},
+                                    "title": "Test File2 Nested",
+                                    "type": "webcontent",
+                                },
+                            ],
+                            "identifier": "folder2",
+                            "metadata": {},
+                            "title": "Folder 2",
+                        },
+                        {
+                            "children": [
+                                {
+                                    "files": [],
+                                    "href": "file3.html",
+                                    "identifier": "file3Ref",
+                                    "identifierref": "file3Ref",
+                                    "metadata": {},
+                                    "title": "Test File3 Nested",
+                                    "type": "webcontent",
+                                },
+                                {
+                                    "files": [],
+                                    "href": "file4.html",
+                                    "identifier": "file4Ref",
+                                    "identifierref": "file4Ref",
+                                    "metadata": {},
+                                    "title": "Test File4 Nested",
+                                    "type": "webcontent",
+                                },
+                            ],
+                            "identifier": "folder3",
+                            "metadata": {},
+                            "title": "Folder 3",
+                        },
+                    ],
+                    "title": "Folder 1",
+                    "metadata": {},
+                },
+                {
+                    "children": [
+                        {
+                            "files": [],
+                            "href": "file3.html",
+                            "identifier": "file3Ref",
+                            "identifierref": "file3Ref",
+                            "title": "Test File3",
+                            "type": "webcontent",
+                            "metadata": {},
+                        },
+                        {
+                            "files": [],
+                            "href": "file4.html",
+                            "identifier": "file4Ref",
+                            "identifierref": "file4Ref",
+                            "title": "Test File4",
+                            "type": "webcontent",
+                            "metadata": {},
+                        },
+                    ],
+                    "title": "Folder 4",
+                    "metadata": {},
+                },
+            ],
+        }
+
+
+def test_extract_metadata_from_complex_manifest():
+    # Additional metadata files are passed as arguments to the context manager
+    with create_zip_with_manifest(
+        "complete_manifest_with_external_metadata.xml",
+        "metadata_hummingbirds_course.xml",
+        "metadata_hummingbirds_organization.xml",
+    ) as zip_path:
+        imscp_file = IMSCPZipFile(zip_path)
+        assert imscp_file.extract_metadata() == {
+            "identifier": "com.example.hummingbirds.contentpackaging.metadata.2024",
+            "metadata": {
+                "description": "An engaging overview of hummingbirds, covering their biology, behavior, and habitats. This course is designed for enthusiasts and bird watchers of all levels.",  # noqa E501
+                "language": "en",
+                "title": "Discovering Hummingbirds",
+                "keyword": ["hummingbirds", "bird watching", "ornithology"],
+                "intendedEndUserRole": "learner",
+                "interactivityLevel": "medium",
+                "learningResourceType": ["documentary", "interactive lesson"],
+            },
+            "organizations": [
+                {
+                    "title": "Discovering Hummingbirds",
+                    "identifier": "hummingbirds_default_org",
+                    "metadata": {
+                        "description": "This metadata provides information about the structure and organization of the Discovering Hummingbirds course. The course is organized in a hierarchical manner, guiding learners from basic concepts to more detailed aspects of hummingbird biology and behavior.",  # noqa E501
+                    },
+                    "children": [
+                        {
+                            "title": "Introduction to Hummingbirds",
+                            "metadata": {
+                                "description": "Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.",
+                            },
+                            "identifier": "item1",
+                            "identifierref": "resource1",
+                            "href": "intro.html",
+                            "type": "webcontent",
+                            "files": ["intro.html", "shared/style.css"],
+                        },
+                        {
+                            "title": "Hummingbird Habitats",
+                            "metadata": {
+                                "description": "Learn about various habitats of hummingbirds, from tropical jungles to backyard gardens.",
+                            },
+                            "identifier": "item2",
+                            "identifierref": "resource2",
+                            "href": "habitats.html",
+                            "type": "webcontent",
+                            "files": ["habitats.html", "shared/image1.jpg"],
+                        },
+                        {
+                            "title": "Hummingbird Species",
+                            "metadata": {
+                                "description": "Discover the diversity of hummingbird species and their unique adaptations.",
+                            },
+                            "identifier": "item3",
+                            "identifierref": "resource3",
+                            "href": "species.html",
+                            "type": "webcontent",
+                            "files": ["species.html", "shared/image2.jpg"],
+                        },
+                    ],
+                },
+            ],
+        }
diff --git a/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml b/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
new file mode 100644
index 00000000..98afd161
--- /dev/null
+++ b/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
@@ -0,0 +1,102 @@
+<?xml version="1.0" standalone="no" ?>
+<!--
+Hummingbirds Exploration Course Manifest Example
+
+This example demonstrates the use of metadata, including in-line and
+external metadata references. The external course-level metadata contains a comprehensive
+example of proper metadata element usage.
+-->
+<manifest identifier="com.example.hummingbirds.contentpackaging.metadata.2024" version="1"
+          xmlns="http://www.imsglobal.org/xsd/imscp_v1p1"
+          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+          xmlns:adlcp="http://www.adlnet.org/xsd/adlcp_v1p3"
+          xmlns:adlseq="http://www.adlnet.org/xsd/adlseq_v1p3"
+          xmlns:adlnav="http://www.adlnet.org/xsd/adlnav_v1p3"
+          xmlns:imsss="http://www.imsglobal.org/xsd/imsss"
+          xmlns:lom="http://ltsc.ieee.org/xsd/LOM"
+          xsi:schemaLocation="http://www.imsglobal.org/xsd/imscp_v1p1 imscp_v1p1.xsd
+                              http://www.adlnet.org/xsd/adlcp_v1p3 adlcp_v1p3.xsd
+                              http://www.adlnet.org/xsd/adlseq_v1p3 adlseq_v1p3.xsd
+                              http://www.adlnet.org/xsd/adlnav_v1p3 adlnav_v1p3.xsd
+                              http://www.imsglobal.org/xsd/imsss imsss_v1p0.xsd
+                              http://ltsc.ieee.org/xsd/LOM lom.xsd">
+
+  <metadata>
+		<schema>ADL SCORM</schema>
+		<schemaversion>2004 3rd Edition</schemaversion>
+    <!-- External metadata file for the course -->
+    <adlcp:location>metadata_hummingbirds_course.xml</adlcp:location>
+	</metadata>
+
+	<organizations default="hummingbirds_default_org">
+        <organization identifier="hummingbirds_default_org">
+            <title>Discovering Hummingbirds</title>
+            <!-- Metadata about this organization -->
+            <metadata>
+                <adlcp:location>metadata_hummingbirds_organization.xml</adlcp:location>
+            </metadata>
+
+            <!-- Item 1 -->
+            <item identifier="item1" identifierref="resource1">
+                <title>Introduction to Hummingbirds</title>
+                <metadata>
+                    <lom>
+                        <general>
+                            <description>
+                                <string language="en-US">Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.</string>
+                            </description>
+                        </general>
+                    </lom>
+                </metadata>
+            </item>
+
+            <!-- Item 2 -->
+            <item identifier="item2" identifierref="resource2">
+                <title>Hummingbird Habitats</title>
+                <metadata>
+                    <lom>
+                        <general>
+                            <description>
+                                <string language="en-US">Learn about various habitats of hummingbirds, from tropical jungles to backyard gardens.</string>
+                            </description>
+                        </general>
+                    </lom>
+                </metadata>
+            </item>
+
+            <!-- Item 3 -->
+            <item identifier="item3" identifierref="resource3">
+                <title>Hummingbird Species</title>
+                <metadata>
+                    <lom>
+                        <general>
+                            <description>
+                                <string language="en-US">Discover the diversity of hummingbird species and their unique adaptations.</string>
+                            </description>
+                        </general>
+                    </lom>
+                </metadata>
+            </item>
+        </organization>
+	</organizations>
+
+    <resources>
+        <!-- Resources for Item 1 -->
+        <resource identifier="resource1" type="webcontent" href="intro.html">
+            <file href="intro.html"/>
+            <file href="shared/style.css"/>
+        </resource>
+
+        <!-- Resources for Item 2 -->
+        <resource identifier="resource2" type="webcontent" href="habitats.html">
+            <file href="habitats.html"/>
+            <file href="shared/image1.jpg"/>
+        </resource>
+
+        <!-- Resources for Item 3 -->
+        <resource identifier="resource3" type="webcontent" href="species.html">
+            <file href="species.html"/>
+            <file href="shared/image2.jpg"/>
+        </resource>
+    </resources>
+</manifest>
diff --git a/tests/testcontent/samples/ims_xml/metadata_hummingbirds_course.xml b/tests/testcontent/samples/ims_xml/metadata_hummingbirds_course.xml
new file mode 100644
index 00000000..cb3c7346
--- /dev/null
+++ b/tests/testcontent/samples/ims_xml/metadata_hummingbirds_course.xml
@@ -0,0 +1,232 @@
+<?xml version="1.0" ?>
+<lom xmlns="http://ltsc.ieee.org/xsd/LOM" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://ltsc.ieee.org/xsd/LOM lom.xsd">
+  <general>
+    <identifier>
+      <catalog>URI</catalog>
+      <entry>com.example.hummingbirds.contentpackaging.metadata.2024</entry>
+    </identifier>
+    <title>
+      <string language="en-US">Discovering Hummingbirds</string>
+      <string language="es">Descubriendo Colibríes</string>
+    </title>
+    <language>en</language>
+    <description>
+      <string language="en-US">An engaging overview of hummingbirds, covering their biology, behavior, and habitats. This course is designed for enthusiasts and bird watchers of all levels.</string>
+    </description>
+    <keyword>
+      <string language="en-US">hummingbirds</string>
+    </keyword>
+    <keyword>
+      <string language="en-US">bird watching</string>
+    </keyword>
+    <keyword>
+      <string language="en-US">ornithology</string>
+    </keyword>
+    <coverage>
+      <string language="en-US">Global range, with a focus on the Americas.</string>
+    </coverage>
+    <structure>
+      <source>LOMv1.0</source>
+      <value>hierarchical</value>
+    </structure>
+    <aggregationLevel>
+      <source>LOMv1.0</source>
+      <value>2</value>
+    </aggregationLevel>
+  </general>
+
+  <lifeCycle>
+    <version>
+      <string language="en-US">1.0</string>
+    </version>
+    <status>
+      <source>LOMv1.0</source>
+      <value>final</value>
+    </status>
+    <contribute>
+      <role>
+        <source>LOMv1.0</source>
+        <value>publisher</value>
+      </role>
+      <entity>BEGIN:VCARD
+VERSION:2.1
+FN:John Doe
+ORG:Example Organization
+TEL;WORK;VOICE:(555) 123-4567
+ADR;WORK:;;1234 Main St;Anytown;AN;12345;Country
+EMAIL;PREF;INTERNET:john.doe@example.com
+END:VCARD
+      </entity>
+      <date>
+        <dateTime>2024-01-01</dateTime>
+        <description>
+          <string language="en-US">Initial publication date of the course.</string>
+        </description>
+      </date>
+    </contribute>
+  </lifeCycle>
+
+  <metaMetadata>
+    <identifier>
+      <catalog>URI</catalog>
+      <entry>com.example.hummingbirds.metadata.2024</entry>
+    </identifier>
+    <metadataSchema>LOMv1.0</metadataSchema>
+    <metadataSchema>SCORM_CAM_v1.3</metadataSchema>
+    <language>en-us</language>
+  </metaMetadata>
+
+  <technical>
+    <format>text/html</format>
+    <format>image/jpeg</format>
+    <format>video/mp4</format>
+    <size>1048576</size>
+    <location>http://www.example.com/hummingbirds</location>
+    <requirement>
+      <orComposite>
+        <type>
+          <source>LOMv1.0</source>
+          <value>browser</value>
+        </type>
+        <name>
+          <source>LOMv1.0</source>
+          <value>any</value>
+        </name>
+      </orComposite>
+    </requirement>
+    <installationRemarks>
+      <string language="en-us">No specific installation requirements, accessible via standard web browsers.</string>
+    </installationRemarks>
+    <otherPlatformRequirements>
+      <string language="en-us">Optimized for both desktop and mobile platforms.</string>
+    </otherPlatformRequirements>
+    <duration>
+      <duration>P1H</duration>
+      <description>
+        <string language="en-us">The average time to complete the course is approximately 1 hour.</string>
+      </description>
+    </duration>
+  </technical>
+
+  <educational>
+    <learningResourceType>
+      <source>LOMv1.0</source>
+      <value>documentary</value>
+    </learningResourceType>
+    <learningResourceType>
+      <source>LOMv1.0</source>
+      <value>interactive lesson</value>
+    </learningResourceType>
+    <interactivityLevel>
+      <source>LOMv1.0</source>
+      <value>medium</value>
+    </interactivityLevel>
+    <semanticDensity>
+      <source>LOMv1.0</source>
+      <value>medium</value>
+    </semanticDensity>
+    <intendedEndUserRole>
+      <source>LOMv1.0</source>
+      <value>learner</value>
+    </intendedEndUserRole>
+    <context>
+      <source>LOMv1.0</source>
+      <value>training</value>
+    </context>
+    <typicalAgeRange>
+      <string language="en-us">Age 12 and up</string>
+    </typicalAgeRange>
+    <difficulty>
+      <source>LOMv1.0</source>
+      <value>easy</value>
+    </difficulty>
+    <typicalLearningTime>
+      <duration>P1H</duration>
+      <description>
+        <string language="en-us">Approximately 1 hour is needed to complete the course.</string>
+      </description>
+    </typicalLearningTime>
+    <description>
+      <string language="en-us">This course provides an introduction to hummingbirds, suitable for beginners in ornithology and bird watching.</string>
+    </description>
+    <language>en-us</language>
+  </educational>
+
+  <rights>
+    <cost>
+      <source>LOMv1.0</source>
+      <value>no</value>
+    </cost>
+    <copyrightAndOtherRestrictions>
+      <source>LOMv1.0</source>
+      <value>yes</value>
+    </copyrightAndOtherRestrictions>
+    <description>
+      <string>Content is protected under Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.</string>
+    </description>
+  </rights>
+
+  <relation>
+    <kind>
+      <source>LOMv1.0</source>
+      <value>ispartof</value>
+    </kind>
+    <resource>
+      <identifier>
+        <catalog>URI</catalog>
+        <entry>com.example.biologyseries</entry>
+      </identifier>
+      <description>
+        <string language="en-us">Part of a larger series on biology and wildlife.</string>
+      </description>
+    </resource>
+  </relation>
+
+  <annotation>
+    <entity>BEGIN:VCARD
+VERSION:2.1
+FN:Jane Smith
+ORG:Example Organization
+TEL;WORK;VOICE:(555) 987-6543
+ADR;WORK:;;4321 Side St;Anytown;AN;54321;Country
+EMAIL;PREF;INTERNET:jane.smith@example.com
+END:VCARD
+    </entity>
+    <date>
+      <dateTime>2024-02-01</dateTime>
+      <description>
+        <string language="en-us">Annotation added to provide additional insights into course content.</string>
+      </description>
+    </date>
+    <description>
+      <string language="en-us">This course includes interactive elements to enhance learner engagement with the subject of hummingbirds.</string>
+    </description>
+  </annotation>
+
+  <classification>
+    <purpose>
+      <source>LOMv1.0</source>
+      <value>discipline</value>
+    </purpose>
+    <taxonPath>
+      <source>
+        <string language="en-us">Example Organization's catalog of biology courses</string>
+      </source>
+      <taxon>
+        <id>ornithology_basics</id>
+        <entry>
+          <string language="en-us">Basic Ornithology</string>
+        </entry>
+      </taxon>
+    </taxonPath>
+    <description>
+      <string language="en-US">This course serves as an introductory module in the Basic Ornithology series, focusing on hummingbirds.</string>
+    </description>
+    <keyword>
+      <string language="en-US">biology</string>
+    </keyword>
+    <keyword>
+      <string language="en-US">wildlife</string>
+    </keyword>
+  </classification>
+</lom>
diff --git a/tests/testcontent/samples/ims_xml/metadata_hummingbirds_organization.xml b/tests/testcontent/samples/ims_xml/metadata_hummingbirds_organization.xml
new file mode 100644
index 00000000..87018dfa
--- /dev/null
+++ b/tests/testcontent/samples/ims_xml/metadata_hummingbirds_organization.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" ?>
+<!-- An example of organization-level metadata for the Discovering Hummingbirds course.-->
+<lom xmlns="http://ltsc.ieee.org/xsd/LOM" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://ltsc.ieee.org/xsd/LOM lom.xsd">
+  <general>
+    <description>
+      <string language="en-US">This metadata provides information about the structure and organization of the Discovering Hummingbirds course. The course is organized in a hierarchical manner, guiding learners from basic concepts to more detailed aspects of hummingbird biology and behavior.</string>
+    </description>
+
+    <structure>
+      <source>LOMv1.0</source>
+      <value>hierarchical</value>
+    </structure>
+  </general>
+</lom>
diff --git a/tests/testcontent/samples/ims_xml/simple_manifest.xml b/tests/testcontent/samples/ims_xml/simple_manifest.xml
new file mode 100644
index 00000000..bcbb2f3a
--- /dev/null
+++ b/tests/testcontent/samples/ims_xml/simple_manifest.xml
@@ -0,0 +1,63 @@
+<manifest xmlns:imsmd="http://www.imsglobal.org/xsd/imsmd_v1p2">
+    <metadata>
+    <imsmd:lom>
+        <imsmd:general>
+        <imsmd:title>
+            <imsmd:langstring xml:lang="en">Test File</imsmd:langstring>
+        </imsmd:title>
+        <imsmd:language>en</imsmd:language>
+        <imsmd:description>
+            <imsmd:langstring xml:lang="en">Example of test file</imsmd:langstring>
+        </imsmd:description>
+        </imsmd:general>
+    </imsmd:lom>
+    </metadata>
+    <organizations>
+    <organization>
+        <title>Folder 1</title>
+        <item identifierref="file1Ref">
+            <title>Test File1</title>
+        </item>
+        <item identifierref="file2Ref">
+            <title>Test File2</title>
+        </item>
+        <item identifier="folder2">
+            <title>Folder 2</title>
+            <item identifierref="file1Ref">
+                <title>Test File1 Nested</title>
+            </item>
+            <item identifierref="file2Ref">
+                <title>Test File2 Nested</title>
+            </item>
+        </item>
+        <item identifier="folder3">
+            <title>Folder 3</title>
+            <item identifierref="file3Ref">
+                <title>Test File3 Nested</title>
+            </item>
+            <item identifierref="file4Ref">
+                <title>Test File4 Nested</title>
+            </item>
+        </item>
+    </organization>
+    <organization>
+        <title>Folder 4</title>
+        <item identifierref="file3Ref">
+            <title>Test File3</title>
+        </item>
+        <item identifierref="file4Ref">
+            <title>Test File4</title>
+        </item>
+    </organization>
+    </organizations>
+    <resources>
+    <resource identifier="file1Ref" type="webcontent" href="file1.html">
+    </resource>
+    <resource identifier="file2Ref" type="webcontent" href="file2.html">
+    </resource>
+    <resource identifier="file3Ref" type="webcontent" href="file3.html">
+    </resource>
+    <resource identifier="file4Ref" type="webcontent" href="file4.html">
+    </resource>
+    </resources>
+</manifest>

From e7d5aefe824a8bfcbc68b74b0363531fb3b66166 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sun, 18 Feb 2024 20:40:32 -0800
Subject: [PATCH 4/7] Update metadata reading to assume a flat metadata dict.
 Add general purpose utility for setting metadata on a node.

---
 ricecooker/utils/SCORM_metadata.py | 45 ++++++++++++++++++++----------
 tests/test_SCORM_metadata.py       | 42 +++++++++++-----------------
 2 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/ricecooker/utils/SCORM_metadata.py b/ricecooker/utils/SCORM_metadata.py
index bd1db071..d1dfe21a 100644
--- a/ricecooker/utils/SCORM_metadata.py
+++ b/ricecooker/utils/SCORM_metadata.py
@@ -39,7 +39,7 @@
 }
 
 
-def map_scorm_to_le_utils_activities(scorm_dict):
+def map_scorm_to_le_utils_activities(metadata_dict):
     le_utils_activities = []
 
     # Adjustments based on interactivity
@@ -51,8 +51,8 @@ def map_scorm_to_le_utils_activities(scorm_dict):
     }
 
     # Determine the interactivity level and type
-    interactivity_type = scorm_dict.get("educational", {}).get("interactivityType")
-    interactivity_level = scorm_dict.get("educational", {}).get("interactivityLevel")
+    interactivity_type = metadata_dict.get("interactivityType")
+    interactivity_level = metadata_dict.get("interactivityLevel")
 
     is_interactive = (
         interactivity_type
@@ -64,9 +64,7 @@ def map_scorm_to_le_utils_activities(scorm_dict):
     )
 
     # Extract the learning resource types from the SCORM data
-    learning_resource_types = scorm_dict.get("educational", {}).get(
-        "learningResourceType", []
-    )
+    learning_resource_types = metadata_dict.get("learningResourceType", [])
 
     # Map each SCORM type to an LE Utils activity type
     for learning_resource_type in learning_resource_types:
@@ -115,14 +113,12 @@ def map_scorm_to_le_utils_activities(scorm_dict):
 }
 
 
-def map_scorm_to_educator_resource_types(scorm_dict):
+def map_scorm_to_educator_resource_types(metadata_dict):
     educator_resource_types = []
 
     # Extract the learning resource types and intended end user role from the SCORM data
-    learning_resource_types = scorm_dict.get("educational", {}).get(
-        "learningResourceType", []
-    )
-    intended_roles = scorm_dict.get("educational", {}).get("intendedEndUserRole", [])
+    learning_resource_types = metadata_dict.get("learningResourceType", [])
+    intended_roles = metadata_dict.get("intendedEndUserRole", [])
 
     # Map each SCORM type to an educator-focused resource type
     for learning_resource_type in learning_resource_types:
@@ -144,15 +140,34 @@ def map_scorm_to_educator_resource_types(scorm_dict):
     return educator_resource_types
 
 
-def infer_beginner_level_from_difficulty(scorm_dict):
+def infer_beginner_level_from_difficulty(metadata_dict):
     # Beginner difficulty levels
     beginner_difficulties = {"very easy", "easy"}
 
-    educational_metadata = scorm_dict.get("educational", {})
-
     # Check if the difficulty level indicates beginner content
-    difficulty = educational_metadata.get("difficulty")
+    difficulty = metadata_dict.get("difficulty")
     if difficulty in beginner_difficulties:
         return [needs.FOR_BEGINNERS]
 
     return []
+
+
+def update_node_from_metadata(node, metadata_dict):
+    # Update the node with the general metadata
+    node.description = metadata_dict.get("description") or node.description
+    if metadata_dict.get("language"):
+        node.set_language(metadata_dict.get("language"))
+    node.tags = node.tags + metadata_dict.get("keyword", [])
+
+    # Update the node with the educational metadata
+    node.learning_activities = (
+        node.learning_activities + map_scorm_to_le_utils_activities(metadata_dict)
+    )
+    node.resource_types = node.resource_types + map_scorm_to_educator_resource_types(
+        metadata_dict
+    )
+    node.learner_needs = node.learner_needs + infer_beginner_level_from_difficulty(
+        metadata_dict
+    )
+
+    return node
diff --git a/tests/test_SCORM_metadata.py b/tests/test_SCORM_metadata.py
index 297e6416..3e0d2a31 100644
--- a/tests/test_SCORM_metadata.py
+++ b/tests/test_SCORM_metadata.py
@@ -13,35 +13,29 @@
     [
         (
             {
-                "educational": {
-                    "interactivityType": "active",
-                    "interactivityLevel": "high",
-                    "learningResourceType": ["exercise", "simulation"],
-                }
+                "interactivityType": "active",
+                "interactivityLevel": "high",
+                "learningResourceType": ["exercise", "simulation"],
             },
             [learning_activities.PRACTICE, learning_activities.EXPLORE],
         ),
         (
-            {"educational": {"learningResourceType": ["lecture", "self assessment"]}},
+            {"learningResourceType": ["lecture", "self assessment"]},
             [learning_activities.REFLECT, learning_activities.WATCH],
         ),
         (
             {
-                "educational": {
-                    "interactivityType": "mixed",
-                    "interactivityLevel": "medium",
-                    "learningResourceType": ["simulation", "graph"],
-                }
+                "interactivityType": "mixed",
+                "interactivityLevel": "medium",
+                "learningResourceType": ["simulation", "graph"],
             },
             [learning_activities.EXPLORE],
         ),
         (
             {
-                "educational": {
-                    "interactivityType": "expositive",
-                    "interactivityLevel": "low",
-                    "learningResourceType": ["simulation", "graph"],
-                }
+                "interactivityType": "expositive",
+                "interactivityLevel": "low",
+                "learningResourceType": ["simulation", "graph"],
             },
             [learning_activities.READ, learning_activities.WATCH],
         ),
@@ -56,19 +50,15 @@ def test_map_scorm_to_le_utils_activities(scorm_dict, expected_result):
     [
         (
             {
-                "educational": {
-                    "learningResourceType": ["exercise", "lecture"],
-                    "intendedEndUserRole": ["teacher"],
-                }
+                "learningResourceType": ["exercise", "lecture"],
+                "intendedEndUserRole": ["teacher"],
             },
             [resource_type.EXERCISE, resource_type.LESSON, resource_type.LESSON_PLAN],
         ),
         (
             {
-                "educational": {
-                    "learningResourceType": ["simulation", "figure"],
-                    "intendedEndUserRole": ["author"],
-                }
+                "learningResourceType": ["simulation", "figure"],
+                "intendedEndUserRole": ["author"],
             },
             [resource_type.ACTIVITY, resource_type.MEDIA, resource_type.GUIDE],
         ),
@@ -79,10 +69,10 @@ def test_map_scorm_to_educator_resource_types(scorm_dict, expected_result):
 
 
 def test_infer_beginner_level_from_difficulty():
-    scorm_dict_easy = {"educational": {"difficulty": "easy"}}
+    scorm_dict_easy = {"difficulty": "easy"}
     assert infer_beginner_level_from_difficulty(scorm_dict_easy) == [
         needs.FOR_BEGINNERS
     ]
 
-    scorm_dict_hard = {"educational": {"difficulty": "difficult"}}
+    scorm_dict_hard = {"difficulty": "difficult"}
     assert infer_beginner_level_from_difficulty(scorm_dict_hard) == []

From 42c78615ec652dcf44208ab82f232e109cda17d3 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sun, 18 Feb 2024 21:27:32 -0800
Subject: [PATCH 5/7] Do recursive tree generation for IMSCP nodes. Write basic
 tests to confirm behaviour.

---
 ricecooker/classes/nodes.py | 115 ++++++++++------
 tests/test_files.py         | 262 ++++++++++++++++++------------------
 tests/test_tree.py          |  43 ++++++
 3 files changed, 252 insertions(+), 168 deletions(-)

diff --git a/ricecooker/classes/nodes.py b/ricecooker/classes/nodes.py
index acf4f721..bacb93f1 100644
--- a/ricecooker/classes/nodes.py
+++ b/ricecooker/classes/nodes.py
@@ -21,6 +21,7 @@
 from ..exceptions import InvalidNodeException
 from ..utils.utils import is_valid_uuid_string
 from .licenses import License
+from ricecooker.utils.SCORM_metadata import update_node_from_metadata
 
 MASTERY_MODELS = [id for id, name in exercises.MASTERY_MODELS]
 ROLES = [id for id, name in roles.choices]
@@ -31,6 +32,7 @@ class Node(object):
 
     license = None
     language = None
+    kind = None
 
     def __init__(
         self,
@@ -40,7 +42,7 @@ def __init__(
         thumbnail=None,
         files=None,
         derive_thumbnail=False,
-        node_modifications={},
+        node_modifications=None,
         extra_fields=None,
     ):
         self.files = []
@@ -60,7 +62,7 @@ def __init__(
 
         self.set_thumbnail(thumbnail)
         # save modifications passed in by csv
-        self.node_modifications = node_modifications
+        self.node_modifications = node_modifications or {}
 
     def set_language(self, language):
         """Set self.language to internal lang. repr. code from str or Language object."""
@@ -1649,54 +1651,87 @@ def to_dict(self):
 RemoteContentNode = StudioContentNode
 
 
-def get_by_path(obj, *args):
-    for arg in args:
-        obj = obj.get(arg, {})
-    if obj == {}:
-        return None
-    return obj
-
-
-QTI_PLACEHOLDER_SOURCE_ID = "A SOURCE ID THAT WILL BE REMOVED BEFORE UPLOAD"
-
+class IMSCPNode(TreeNode):
 
-class QTINode(ContentNode):
-    """
-    Node representing QTI exercise
-    """
+    kind = content_kinds.HTML5
 
-    kind = content_kinds.EXERCISE
+    @classmethod
+    def _recurse_and_add_children(cls, parent, item_data, license):
+        source_id = item_data.get("identifier", item_data["title"])
+        if item_data.get("children") is not None:
+            node = TopicNode(
+                source_id,
+                item_data["title"],
+                files=parent.files,
+            )
+            for child in item_data["children"]:
+                cls._recurse_and_add_children(node, child, license)
+        else:
+            node = ContentNode(
+                source_id,
+                item_data["title"],
+                license,
+                files=parent.files,
+                extra_fields={
+                    "options": {
+                        "entry": item_data["href"] + item_data.get("parameters", "")
+                    }
+                },
+            )
+            node.kind = cls.kind
+        update_node_from_metadata(node, item_data["metadata"])
+        parent.add_child(node)
 
-    def __init__(
-        self, source_id=QTI_PLACEHOLDER_SOURCE_ID, title=None, license=None, **kwargs
-    ):
-        super(QTINode, self).__init__(source_id, title, license, **kwargs)
-        from .files import QTIZipFile
+    def __new__(cls, *args, **kwargs):
+        from .files import IMSCPZipFile
 
-        qti_files = [f for f in self.files if isinstance(f, QTIZipFile)]
-        qti_file = qti_files[0]
-        qti_file.process_file()
-        metadata = qti_file.extract_metadata()
-        self.source_id = (
-            metadata.get("identifer")
-            if self.source_id == QTI_PLACEHOLDER_SOURCE_ID
-            else self.source_id
-        )
-        if self.source_id is None:
+        imscp_files = [f for f in kwargs["files"] if isinstance(f, IMSCPZipFile)]
+        if not imscp_files or len(imscp_files) > 1:
+            raise InvalidNodeException(
+                "IMSCPNode must be instantiated with exactly one IMSCPZipFile"
+            )
+        imscp_file = imscp_files[0]
+        metadata = imscp_file.extract_metadata()
+        if "title" in metadata["metadata"] and kwargs.get("title") is None:
+            kwargs["title"] = metadata["metadata"]["title"]
+        if kwargs.get("title") is None:
             raise InvalidNodeException(
-                "No source_id was provided and the QTI file {} does not have an identifier".format(
-                    qti_file.path
+                "No title was provided and the IMSCP file {} does not have a title".format(
+                    imscp_file.path
                 )
             )
-        self.title = self.title or get_by_path(
-            metadata, "metadata", "general", "title", "string"
-        )
-        if self.title is None:
+
+        if "identifier" in metadata and kwargs.get("source_id") is None:
+            kwargs["source_id"] = metadata["identifier"]
+        if kwargs.get("source_id") is None:
             raise InvalidNodeException(
-                "No title was provided and the QTI file {} does not have a title".format(
-                    qti_file.path
+                "No source_id was provided and the IMSCP file {} does not have an identifier".format(
+                    imscp_file.path
                 )
             )
+        license = kwargs.pop("license")
+        if license is None:
+            raise InvalidNodeException(
+                "No license was provided and we cannot infer license from an IMSCP file"
+            )
+        if metadata.get("organizations"):
+            node = TopicNode(*args, **kwargs)
+
+            for child in metadata["organizations"]:
+                cls._recurse_and_add_children(node, child, license)
+        else:
+            node = ContentNode(*args, **kwargs)
+            node.kind = cls.kind
+        update_node_from_metadata(node, metadata["metadata"])
+        return node
+
+
+class QTINode(IMSCPNode):
+    """
+    Node representing QTI exercise
+    """
+
+    kind = content_kinds.EXERCISE
 
     def validate(self):
         """validate: Makes sure QTI is valid
diff --git a/tests/test_files.py b/tests/test_files.py
index fadbd4f8..c8d7ede2 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -660,17 +660,34 @@ def create_zip_with_manifest(manifest_filename, *additional_files):
         pass
 
 
-def test_extract_metadata_from_simple_manifest():
-    with create_zip_with_manifest("simple_manifest.xml") as zip_path:
-        imscp_file = IMSCPZipFile(zip_path)
-        assert imscp_file.extract_metadata() == {
-            "identifier": None,
-            "metadata": {
-                "description": "Example of test file",
-                "language": "en",
-                "title": "Test File",
-            },
-            "organizations": [
+expected_simple_metadata = {
+    "identifier": None,
+    "metadata": {
+        "description": "Example of test file",
+        "language": "en",
+        "title": "Test File",
+    },
+    "organizations": [
+        {
+            "children": [
+                {
+                    "files": [],
+                    "href": "file1.html",
+                    "identifier": "file1Ref",
+                    "identifierref": "file1Ref",
+                    "title": "Test File1",
+                    "type": "webcontent",
+                    "metadata": {},
+                },
+                {
+                    "files": [],
+                    "href": "file2.html",
+                    "identifier": "file2Ref",
+                    "identifierref": "file2Ref",
+                    "title": "Test File2",
+                    "type": "webcontent",
+                    "metadata": {},
+                },
                 {
                     "children": [
                         {
@@ -678,72 +695,23 @@ def test_extract_metadata_from_simple_manifest():
                             "href": "file1.html",
                             "identifier": "file1Ref",
                             "identifierref": "file1Ref",
-                            "title": "Test File1",
-                            "type": "webcontent",
                             "metadata": {},
+                            "title": "Test File1 Nested",
+                            "type": "webcontent",
                         },
                         {
                             "files": [],
                             "href": "file2.html",
                             "identifier": "file2Ref",
                             "identifierref": "file2Ref",
-                            "title": "Test File2",
-                            "type": "webcontent",
-                            "metadata": {},
-                        },
-                        {
-                            "children": [
-                                {
-                                    "files": [],
-                                    "href": "file1.html",
-                                    "identifier": "file1Ref",
-                                    "identifierref": "file1Ref",
-                                    "metadata": {},
-                                    "title": "Test File1 Nested",
-                                    "type": "webcontent",
-                                },
-                                {
-                                    "files": [],
-                                    "href": "file2.html",
-                                    "identifier": "file2Ref",
-                                    "identifierref": "file2Ref",
-                                    "metadata": {},
-                                    "title": "Test File2 Nested",
-                                    "type": "webcontent",
-                                },
-                            ],
-                            "identifier": "folder2",
                             "metadata": {},
-                            "title": "Folder 2",
-                        },
-                        {
-                            "children": [
-                                {
-                                    "files": [],
-                                    "href": "file3.html",
-                                    "identifier": "file3Ref",
-                                    "identifierref": "file3Ref",
-                                    "metadata": {},
-                                    "title": "Test File3 Nested",
-                                    "type": "webcontent",
-                                },
-                                {
-                                    "files": [],
-                                    "href": "file4.html",
-                                    "identifier": "file4Ref",
-                                    "identifierref": "file4Ref",
-                                    "metadata": {},
-                                    "title": "Test File4 Nested",
-                                    "type": "webcontent",
-                                },
-                            ],
-                            "identifier": "folder3",
-                            "metadata": {},
-                            "title": "Folder 3",
+                            "title": "Test File2 Nested",
+                            "type": "webcontent",
                         },
                     ],
-                    "title": "Folder 1",
+                    "identifier": "folder2",
                     "metadata": {},
+                    "title": "Folder 2",
                 },
                 {
                     "children": [
@@ -752,88 +720,126 @@ def test_extract_metadata_from_simple_manifest():
                             "href": "file3.html",
                             "identifier": "file3Ref",
                             "identifierref": "file3Ref",
-                            "title": "Test File3",
-                            "type": "webcontent",
                             "metadata": {},
+                            "title": "Test File3 Nested",
+                            "type": "webcontent",
                         },
                         {
                             "files": [],
                             "href": "file4.html",
                             "identifier": "file4Ref",
                             "identifierref": "file4Ref",
-                            "title": "Test File4",
-                            "type": "webcontent",
                             "metadata": {},
+                            "title": "Test File4 Nested",
+                            "type": "webcontent",
                         },
                     ],
-                    "title": "Folder 4",
+                    "identifier": "folder3",
                     "metadata": {},
+                    "title": "Folder 3",
                 },
             ],
-        }
+            "title": "Folder 1",
+            "metadata": {},
+        },
+        {
+            "children": [
+                {
+                    "files": [],
+                    "href": "file3.html",
+                    "identifier": "file3Ref",
+                    "identifierref": "file3Ref",
+                    "title": "Test File3",
+                    "type": "webcontent",
+                    "metadata": {},
+                },
+                {
+                    "files": [],
+                    "href": "file4.html",
+                    "identifier": "file4Ref",
+                    "identifierref": "file4Ref",
+                    "title": "Test File4",
+                    "type": "webcontent",
+                    "metadata": {},
+                },
+            ],
+            "title": "Folder 4",
+            "metadata": {},
+        },
+    ],
+}
 
 
-def test_extract_metadata_from_complex_manifest():
-    # Additional metadata files are passed as arguments to the context manager
-    with create_zip_with_manifest(
-        "complete_manifest_with_external_metadata.xml",
-        "metadata_hummingbirds_course.xml",
-        "metadata_hummingbirds_organization.xml",
-    ) as zip_path:
+def test_extract_metadata_from_simple_manifest():
+    with create_zip_with_manifest("simple_manifest.xml") as zip_path:
         imscp_file = IMSCPZipFile(zip_path)
-        assert imscp_file.extract_metadata() == {
-            "identifier": "com.example.hummingbirds.contentpackaging.metadata.2024",
+        assert imscp_file.extract_metadata() == expected_simple_metadata
+
+
+expected_complex_metadata = {
+    "identifier": "com.example.hummingbirds.contentpackaging.metadata.2024",
+    "metadata": {
+        "description": "An engaging overview of hummingbirds, covering their biology, behavior, and habitats. This course is designed for enthusiasts and bird watchers of all levels.",  # noqa E501
+        "language": "en",
+        "title": "Discovering Hummingbirds",
+        "keyword": ["hummingbirds", "bird watching", "ornithology"],
+        "intendedEndUserRole": "learner",
+        "interactivityLevel": "medium",
+        "learningResourceType": ["documentary", "interactive lesson"],
+    },
+    "organizations": [
+        {
+            "title": "Discovering Hummingbirds",
+            "identifier": "hummingbirds_default_org",
             "metadata": {
-                "description": "An engaging overview of hummingbirds, covering their biology, behavior, and habitats. This course is designed for enthusiasts and bird watchers of all levels.",  # noqa E501
-                "language": "en",
-                "title": "Discovering Hummingbirds",
-                "keyword": ["hummingbirds", "bird watching", "ornithology"],
-                "intendedEndUserRole": "learner",
-                "interactivityLevel": "medium",
-                "learningResourceType": ["documentary", "interactive lesson"],
+                "description": "This metadata provides information about the structure and organization of the Discovering Hummingbirds course. The course is organized in a hierarchical manner, guiding learners from basic concepts to more detailed aspects of hummingbird biology and behavior.",  # noqa E501
             },
-            "organizations": [
+            "children": [
                 {
-                    "title": "Discovering Hummingbirds",
-                    "identifier": "hummingbirds_default_org",
+                    "title": "Introduction to Hummingbirds",
                     "metadata": {
-                        "description": "This metadata provides information about the structure and organization of the Discovering Hummingbirds course. The course is organized in a hierarchical manner, guiding learners from basic concepts to more detailed aspects of hummingbird biology and behavior.",  # noqa E501
+                        "description": "Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.",
                     },
-                    "children": [
-                        {
-                            "title": "Introduction to Hummingbirds",
-                            "metadata": {
-                                "description": "Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.",
-                            },
-                            "identifier": "item1",
-                            "identifierref": "resource1",
-                            "href": "intro.html",
-                            "type": "webcontent",
-                            "files": ["intro.html", "shared/style.css"],
-                        },
-                        {
-                            "title": "Hummingbird Habitats",
-                            "metadata": {
-                                "description": "Learn about various habitats of hummingbirds, from tropical jungles to backyard gardens.",
-                            },
-                            "identifier": "item2",
-                            "identifierref": "resource2",
-                            "href": "habitats.html",
-                            "type": "webcontent",
-                            "files": ["habitats.html", "shared/image1.jpg"],
-                        },
-                        {
-                            "title": "Hummingbird Species",
-                            "metadata": {
-                                "description": "Discover the diversity of hummingbird species and their unique adaptations.",
-                            },
-                            "identifier": "item3",
-                            "identifierref": "resource3",
-                            "href": "species.html",
-                            "type": "webcontent",
-                            "files": ["species.html", "shared/image2.jpg"],
-                        },
-                    ],
+                    "identifier": "item1",
+                    "identifierref": "resource1",
+                    "href": "intro.html",
+                    "type": "webcontent",
+                    "files": ["intro.html", "shared/style.css"],
+                },
+                {
+                    "title": "Hummingbird Habitats",
+                    "metadata": {
+                        "description": "Learn about various habitats of hummingbirds, from tropical jungles to backyard gardens.",
+                    },
+                    "identifier": "item2",
+                    "identifierref": "resource2",
+                    "href": "habitats.html",
+                    "type": "webcontent",
+                    "files": ["habitats.html", "shared/image1.jpg"],
+                },
+                {
+                    "title": "Hummingbird Species",
+                    "metadata": {
+                        "description": "Discover the diversity of hummingbird species and their unique adaptations.",
+                    },
+                    "identifier": "item3",
+                    "identifierref": "resource3",
+                    "href": "species.html",
+                    "type": "webcontent",
+                    "files": ["species.html", "shared/image2.jpg"],
                 },
             ],
-        }
+        },
+    ],
+}
+
+
+def test_extract_metadata_from_complex_manifest():
+    # Additional metadata files are passed as arguments to the context manager
+    with create_zip_with_manifest(
+        "complete_manifest_with_external_metadata.xml",
+        "metadata_hummingbirds_course.xml",
+        "metadata_hummingbirds_organization.xml",
+    ) as zip_path:
+        imscp_file = IMSCPZipFile(zip_path)
+        assert imscp_file.extract_metadata() == expected_complex_metadata
diff --git a/tests/test_tree.py b/tests/test_tree.py
index d844d0f9..5ba9f7d9 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -11,16 +11,22 @@
 from le_utils.constants import licenses
 from le_utils.constants.labels import levels
 from le_utils.constants.languages import getlang
+from test_files import create_zip_with_manifest
+from test_files import expected_complex_metadata
+from test_files import expected_simple_metadata
 
 from ricecooker.classes.files import DocumentFile
 from ricecooker.classes.files import HTMLZipFile
+from ricecooker.classes.files import IMSCPZipFile
 from ricecooker.classes.files import SlideImageFile
 from ricecooker.classes.files import ThumbnailFile
 from ricecooker.classes.licenses import get_license
 from ricecooker.classes.licenses import License
+from ricecooker.classes.nodes import ContentNode
 from ricecooker.classes.nodes import CustomNavigationChannelNode
 from ricecooker.classes.nodes import CustomNavigationNode
 from ricecooker.classes.nodes import DocumentNode
+from ricecooker.classes.nodes import IMSCPNode
 from ricecooker.classes.nodes import RemoteContentNode
 from ricecooker.classes.nodes import SlideshowNode
 from ricecooker.classes.nodes import TopicNode
@@ -688,3 +694,40 @@ def test_remote_content_node_with_invalid_overridden_field():
             author="Such disallowed. Computer says no.",
         )
         node.validate_tree()
+
+
+def assert_node_matches_metadata(node, metadata, files):
+
+    extra_metadata = metadata["metadata"]
+
+    assert node.title == extra_metadata["title"]
+    if metadata.get("identifier"):
+        assert node.source_id == metadata.get("identifier")
+
+    # Check for ContentNode or TopicNode
+    if isinstance(node, ContentNode):
+        assert node.files == files
+        assert node.extra_fields.get("options", {}).get("entry") == metadata.get("href")
+    elif isinstance(node, TopicNode):
+        for child_node, child_metadata in zip(
+            node.children, metadata.get("children", [])
+        ):
+            assert_node_matches_metadata(child_node, child_metadata, files)
+
+
+def test_imscp_node_simple_manifest():
+    with create_zip_with_manifest("simple_manifest.xml") as zip_path:
+        imscp_file = IMSCPZipFile(zip_path)
+        node = IMSCPNode(source_id="test", license="CC BY", files=[imscp_file])
+        assert_node_matches_metadata(node, expected_simple_metadata, [imscp_file])
+
+
+def test_imscp_node_complex_manifest():
+    with create_zip_with_manifest(
+        "complete_manifest_with_external_metadata.xml",
+        "metadata_hummingbirds_course.xml",
+        "metadata_hummingbirds_organization.xml",
+    ) as zip_path:
+        imscp_file = IMSCPZipFile(zip_path)
+        node = IMSCPNode(license="CC BY", files=[imscp_file])
+        assert_node_matches_metadata(node, expected_complex_metadata, [imscp_file])

From 3583a03eade2eb66cc3c655324adf2816ec9c590 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Tue, 20 Feb 2024 21:12:32 -0800
Subject: [PATCH 6/7] Tweaks to ensure proper parsing of language codes and
 keywords.

---
 ricecooker/utils/SCORM_metadata.py                 | 14 +++++++++++---
 .../complete_manifest_with_external_metadata.xml   |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/ricecooker/utils/SCORM_metadata.py b/ricecooker/utils/SCORM_metadata.py
index d1dfe21a..12abb47e 100644
--- a/ricecooker/utils/SCORM_metadata.py
+++ b/ricecooker/utils/SCORM_metadata.py
@@ -4,6 +4,7 @@
 from le_utils.constants.labels import learning_activities
 from le_utils.constants.labels import needs
 from le_utils.constants.labels import resource_type
+from le_utils.constants.languages import getlang
 
 
 imscp_metadata_keys = {
@@ -155,9 +156,16 @@ def infer_beginner_level_from_difficulty(metadata_dict):
 def update_node_from_metadata(node, metadata_dict):
     # Update the node with the general metadata
     node.description = metadata_dict.get("description") or node.description
-    if metadata_dict.get("language"):
-        node.set_language(metadata_dict.get("language"))
-    node.tags = node.tags + metadata_dict.get("keyword", [])
+    lang_code = metadata_dict.get("language", "")
+    lang_code = (
+        lang_code.split("-")[0].lower() if getlang(lang_code) is None else lang_code
+    )
+    if getlang(lang_code):
+        node.set_language(lang_code)
+    keyword = metadata_dict.get("keyword", [])
+    if keyword and isinstance(keyword, str):
+        keyword = [keyword]
+    node.tags = node.tags + keyword
 
     # Update the node with the educational metadata
     node.learning_activities = (
diff --git a/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml b/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
index 98afd161..804647ec 100644
--- a/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
+++ b/tests/testcontent/samples/ims_xml/complete_manifest_with_external_metadata.xml
@@ -45,6 +45,7 @@ example of proper metadata element usage.
                             <description>
                                 <string language="en-US">Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.</string>
                             </description>
+                            <language>en-US</language>
                         </general>
                     </lom>
                 </metadata>

From affca1637aa2abb1bcdde26246a40a1e1fdad0a8 Mon Sep 17 00:00:00 2001
From: Richard Tibbles <richard@learningequality.org>
Date: Sat, 9 Mar 2024 15:55:32 -0800
Subject: [PATCH 7/7] Update tests for windows.

---
 tests/test_files.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/tests/test_files.py b/tests/test_files.py
index c8d7ede2..3aab32c8 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -638,26 +638,29 @@ def test_convertible_substitles_weirdext_subtitlesformat():
 
 @contextmanager
 def create_zip_with_manifest(manifest_filename, *additional_files):
-    # Create a temporary file for the zipfile
-    with tempfile.NamedTemporaryFile(suffix=".zip", delete=True) as temp_zip:
+    # Create a temporary file for the zipfile but close it immediately for Windows compatibility
+    temp_zip = tempfile.NamedTemporaryFile(suffix=".zip", delete=False)
+    temp_zip_path = temp_zip.name
+    temp_zip.close()  # Close the file, so it's not locked
+    try:
         # Define the paths
         base_path = os.path.dirname(os.path.abspath(__file__))
         source_folder = os.path.join(base_path, "testcontent", "samples", "ims_xml")
         manifest_file = os.path.join(source_folder, manifest_filename)
 
         # Create the zipfile and add the manifest file
-        with zipfile.ZipFile(temp_zip.name, "w") as zf:
+        with zipfile.ZipFile(temp_zip_path, "w") as zf:
             zf.write(manifest_file, "imsmanifest.xml")
             for additional_file in additional_files:
                 zf.write(os.path.join(source_folder, additional_file), additional_file)
 
         yield temp_zip.name
-
-    # Clean up the zipfile
-    try:
-        os.remove(temp_zip.name)
-    except FileNotFoundError:
-        pass
+    finally:
+        # Clean up the zipfile
+        try:
+            os.remove(temp_zip.name)
+        except (FileNotFoundError, OSError):
+            pass
 
 
 expected_simple_metadata = {
@@ -799,6 +802,7 @@ def test_extract_metadata_from_simple_manifest():
                     "title": "Introduction to Hummingbirds",
                     "metadata": {
                         "description": "Explore the fascinating world of hummingbirds, their unique characteristics and behaviors.",
+                        "language": "en-US",
                     },
                     "identifier": "item1",
                     "identifierref": "resource1",