nesi · CallumWalley · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025
diff --git a/.github/workflows/link_apps_pages.py b/.github/workflows/link_apps_pages.py
@@ -14,12 +14,14 @@
 
 
 MODULE_LIST_PATH = os.getenv("MODULE_LIST_PATH", "docs/assets/module-list.json")
+APPROVED_TAGS =  "checks/.approved_tags.yml"
 
 # Relative to doc directory.
 DOC_ROOT = os.getenv("DOC_ROOT", "docs")
 APPS_PAGES_PATH = os.getenv("APPS_PAGES_PATH", "Software/Available_Applications")
 BASE_URL = os.getenv("BASE_URL", "https://www.docs.nesi.org.nz")
 
+approved_tags = yaml.safe_load(open(APPROVED_TAGS, 'r'))
 module_list = json.load(open(MODULE_LIST_PATH))
 
 for input_file in os.listdir(os.path.join(DOC_ROOT, APPS_PAGES_PATH)):
@@ -52,6 +54,12 @@
                 for tag in meta["tags"]:
                     if tag not in module_list[app]["domains"]:
                         module_list[app]["domains"] += [tag]
+            # Drop unsupported tags (loop over a copy: the list is mutated inside the loop).
+            for tag in list(module_list[app]["domains"]):
+                if tag not in approved_tags:
+                    print(f"::warning file={MODULE_LIST_PATH},\
+title=tags.bad:: Tag '{tag}' not in approved list. Dropped")
+                    module_list[app]["domains"].remove(tag)
         else:
             print(
                 f"::warning file={os.path.join(DOC_ROOT, APPS_PAGES_PATH, input_file)},\

diff --git a/checks/.approved_tags.yml b/checks/.approved_tags.yml
@@ -1,84 +1,44 @@
 ---
 [
-    ".core",
-    "2fa",
-    "Computational Chemistry",
-    "Density Functional Theory",
-    "Molecular Dynamics",
+    "Chemistry", 
+    "DensityFunctionalTheory",
+    "MolecularDynamics",
     "Slurm",
-    "XC50",
     "access",
-    "account",
-    "allocation",
-    "announcement",
-    "assessment",
-    "authentication",
-    "bash",
+    "account",  # Any page relating to user account
+    "allocation", 
     "biology",
     "cae",
     "cfd",
     "chemistry",
     "compiler",
-    "compression",
     "containers",
-    "corefile",
-    "CS400",
-    "CS500",
-    "data compression",
-    "disk quota",
+    "diskQuota",
     "earthquake",
     "engineering",
     "fea",
     "geo",
-    "git",
-    "github",
-    "globus",
     "gpu",
-    "home",
     "hydrodynamics",
     "ide",
-    "introduction",
-    "jupyter lab",
+    "jupyter",
     "login",
-    "mahuika",
-    "maui",
-    "mfa",
-    "milan",
-    "machine learning",
+    "machineLearning",
     "morphodynamics",
     "mpi",
     "multiphysics",
     "mynesi",
-    "nearline",
     "nobackup",
-    "omp",
-    "onboarding",
+    "multiprocessing",
     "ondemand",
-    "particle modelling",
+    "particleModelling",
     "profiling",
     "project",
-    "quotas",
-    "refresh",
     "releasenote",
-    "Project Membership",
-    "scaling",
-    "scp",
-    "software",
     "ssh",
     "storage",
-    "tape",
-    "terminal",
-    "toolchain",
-    "Data Transfer",
-    "tuakiri",
-    "upload",
-    "version control",
+    "DataTransfer",
     "visualisation",
-    "water quality testing",
-    "wave modelling",
-    "webinar",
+    "waveModelling",
     "windows",
-    "workshop",
-    "wsl",
-    "x11",
 ]
diff --git a/checks/README.md b/checks/README.md
@@ -2,6 +2,8 @@
 
 This directory contains QA tests for the documentation.
 
+All of these checks are intended to run on the Markdown source files, not on the rendered pages.
+
 Tests should be made as Python scripts to allow flexibility of use. Currently these checks are run two ways:
 
 - [GitHub Actions](https://docs.github.com/en/actions) as defined in [workflows](../.github/workflows/),
@@ -44,16 +46,8 @@ Individual rules can be disabled/enabled in [.markdownlint.json](../.markdownlin
 *This linter is defined in [run_meta_check.py](run_meta_check.py) script.*
 
 Catch-all for custom checks.
-Currently defined checks are:
 
-- title_redundant
-- title_length
-- meta_missing_description
-- meta_unexpected_key
-- minimum_tags
-- walk_toc
-- click_here
-- dynamic_slurm_link
+See the script for the list of checks it currently defines.
 
 ### Test Build
 
@@ -66,3 +60,12 @@ Each type of test has a debug job in VSCode.
 Most will run on the [fail_checks](fail_checks.md) page
 
 ![alt text](../docs/assets/images/debug_menu.png)
+
+## Tags
+
+Tags are used to help search indexing, but can also be used to search by topic.
+
+We don't want a large number of similar or duplicate tags, as this is visually messy and makes browsing by a single tag less useful.
+The list of 'approved' tags is kept in [.approved_tags.yml](./.approved_tags.yml); feel free to add to it.
+
+Tags are checked in [run_meta_check.py](run_meta_check.py).
diff --git a/checks/fail_checks.md b/checks/fail_checks.md
@@ -3,7 +3,10 @@ created_at: 2025-01-28
 template: not_a_template
 not_a_parameter: This isn't valid
 tags:
-  - Not enough
+  - chemistryies
+  - move data
+  - gibberjabber
+  - mostly nonsense
 ---
 
 

diff --git a/checks/run_meta_check.py b/checks/run_meta_check.py
@@ -13,6 +13,7 @@
 import time
 from titlecase import titlecase
 from pathlib import Path
+from difflib import SequenceMatcher
 
 # Ignore files if they match this regex
 EXCLUDED_FROM_CHECKS = [
@@ -28,10 +29,10 @@
 
 MAX_TITLE_LENGTH = 28  # As font isn't monospace, this is only approx
 MAX_HEADER_LENGTH = 32  # minus 2 per extra header level
-MIN_TAGS = 1
+RANGE_TAGS = [1, 5]
 RANGE_SIBLING = [4, 8]
 DOC_ROOT = "docs"
-
+APPROVED_TAGS = "checks/.approved_tags.yml"
 # Warning level for missing parameters.
 EXPECTED_PARAMETERS = {
     "title": "",
@@ -43,7 +44,7 @@
     "postreq": "",
     "suggested": "",  # Add info here when implimented.
     "created_at": "",
-    "tags": "",  # Add info here when implimented.
+    "tags": yaml.safe_load(open(APPROVED_TAGS, 'r')),
     "search": "",
     "hide": ["toc", "nav", "tags"],
 }
@@ -263,6 +264,25 @@ def _nav_check():
         )
 
 
+def _jaccard_index(s1, s2):
+    """Return the Jaccard index of the lowercased character sets of s1 and s2."""
+    set1, set2 = set(s1.lower()), set(s2.lower())  # sets of characters, not words
+    union = len(set1 | set2)
+    # Two empty strings have an empty union; report 0.0 rather than dividing by zero.
+    return len(set1 & set2) / union if union else 0.0
+
+def _sim_index(str1, str2):
+    return SequenceMatcher(a=str1, b=str2).ratio()  # difflib similarity ratio, a float in [0, 1]
+
+def _most_similar(s1, op):
+    """Return (option, score) for the option in op most similar to s1, or ("", 0)."""
+    closest, top_score = "", 0
+    for candidate in op:
+        score = _jaccard_index(s1, candidate)
+        # Strict comparison: on a tie the earliest candidate is kept.
+        if score > top_score:
+            closest, top_score = candidate, score
+    return closest, top_score
 def title_redundant():
     lineno = _get_lineno(r"^title:.*$")
     if "title" in meta.keys() and title_from_filename == meta["title"]:
@@ -300,9 +320,10 @@ def _test(v):
 
     for key, value in meta.items():
         if key not in EXPECTED_PARAMETERS.keys():
+            similar, ji = _most_similar(key, EXPECTED_PARAMETERS.keys())  # closest known key, used for the hint
             yield {
                 "line": _get_lineno(r"^" + key + r":.*$"),
-                "message": f"Unexpected parameter in front-matter '{key}'",
+                "message": f"Unexpected parameter in front-matter '{key}'" + (f", did you mean '{similar}'?" if ji > 0.4 else "") + " .",
             }
         elif EXPECTED_PARAMETERS[key]:
             if isinstance(value, list):
@@ -334,16 +355,31 @@ def title_capitalisation():
 '{correct_title}' is preferred",
         }
 
-def minimum_tags():
+def number_tags():  # Yields a warning when the page's tag count falls outside RANGE_TAGS.
     if "tags" not in meta or not isinstance(meta["tags"], list):
         yield {"message": "'tags' property in meta is missing or malformed."}
-    elif len(meta["tags"]) < MIN_TAGS:
+    elif len(meta["tags"]) < RANGE_TAGS[0]:
         yield {
             "line": _get_lineno(r"^tags:.*$"),
-            "message": "Try to include at least 2 'tags'\
+            "message": f"Try to include at least {RANGE_TAGS[0]} 'tags' \
 (helps with search optimisation).",
         }
+    elif len(meta["tags"]) > RANGE_TAGS[1]:
+        yield {
+            "line": _get_lineno(r"^tags:.*$"),
+            "message": f"{len(meta['tags'])} is a lot of 'tags', are you sure they are all useful?",
+        }
 
+def approved_tags():
+    """Yield a warning for each tag in meta that is not in the approved list."""
+    import re  # local import: only needed to escape the tag before it is used as a pattern
+    if "tags" not in meta or not isinstance(meta["tags"], list):
+        return
+    for tag in (t for t in meta["tags"] if t not in EXPECTED_PARAMETERS["tags"]):
+        similar, ji = _most_similar(str(tag), EXPECTED_PARAMETERS["tags"])  # str(): YAML tags may be non-strings
+        hint = f"did you mean '{similar}'?" if ji > 0.4 else f"See '{APPROVED_TAGS}'."
+        yield {"line": _get_lineno(rf".*{re.escape(str(tag))}.*"),
+               "message": f"Tag '{tag}' is not an approved tag, {hint}"}
 
 def click_here():
     """
@@ -419,7 +455,8 @@ def dynamic_slurm_link():
     title_capitalisation,
     meta_missing_description,
     meta_unexpected_key,
-    minimum_tags,
+    number_tags,
+    approved_tags,
     walk_toc,
 ]
 

diff --git a/docs/Software/Available_Applications/index.md b/docs/Software/Available_Applications/index.md
@@ -14,12 +14,12 @@ hide:
 
 | Command                               | Description                                          | Example                                |
 | ------------------------------------- | ---------------------------------------------------- | -------------------------------------- |
-| `module load <module name>`           | Loads <module name>, (default version)               | `module load Python`                   |
-| `module load <module name>/<version>` | Loads <module name>, <version>                       | `module load Python/3.11.6-foss-2023a` |
+| `module load <module name>`           | Loads `<module name>` (default version)              | `module load Python`                   |
+| `module load <module name>/<version>` | Loads `<module name>`, `<version>`                   | `module load Python/3.11.6-foss-2023a` |
 | `module purge`                        | Removes all loaded modules                           |                                        |
 | `module list`                         | Lists currently loaded modules.                      |                                        |
 | `module spider`                       | Lists all _available_ modules.                       |                                        |
-| `module spider <module name>`         | Searches (fuzzy) available modules for <module name> | `module spider python`                 |
+| `module spider <module name>`         | Searches (fuzzy) available modules for `<module name>` | `module spider python`                 |
 
 For a full list of module commands run `man module` or visit the [lmod documentation](https://lmod.readthedocs.io/en/latest/010_user.html).
 

diff --git a/docs/Storage/Cloud_Storage/.pages.yml b/docs/Storage/Cloud_Storage/.pages.yml
diff --git a/docs/Storage/Data_Recovery/.pages.yml b/docs/Storage/Data_Recovery/.pages.yml
diff --git a/docs/Storage/Data_Recovery/File_Recovery.md → docs/Storage/File_Recovery.md b/docs/Storage/Data_Recovery/File_Recovery.md → docs/Storage/File_Recovery.md
diff --git a/docs/Storage/File_Systems_and_Quotas/Filesystems_and_Quotas.md b/docs/Storage/File_Systems_and_Quotas/Filesystems_and_Quotas.md
@@ -67,7 +67,7 @@ but will prevent creation of new data or files.
 ### /home
 
 This filesystem is accessible from login, compute and ancillary nodes.
-Users should **not** run jobs from this filesystem. [Snapshots](../Data_Recovery/File_Recovery.md) are taken of all home directories
+Users should **not** run jobs from this filesystem. [Snapshots](../File_Recovery.md) are taken of all home directories
 daily.
 No cleaning policy will be applied to your home directory as long as
 your user account is active and you are a member of at least one
@@ -76,7 +76,7 @@ active project.
 ### /nesi/project
 
 This filesystem is accessible from all login, compute and ancillary
-nodes. [Snapshots](../Data_Recovery/File_Recovery.md) are taken daily. No
+nodes. [Snapshots](../File_Recovery.md) are taken daily. No
 cleaning policy is applied.
 
 It provides storage space for datasets, shared code or configuration
@@ -132,7 +132,7 @@ See more information about the long term storage see our
 ## Snapshots
 
 If you have accidentally deleted data you can recover it from
-a [snapshot](../Data_Recovery/File_Recovery.md).
+a [snapshot](../File_Recovery.md).
 Snapshots are taken daily of `home/` and `project` directories If you
 cannot find it in a snapshot, please ask us to recover it for you by
 {% include "partials/support_request.html" %}
diff --git a/...orage/Research_Developer_Cloud_Storage.md → ...orage/Research_Developer_Cloud_Storage.md b/...orage/Research_Developer_Cloud_Storage.md → ...orage/Research_Developer_Cloud_Storage.md