From 7e8fe77c0ca1fd7ab6f4d552bc2a933aaf21dc03 Mon Sep 17 00:00:00 2001 From: Martin Carlsson Date: Tue, 6 Jan 2026 21:15:26 +0100 Subject: [PATCH] =?UTF-8?q?Add=20tracking=20of=20which=20=C3=A4ndringsf?= =?UTF-8?q?=C3=B6rfattning=20repealed=20each=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements selex:upphavd_av attribute to link repealed sections to their repealing ändringsförfattning. This resolves the TODO at format_sfs_text.py:718. Changes: - Add temporal/parse_anteckningar.py: Parser for Swedish amendment notes (anteckningar) that extracts repealed, amended, and new paragraphs - Modify sfs_processor.py: Build repeal_map from andringsforfattningar and pass to formatter - Modify formatters/format_sfs_text.py: Add selex:upphavd_av attribute to repealed sections with helper function for ID normalization - Add comprehensive test coverage: 27 parser tests + 8 integration tests Example output: <section ... selex:upphavd="true" selex:upphavd_av="2024:796">
All 68 tests passing. Validated with real data from 2010:800.json. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- formatters/format_sfs_text.py | 62 +++++++- sfs_processor.py | 18 ++- temporal/parse_anteckningar.py | 155 ++++++++++++++++++++ test/test_format_sfs_text.py | 151 ++++++++++++++++++++ test/test_parse_anteckningar.py | 241 ++++++++++++++++++++++++++++++++ 5 files changed, 621 insertions(+), 6 deletions(-) create mode 100644 temporal/parse_anteckningar.py create mode 100644 test/test_parse_anteckningar.py diff --git a/formatters/format_sfs_text.py b/formatters/format_sfs_text.py index e3c4366..a1d9eb9 100644 --- a/formatters/format_sfs_text.py +++ b/formatters/format_sfs_text.py @@ -30,7 +30,7 @@ """ import re -from typing import Optional +from typing import Optional, Dict from .apply_links import apply_sfs_links, apply_internal_links, apply_eu_links, apply_law_name_links from util.text_utils import WHITESPACE_PATTERN @@ -149,7 +149,7 @@ def _adjust_heading_level_for_avdelning(base_level: str, inside_avdelning: bool) return base_level -def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str: +def format_sfs_text_as_markdown(text: str, apply_links: bool = False, repeal_map: Optional[Dict[str, str]] = None) -> str: """ Formattera texten från en författningstext importerad från Regeringskansliets rättsdatabas till Markdown-format. 
@@ -159,6 +159,7 @@ def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str: Args: text (str): Texten som ska formateras apply_links (bool): Om True, konvertera både interna paragrafnummer och SFS-beteckningar till markdown-länkar + repeal_map (Optional[Dict[str, str]]): Map of section IDs to the ändringsförfattning beteckning that repealed them Returns: str: Den formaterade texten @@ -500,7 +501,7 @@ def _is_section_ikraft(header_line: str, content: str) -> bool: re.search(INTOFORCE_ANY_PATTERN, content_lower) is not None) -def parse_logical_sections(text: str) -> str: +def parse_logical_sections(text: str, repeal_map: Optional[Dict[str, str]] = None) -> str: """ Dela upp texten i logiska sektioner baserat på Markdown-rubriker och omslut varje rubrik och dess innehåll med
-taggar. @@ -715,7 +716,13 @@ def process_current_section(): if upphor_datum: attributes.append(f'selex:upphor_datum="{upphor_datum}"') if has_upphavd: - attributes.append('selex:upphavd="true"') # TODO: Peka ut i vilken ändringsförfattning den upphävdes + attributes.append('selex:upphavd="true"') + + # Track which ändringsförfattning repealed this section + if repeal_map and section_id: + upphavd_av = _find_repeal_source(section_id, repeal_map) + if upphavd_av: + attributes.append(f'selex:upphavd_av="{upphavd_av}"') if ikraft_villkor: attributes.append(f'selex:ikraft_villkor="{ikraft_villkor}"') @@ -799,6 +806,53 @@ def process_current_section(): return '\n'.join(result) +def _find_repeal_source(section_id: str, repeal_map: Dict[str, str]) -> Optional[str]: + """ + Find which ändringsförfattning repealed this section. + + Tries multiple normalized forms to match section_id against repeal_map. + Example: section_id 'kap29.15' matches '29kap15§' in repeal_map + + Args: + section_id: Section ID like 'kap29.15' or 'kap1.15a' (generated by generate_section_id) + repeal_map: Map of normalized references to beteckning (e.g., '29kap15§' -> '2024:796') + + Returns: + Beteckning of repealing ändringsförfattning, or None + """ + # Direct match + if section_id in repeal_map: + return repeal_map[section_id] + + # Try normalized forms with § symbol added + # section_id format from generate_section_id: 'kap29.15' or 'kap2.15a' or just '15' + + # Try matching chapter.paragraph format (e.g., 'kap29.15') + match = re.match(r'kap(\d+[a-z]?)\.(\d+[a-z]?)$', section_id) + if match: + chapter = match.group(1) + paragraph = match.group(2) + + # Try normalized forms + variants = [ + f"{chapter}kap{paragraph}§", # '29kap15§' + f"{chapter}kap.{paragraph}§", # '29kap.15§' + f"{paragraph}§", # '15§' (chapter-less) + ] + + for variant in variants: + if variant in repeal_map: + return repeal_map[variant] + + # Try matching simple paragraph number (e.g., '15' or '15a') + match = 
re.match(r'^(\d+[a-z]?)$', section_id) + if match: + paragraph = match.group(1) + variant = f"{paragraph}§" + if variant in repeal_map: + return repeal_map[variant] + + return None def check_unprocessed_temporal_sections(text: str) -> None: diff --git a/sfs_processor.py b/sfs_processor.py index 1a27898..3f215ca 100644 --- a/sfs_processor.py +++ b/sfs_processor.py @@ -36,6 +36,7 @@ from temporal.title_temporal import title_temporal from temporal.amendments import extract_amendments from temporal.apply_temporal import apply_temporal, is_document_content_empty, add_empty_document_message +from temporal.parse_anteckningar import parse_anteckningar from exporters.git import create_init_git_commit from util.yaml_utils import format_yaml_value from util.datetime_utils import format_datetime @@ -434,11 +435,24 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal # Use the ignored content body (already includes heading) markdown_body = ignored_body else: + # Build repeal map from amendments to track which ändringsförfattning repealed each paragraph + repeal_map = {} + if data.get('andringsforfattningar'): + for amendment in data['andringsforfattningar']: + beteckning = amendment.get('beteckning') + anteckningar = amendment.get('anteckningar', '') + + if beteckning and anteckningar: + parsed = parse_anteckningar(anteckningar) + # Map each repealed paragraph to this amendment + for repealed_ref in parsed.get('repealed', []): + repeal_map[repealed_ref] = beteckning + # Format the content text to markdown - formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links) + formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links, repeal_map=repeal_map) # Apply section tags - formatted_text = parse_logical_sections(formatted_text) + formatted_text = parse_logical_sections(formatted_text, repeal_map=repeal_map) # Debug: Check if formatting resulted in empty text if not formatted_text.strip(): diff --git 
a/temporal/parse_anteckningar.py b/temporal/parse_anteckningar.py new file mode 100644 index 0000000..67003b9 --- /dev/null +++ b/temporal/parse_anteckningar.py @@ -0,0 +1,155 @@ +""" +Parser for Swedish legal amendment notes (anteckningar). + +This module parses the anteckningar field from ändringsförfattningar to extract +structured information about which paragraphs were repealed, amended, or added. + +Example anteckningar: + "upph. 29 kap. 15, 16 §§, rubr. närmast före 29 kap. 15 §; ändr. 10 kap. 37 §" + +Parsed result: + { + 'repealed': ['29kap15§', '29kap16§'], + 'amended': ['10kap37§'], + 'new': [] + } +""" + +import re +from typing import Dict, List + + +def parse_anteckningar(anteckningar: str) -> Dict[str, List[str]]: + """ + Parse Swedish amendment notes into structured data. + + Args: + anteckningar: The anteckningar string from an ändringsförfattning + + Returns: + Dictionary with keys: + - 'repealed': List of normalized paragraph references that were repealed (upph.) + - 'amended': List of normalized paragraph references that were amended (ändr.) + - 'new': List of normalized paragraph references that were added (ny/nya) + + Example: + >>> parse_anteckningar("upph. 29 kap. 15, 16 §§; ändr. 10 kap. 37 §") + {'repealed': ['29kap15§', '29kap16§'], 'amended': ['10kap37§'], 'new': []} + """ + result = { + 'repealed': [], + 'amended': [], + 'new': [] + } + + if not anteckningar or not anteckningar.strip(): + return result + + # Split on semicolons to separate major clauses + clauses = anteckningar.split(';') + + for clause in clauses: + clause = clause.strip() + if not clause: + continue + + # Identify action type and extract paragraphs + if clause.startswith('upph.'): + paragraphs = _extract_paragraphs(clause[5:].strip()) # Remove 'upph.' + result['repealed'].extend(paragraphs) + elif clause.startswith('ändr.'): + paragraphs = _extract_paragraphs(clause[5:].strip()) # Remove 'ändr.' 
+ result['amended'].extend(paragraphs) + elif clause.startswith('ny ') or clause.startswith('nya '): + # Extract after 'ny ' or 'nya ' + start_idx = 3 if clause.startswith('nya') else 2 + paragraphs = _extract_paragraphs(clause[start_idx:].strip()) + result['new'].extend(paragraphs) + + return result + + +def _extract_paragraphs(text: str) -> List[str]: + """ + Extract normalized paragraph references from a text fragment. + + Handles patterns like: + - "29 kap. 15 §" → ['29kap15§'] + - "29 kap. 15, 16 §§" → ['29kap15§', '29kap16§'] + - "15 §" → ['15§'] + - "23 kap." → ['23kap'] (chapter-level, Phase 2) + + Args: + text: Text fragment after the action keyword (upph./ändr./ny) + + Returns: + List of normalized paragraph references + """ + paragraphs = [] + + # Skip patterns we don't handle yet (Phase 2) + if 'rubr.' in text or 'betecknas' in text or 'nuvarande' in text: + # Log for future enhancement but don't extract + # These are complex patterns for Phase 2 + pass + + # Pattern 1: Chapter + paragraphs + # Examples: "29 kap. 15, 16 §§", "29 kap. 15 §", "2 kap. 32, 33 §§" + chapter_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.\s*((?:\d+(?:\s*[a-z])?(?:\s*,\s*)?)+)\s*§' + + for match in re.finditer(chapter_pattern, text, re.IGNORECASE): + chapter = match.group(1).replace(' ', '').lower() + para_list = match.group(2) + + # Split on commas to get individual paragraph numbers + para_numbers = [p.strip().replace(' ', '').lower() for p in para_list.split(',')] + + for para_num in para_numbers: + if para_num: # Skip empty strings + normalized = f"{chapter}kap{para_num}§" + paragraphs.append(normalized) + + # Pattern 2: Chapter only (for chapter-level changes) + # Example: "23 kap." 
(without paragraph reference) + # Note: This is for Phase 2, but we detect it for completeness + chapter_only_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.(?!\s*\d)' + + for match in re.finditer(chapter_only_pattern, text, re.IGNORECASE): + chapter = match.group(1).replace(' ', '').lower() + # Chapter-level change - skip for Phase 1 + # In Phase 2, we'd add: paragraphs.append(f"{chapter}kap") + pass + + # Pattern 3: Paragraph without chapter + # Examples: "15 §", "15, 16 §§" + # These references are ambiguous without chapter context + para_only_pattern = r'(? str: + """ + Create a normalized section reference. + + Args: + chapter: Chapter number (e.g., '29', '2a') + paragraph: Paragraph number (e.g., '15', '15a') + + Returns: + Normalized reference (e.g., '29kap15§', '2akap15a§') + """ + chapter_clean = chapter.replace(' ', '').lower() + para_clean = paragraph.replace(' ', '').lower() + return f"{chapter_clean}kap{para_clean}§" diff --git a/test/test_format_sfs_text.py b/test/test_format_sfs_text.py index 63a0b05..48b6573 100644 --- a/test/test_format_sfs_text.py +++ b/test/test_format_sfs_text.py @@ -569,3 +569,154 @@ def test_avdelning_sections_with_parse_logical_sections(self): # Verify the AVDELNING header is still present assert "AVDELNING I" in result + + +# =========================================================================== +# Repeal Map Integration Tests +# =========================================================================== + +@pytest.mark.unit +class TestRepealMapIntegration: + """Test the repeal_map functionality for tracking ändringsförfattningar.""" + + def test_section_with_repeal_map_gets_upphavd_av_attribute(self): + """Test that a repealed section gets the upphavd_av attribute when repeal_map is provided.""" + text = """#### 15 § + +Denna paragraf har upphävts.""" + + # Create a repeal map indicating that paragraph 15 was repealed by 2024:796 + repeal_map = {'15§': '2024:796'} + + result = parse_logical_sections(text, repeal_map=repeal_map) 
+ + # Should have upphavd status + assert 'selex:upphavd="true"' in result + + # Should have upphavd_av attribute + assert 'selex:upphavd_av="2024:796"' in result + + def test_section_with_chapter_repeal_map(self): + """Test that a repealed section with chapter reference gets the upphavd_av attribute.""" + text = """## 29 kap. + +#### 15 § + +Denna paragraf har upphävts.""" + + # Create a repeal map with chapter reference + repeal_map = {'29kap15§': '2024:796'} + + result = parse_logical_sections(text, repeal_map=repeal_map) + + # Should have upphavd status + assert 'selex:upphavd="true"' in result + + # Should have upphavd_av attribute + assert 'selex:upphavd_av="2024:796"' in result + + def test_section_without_match_no_upphavd_av(self): + """Test that a section without a match in repeal_map doesn't get upphavd_av attribute.""" + text = """#### 15 § + +Normal content.""" + + # Create a repeal map that doesn't include paragraph 15 + repeal_map = {'16§': '2024:796'} + + result = parse_logical_sections(text, repeal_map=repeal_map) + + # Should not have upphavd status (no "upphävd" in content) + assert 'selex:upphavd="true"' not in result + + # Should not have upphavd_av attribute + assert 'selex:upphavd_av' not in result + + def test_backward_compatibility_no_repeal_map(self): + """Test that parse_logical_sections works without repeal_map (backward compatibility).""" + text = """#### 15 § + +Denna paragraf har upphävts.""" + + # Call without repeal_map + result = parse_logical_sections(text) + + # Should have upphavd status (from content) + assert 'selex:upphavd="true"' in result + + # Should not have upphavd_av attribute + assert 'selex:upphavd_av' not in result + + def test_multiple_repealed_sections(self): + """Test handling multiple repealed sections with different ändringsförfattningar.""" + text = """## 29 kap. + +#### 15 § + +Denna paragraf har upphävts. 
+ +#### 16 § + +Denna paragraf har upphävts.""" + + # Different amendments repealed different paragraphs + repeal_map = { + '29kap15§': '2024:796', + '29kap16§': '2023:123' + } + + result = parse_logical_sections(text, repeal_map=repeal_map) + + # Both should have upphavd_av attributes + assert 'selex:upphavd_av="2024:796"' in result + assert 'selex:upphavd_av="2023:123"' in result + + def test_format_sfs_text_with_repeal_map(self): + """Test that format_sfs_text_as_markdown accepts and passes repeal_map.""" + text = """29 kap. + +15 § + +Denna paragraf har upphävts.""" + + repeal_map = {'29kap15§': '2024:796'} + + # format_sfs_text_as_markdown should accept repeal_map + formatted = format_sfs_text_as_markdown(text, repeal_map=repeal_map) + + # Then parse_logical_sections should be called with it + result = parse_logical_sections(formatted, repeal_map=repeal_map) + + # Should have the upphavd_av attribute + assert 'selex:upphavd_av="2024:796"' in result + + def test_repeal_map_with_paragraph_letter(self): + """Test repeal map with paragraph letters (e.g., 15a§).""" + text = """## 2 kap. + +#### 33 a § + +Denna paragraf har upphävts.""" + + repeal_map = {'2kap33a§': '2024:500'} + + result = parse_logical_sections(text, repeal_map=repeal_map) + + # Should have upphavd_av attribute + assert 'selex:upphavd_av="2024:500"' in result + + def test_repeal_map_normalization(self): + """Test that section ID normalization works for matching.""" + text = """## 10 kap. 
+ +#### 37 § + +Denna paragraf har upphävts.""" + + # Repeal map uses different notation + repeal_map = {'10kap37§': '2020:100'} + + result = parse_logical_sections(text, repeal_map=repeal_map) + + # Should match despite different notation + assert 'selex:upphavd_av="2020:100"' in result diff --git a/test/test_parse_anteckningar.py b/test/test_parse_anteckningar.py new file mode 100644 index 0000000..a287d14 --- /dev/null +++ b/test/test_parse_anteckningar.py @@ -0,0 +1,241 @@ +""" +Tests for the anteckningar parser (temporal/parse_anteckningar.py). +""" + +import pytest +from temporal.parse_anteckningar import parse_anteckningar + + +class TestParseAnteckningar: + """Test parsing of Swedish amendment notes.""" + + def test_empty_string(self): + """Empty anteckningar should return empty lists.""" + result = parse_anteckningar("") + assert result == {'repealed': [], 'amended': [], 'new': []} + + def test_none_input(self): + """None input should return empty lists.""" + result = parse_anteckningar(None) + assert result == {'repealed': [], 'amended': [], 'new': []} + + # --- REPEALED (upph.) Tests --- + + def test_upph_simple_paragraph(self): + """Parse simple repealed paragraph without chapter.""" + result = parse_anteckningar("upph. 15 §") + assert '15§' in result['repealed'] + assert len(result['repealed']) == 1 + + def test_upph_with_chapter(self): + """Parse repealed paragraph with chapter.""" + result = parse_anteckningar("upph. 29 kap. 15 §") + assert '29kap15§' in result['repealed'] + assert len(result['repealed']) == 1 + + def test_upph_paragraph_with_letter(self): + """Parse repealed paragraph with letter suffix.""" + result = parse_anteckningar("upph. 29 kap. 22 a §") + assert '29kap22a§' in result['repealed'] + assert len(result['repealed']) == 1 + + def test_upph_multiple_paragraphs_same_chapter(self): + """Parse multiple repealed paragraphs in same chapter.""" + result = parse_anteckningar("upph. 29 kap. 
15, 16 §§") + assert '29kap15§' in result['repealed'] + assert '29kap16§' in result['repealed'] + assert len(result['repealed']) == 2 + + def test_upph_multiple_paragraphs_with_letters(self): + """Parse multiple paragraphs including one with letter.""" + result = parse_anteckningar("upph. 2 kap. 32, 33 §§") + assert '2kap32§' in result['repealed'] + assert '2kap33§' in result['repealed'] + assert len(result['repealed']) == 2 + + def test_upph_real_example_1(self): + """Real example from 2010:800.json.""" + anteckningar = "upph. 29 kap. 15, 16 §§, rubr. närmast före 29 kap. 15 §; ändr. 10 kap. 37 §, 11 kap. 36 §" + result = parse_anteckningar(anteckningar) + + # Should extract the repealed paragraphs + assert '29kap15§' in result['repealed'] + assert '29kap16§' in result['repealed'] + + # Should also extract the amended paragraphs + assert '10kap37§' in result['amended'] + assert '11kap36§' in result['amended'] + + def test_upph_real_example_2(self): + """Real example with chapter-level repeal.""" + anteckningar = "upph. 23 kap., 29 kap. 22 a §" + result = parse_anteckningar(anteckningar) + + # Should extract the paragraph repeal (chapter-level is Phase 2) + assert '29kap22a§' in result['repealed'] + + def test_upph_real_example_3(self): + """Real example with single paragraph.""" + anteckningar = "upph. 15 kap. 23 §; ändr. 3 kap. 2, 4, 5, 7, 10, 12 i §§" + result = parse_anteckningar(anteckningar) + + # Should extract the repealed paragraph + assert '15kap23§' in result['repealed'] + assert len(result['repealed']) == 1 + + # Should extract amended paragraphs (note: "12 i" has a letter) + assert '3kap2§' in result['amended'] + assert '3kap4§' in result['amended'] + assert '3kap5§' in result['amended'] + + # --- AMENDED (ändr.) Tests --- + + def test_andr_simple(self): + """Parse simple amended paragraph.""" + result = parse_anteckningar("ändr. 29 kap. 
6 §") + assert '29kap6§' in result['amended'] + assert len(result['amended']) == 1 + + def test_andr_multiple_chapters(self): + """Parse amended paragraphs from multiple chapters.""" + result = parse_anteckningar("ändr. 15 kap. 19 §, 18 kap. 19 §, 19 kap. 27 §") + assert '15kap19§' in result['amended'] + assert '18kap19§' in result['amended'] + assert '19kap27§' in result['amended'] + assert len(result['amended']) == 3 + + def test_andr_multiple_same_chapter(self): + """Parse multiple amended paragraphs in same chapter.""" + result = parse_anteckningar("ändr. 10 kap. 4 §, 12 kap. 4 §, 13 kap. 4 §") + assert '10kap4§' in result['amended'] + assert '12kap4§' in result['amended'] + assert '13kap4§' in result['amended'] + + def test_andr_real_example(self): + """Real example with many amended paragraphs.""" + anteckningar = "ändr. 26 kap. 10, 15, 17, 18 §§, 28 kap. 2, 5 §§" + result = parse_anteckningar(anteckningar) + + # Chapter 26 paragraphs + assert '26kap10§' in result['amended'] + assert '26kap15§' in result['amended'] + assert '26kap17§' in result['amended'] + assert '26kap18§' in result['amended'] + + # Chapter 28 paragraphs + assert '28kap2§' in result['amended'] + assert '28kap5§' in result['amended'] + + # --- NEW (ny/nya) Tests --- + + def test_nya_simple(self): + """Parse new paragraphs.""" + result = parse_anteckningar("nya 26 kap. 14 a, 14 b, 14 c §§") + assert '26kap14a§' in result['new'] + assert '26kap14b§' in result['new'] + assert '26kap14c§' in result['new'] + + def test_ny_single(self): + """Parse single new paragraph with 'ny' (singular).""" + result = parse_anteckningar("ny 3 kap. 11 a §") + assert '3kap11a§' in result['new'] + + def test_nya_real_example(self): + """Real example with multiple new paragraphs.""" + anteckningar = "nya 26 kap. 
14 a, 14 b, 14 c, 16 a, 16 b, 16 c, 16 d §§" + result = parse_anteckningar(anteckningar) + + assert '26kap14a§' in result['new'] + assert '26kap14b§' in result['new'] + assert '26kap14c§' in result['new'] + assert '26kap16a§' in result['new'] + assert '26kap16b§' in result['new'] + assert '26kap16c§' in result['new'] + assert '26kap16d§' in result['new'] + + # --- MIXED Tests --- + + def test_mixed_upph_and_andr(self): + """Parse mixed upph and ändr clauses.""" + result = parse_anteckningar("upph. 15 §; ändr. 20 §") + assert '15§' in result['repealed'] + assert '20§' in result['amended'] + + def test_complex_real_example(self): + """Complex real example with upph, ändr, and nya.""" + anteckningar = ("upph. 29 kap. 15, 16 §§, rubr. närmast före 29 kap. 15 §; " + "ändr. 10 kap. 37 §, 11 kap. 36 §, 12 kap. 24 §; " + "nya 26 kap. 14 a, 14 b §§") + result = parse_anteckningar(anteckningar) + + # Repealed + assert '29kap15§' in result['repealed'] + assert '29kap16§' in result['repealed'] + + # Amended + assert '10kap37§' in result['amended'] + assert '11kap36§' in result['amended'] + assert '12kap24§' in result['amended'] + + # New + assert '26kap14a§' in result['new'] + assert '26kap14b§' in result['new'] + + # --- EDGE CASES --- + + def test_whitespace_handling(self): + """Parser should handle extra whitespace.""" + result = parse_anteckningar("upph. 29 kap. 15 §") + assert '29kap15§' in result['repealed'] + + def test_case_insensitive(self): + """Parser should handle different cases.""" + # Note: The regex uses re.IGNORECASE + result = parse_anteckningar("upph. 29 KAP. 15 §") + assert '29kap15§' in result['repealed'] + + def test_ignores_rubr_patterns(self): + """Parser should skip 'rubr.' patterns (Phase 2).""" + result = parse_anteckningar("upph. 15 §, rubr. 
närmast före 15 §") + # Should extract the paragraph but not process rubr + assert '15§' in result['repealed'] + + def test_ignores_betecknas_patterns(self): + """Parser should skip 'betecknas' patterns (Phase 2).""" + anteckningar = "upph. 2 kap. 33 §; nuvarande 2 kap. 32 § betecknas 2 kap. 33 §" + result = parse_anteckningar(anteckningar) + # Should extract the repealed paragraph but not process betecknas + assert '2kap33§' in result['repealed'] + + def test_multiple_semicolons(self): + """Parser should handle multiple semicolons correctly.""" + result = parse_anteckningar("upph. 15 §; ändr. 20 §; nya 25 §") + assert '15§' in result['repealed'] + assert '20§' in result['amended'] + assert '25§' in result['new'] + + +class TestNormalization: + """Test that normalization produces consistent format.""" + + def test_removes_spaces(self): + """Normalized references should have no spaces.""" + result = parse_anteckningar("upph. 29 kap. 15 a §") + assert '29kap15a§' in result['repealed'] + # Verify no spaces + assert ' ' not in result['repealed'][0] + + def test_lowercase(self): + """Normalized references should be lowercase.""" + result = parse_anteckningar("upph. 29 KAP. 15 A §") + assert '29kap15a§' in result['repealed'] + # Verify lowercase + assert result['repealed'][0] == result['repealed'][0].lower() + + def test_consistent_format(self): + """All references should follow the same format.""" + result = parse_anteckningar("upph. 29 kap. 15 §, 2 kap. 33 a §") + for ref in result['repealed']: + # Should match pattern: \d+kap\d+[a-z]?§ + assert 'kap' in ref + assert ref.endswith('§')