Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions formatters/format_sfs_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"""

import re
from typing import Optional
from typing import Optional, Dict
from .apply_links import apply_sfs_links, apply_internal_links, apply_eu_links, apply_law_name_links
from util.text_utils import WHITESPACE_PATTERN

Expand Down Expand Up @@ -149,7 +149,7 @@ def _adjust_heading_level_for_avdelning(base_level: str, inside_avdelning: bool)
return base_level


def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str:
def format_sfs_text_as_markdown(text: str, apply_links: bool = False, repeal_map: Optional[Dict[str, str]] = None) -> str:
"""
Formattera texten från en författningstext importerad från
Regeringskansliets rättsdatabas till Markdown-format.
Expand All @@ -159,6 +159,7 @@ def format_sfs_text_as_markdown(text: str, apply_links: bool = False) -> str:
Args:
text (str): Texten som ska formateras
apply_links (bool): Om True, konvertera både interna paragrafnummer och SFS-beteckningar till markdown-länkar
repeal_map (Optional[Dict[str, str]]): Map of section IDs to the ändringsförfattning beteckning that repealed them

Returns:
str: Den formaterade texten
Expand Down Expand Up @@ -500,7 +501,7 @@ def _is_section_ikraft(header_line: str, content: str) -> bool:
re.search(INTOFORCE_ANY_PATTERN, content_lower) is not None)


def parse_logical_sections(text: str) -> str:
def parse_logical_sections(text: str, repeal_map: Optional[Dict[str, str]] = None) -> str:
"""
Dela upp texten i logiska sektioner baserat på Markdown-rubriker och omslut
varje rubrik och dess innehåll med <section>-taggar.
Expand Down Expand Up @@ -715,7 +716,13 @@ def process_current_section():
if upphor_datum:
attributes.append(f'selex:upphor_datum="{upphor_datum}"')
if has_upphavd:
attributes.append('selex:upphavd="true"') # TODO: Peka ut i vilken ändringsförfattning den upphävdes
attributes.append('selex:upphavd="true"')

# Track which ändringsförfattning repealed this section
if repeal_map and section_id:
upphavd_av = _find_repeal_source(section_id, repeal_map)
if upphavd_av:
attributes.append(f'selex:upphavd_av="{upphavd_av}"')
if ikraft_villkor:
attributes.append(f'selex:ikraft_villkor="{ikraft_villkor}"')

Expand Down Expand Up @@ -799,6 +806,53 @@ def process_current_section():
return '\n'.join(result)


def _find_repeal_source(section_id: str, repeal_map: Dict[str, str]) -> Optional[str]:
"""
Find which ändringsförfattning repealed this section.

Tries multiple normalized forms to match section_id against repeal_map.
Example: section_id 'kap29.15' matches '29kap15§' in repeal_map

Args:
section_id: Section ID like 'kap29.15' or 'kap1.15a' (generated by generate_section_id)
repeal_map: Map of normalized references to beteckning (e.g., '29kap15§' -> '2024:796')

Returns:
Beteckning of repealing ändringsförfattning, or None
"""
# Direct match
if section_id in repeal_map:
return repeal_map[section_id]

# Try normalized forms with § symbol added
# section_id format from generate_section_id: 'kap29.15' or 'kap2.15a' or just '15'

# Try matching chapter.paragraph format (e.g., 'kap29.15')
match = re.match(r'kap(\d+[a-z]?)\.(\d+[a-z]?)$', section_id)
if match:
chapter = match.group(1)
paragraph = match.group(2)

# Try normalized forms
variants = [
f"{chapter}kap{paragraph}§", # '29kap15§'
f"{chapter}kap.{paragraph}§", # '29kap.15§'
f"{paragraph}§", # '15§' (chapter-less)
]

for variant in variants:
if variant in repeal_map:
return repeal_map[variant]

# Try matching simple paragraph number (e.g., '15' or '15a')
match = re.match(r'^(\d+[a-z]?)$', section_id)
if match:
paragraph = match.group(1)
variant = f"{paragraph}§"
if variant in repeal_map:
return repeal_map[variant]

return None


def check_unprocessed_temporal_sections(text: str) -> None:
Expand Down
18 changes: 16 additions & 2 deletions sfs_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
from temporal.title_temporal import title_temporal
from temporal.amendments import extract_amendments
from temporal.apply_temporal import apply_temporal, is_document_content_empty, add_empty_document_message
from temporal.parse_anteckningar import parse_anteckningar
from exporters.git import create_init_git_commit
from util.yaml_utils import format_yaml_value
from util.datetime_utils import format_datetime
Expand Down Expand Up @@ -434,11 +435,24 @@ def convert_to_markdown(data: Dict[str, Any], fetch_predocs_from_api: bool = Fal
# Use the ignored content body (already includes heading)
markdown_body = ignored_body
else:
# Build repeal map from amendments to track which ändringsförfattning repealed each paragraph
repeal_map = {}
if data.get('andringsforfattningar'):
for amendment in data['andringsforfattningar']:
beteckning = amendment.get('beteckning')
anteckningar = amendment.get('anteckningar', '')

if beteckning and anteckningar:
parsed = parse_anteckningar(anteckningar)
# Map each repealed paragraph to this amendment
for repealed_ref in parsed.get('repealed', []):
repeal_map[repealed_ref] = beteckning

# Format the content text to markdown
formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links)
formatted_text = format_sfs_text_as_markdown(innehall_text, apply_links=apply_links, repeal_map=repeal_map)

# Apply section tags
formatted_text = parse_logical_sections(formatted_text)
formatted_text = parse_logical_sections(formatted_text, repeal_map=repeal_map)

# Debug: Check if formatting resulted in empty text
if not formatted_text.strip():
Expand Down
155 changes: 155 additions & 0 deletions temporal/parse_anteckningar.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
"""
Parser for Swedish legal amendment notes (anteckningar).

This module parses the anteckningar field from ändringsförfattningar to extract
structured information about which paragraphs were repealed, amended, or added.

Example anteckningar:
"upph. 29 kap. 15, 16 §§, rubr. närmast före 29 kap. 15 §; ändr. 10 kap. 37 §"

Parsed result:
{
'repealed': ['29kap15§', '29kap16§'],
'amended': ['10kap37§'],
'new': []
}
"""

import re
from typing import Dict, List


def parse_anteckningar(anteckningar: str) -> Dict[str, List[str]]:
    """
    Turn an amendment note (anteckningar) into structured lists of references.

    The note is split on semicolons. Each clause is classified by its leading
    keyword, and the rest of the clause is handed to ``_extract_paragraphs``.
    Clauses that start with none of the known keywords are ignored.

    Args:
        anteckningar: The anteckningar string from an ändringsförfattning.

    Returns:
        Dictionary with the keys:
            - 'repealed': references from clauses starting with 'upph.'
            - 'amended': references from clauses starting with 'ändr.'
            - 'new': references from clauses starting with 'ny ' or 'nya '

    Example:
        >>> parse_anteckningar("upph. 29 kap. 15, 16 §§; ändr. 10 kap. 37 §")
        {'repealed': ['29kap15§', '29kap16§'], 'amended': ['10kap37§'], 'new': []}
    """
    parsed = {
        'repealed': [],
        'amended': [],
        'new': [],
    }

    if not (anteckningar and anteckningar.strip()):
        return parsed

    # (clause prefix, result bucket, number of leading characters to drop).
    # The remainder is stripped afterwards, so dropping only the keyword
    # letters of 'ny '/'nya ' is enough.
    actions = (
        ('upph.', 'repealed', 5),
        ('ändr.', 'amended', 5),
        ('nya ', 'new', 3),
        ('ny ', 'new', 2),
    )

    for piece in map(str.strip, anteckningar.split(';')):
        for prefix, bucket, skip in actions:
            if piece.startswith(prefix):
                remainder = piece[skip:].strip()
                parsed[bucket].extend(_extract_paragraphs(remainder))
                break

    return parsed


def _extract_paragraphs(text: str) -> List[str]:
"""
Extract normalized paragraph references from a text fragment.

Handles patterns like:
- "29 kap. 15 §" → ['29kap15§']
- "29 kap. 15, 16 §§" → ['29kap15§', '29kap16§']
- "15 §" → ['15§']
- "23 kap." → ['23kap'] (chapter-level, Phase 2)

Args:
text: Text fragment after the action keyword (upph./ändr./ny)

Returns:
List of normalized paragraph references
"""
paragraphs = []

# Skip patterns we don't handle yet (Phase 2)
if 'rubr.' in text or 'betecknas' in text or 'nuvarande' in text:
# Log for future enhancement but don't extract
# These are complex patterns for Phase 2
pass

# Pattern 1: Chapter + paragraphs
# Examples: "29 kap. 15, 16 §§", "29 kap. 15 §", "2 kap. 32, 33 §§"
chapter_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.\s*((?:\d+(?:\s*[a-z])?(?:\s*,\s*)?)+)\s*§'

for match in re.finditer(chapter_pattern, text, re.IGNORECASE):
chapter = match.group(1).replace(' ', '').lower()
para_list = match.group(2)

# Split on commas to get individual paragraph numbers
para_numbers = [p.strip().replace(' ', '').lower() for p in para_list.split(',')]

for para_num in para_numbers:
if para_num: # Skip empty strings
normalized = f"{chapter}kap{para_num}§"
paragraphs.append(normalized)

# Pattern 2: Chapter only (for chapter-level changes)
# Example: "23 kap." (without paragraph reference)
# Note: This is for Phase 2, but we detect it for completeness
chapter_only_pattern = r'(\d+(?:\s*[a-z])?)\s*kap\.(?!\s*\d)'

for match in re.finditer(chapter_only_pattern, text, re.IGNORECASE):
chapter = match.group(1).replace(' ', '').lower()
# Chapter-level change - skip for Phase 1
# In Phase 2, we'd add: paragraphs.append(f"{chapter}kap")
pass

# Pattern 3: Paragraph without chapter
# Examples: "15 §", "15, 16 §§"
# These references are ambiguous without chapter context
para_only_pattern = r'(?<!\d\s)(?<!kap\.\s)(\d+(?:\s*[a-z])?(?:\s*,\s*\d+(?:\s*[a-z])?)*)\s*§'

# Only match if there's no chapter context before it
if 'kap.' not in text:
for match in re.finditer(para_only_pattern, text, re.IGNORECASE):
para_list = match.group(1)
para_numbers = [p.strip().replace(' ', '').lower() for p in para_list.split(',')]

for para_num in para_numbers:
if para_num:
normalized = f"{para_num}§"
paragraphs.append(normalized)

return paragraphs


def _normalize_reference(chapter: str, paragraph: str) -> str:
"""
Create a normalized section reference.

Args:
chapter: Chapter number (e.g., '29', '2a')
paragraph: Paragraph number (e.g., '15', '15a')

Returns:
Normalized reference (e.g., '29kap15§', '2akap15a§')
"""
chapter_clean = chapter.replace(' ', '').lower()
para_clean = paragraph.replace(' ', '').lower()
return f"{chapter_clean}kap{para_clean}§"
Loading