-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function.py
More file actions
154 lines (128 loc) · 4.8 KB
/
lambda_function.py
File metadata and controls
154 lines (128 loc) · 4.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""
AWS Lambda function for AI SEO optimization - llms.txt checker and generator.
"""
import json
import sys
import os
# Add src directory to path for imports
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from src.llms_txt_checker import LLMSTxtChecker
from src.llms_txt_validator import LLMSTxtValidator
from src.website_crawler import WebsiteCrawler
from src.llms_txt_generator import LLMSTxtGenerator
def lambda_handler(event, context):
    """
    AWS Lambda handler: check, validate, or generate an llms.txt file for a URL.

    Expected event structure:
        {
            "url": "https://example.com",
            "options": {
                "max_pages": 20,  # optional, default 20
                "timeout": 10     # optional, default 10 (seconds)
            }
        }

    Returns an API Gateway-style response dict:
        {
            "statusCode": 200 | 400 | 500,
            "headers": {...},
            "body": JSON string containing url, llms_txt_exists, and either
                    validation details (when llms.txt exists) or generated
                    content (when it does not), plus recommendations;
                    on failure, an "error" message (and exception "type"
                    for unexpected 500s).
        }
    """
    # Shared headers so every response — success or error — is CORS-enabled.
    # Previously only the 200 path set these, leaving browser clients unable
    # to read 400/500 bodies cross-origin.
    headers = {
        'Content-Type': 'application/json',
        'Access-Control-Allow-Origin': '*'
    }

    try:
        # Parse input: API Gateway proxy integration delivers the payload as
        # a JSON string in event['body']; direct invocation passes the dict.
        if isinstance(event.get('body'), str):
            try:
                body = json.loads(event['body'])
            except json.JSONDecodeError:
                # Malformed client input is a 400, not an internal 500.
                return {
                    'statusCode': 400,
                    'headers': headers,
                    'body': json.dumps({
                        'error': 'Request body is not valid JSON'
                    })
                }
        else:
            body = event

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'headers': headers,
                'body': json.dumps({
                    'error': 'Missing required parameter: url'
                })
            }

        # `or {}` guards against an explicit "options": null in the payload.
        options = body.get('options') or {}
        max_pages = options.get('max_pages', 20)
        timeout = options.get('timeout', 10)

        # Initialize components
        checker = LLMSTxtChecker(timeout=timeout)
        validator = LLMSTxtValidator()
        crawler = WebsiteCrawler(max_pages=max_pages, timeout=timeout)
        generator = LLMSTxtGenerator()

        # Step 1: Check if llms.txt exists at the target site.
        check_result = checker.check_llms_txt(url)
        response_data = {
            'url': url,
            'llms_txt_exists': check_result['exists'],
            'llms_txt_url': check_result.get('url')
        }

        if check_result['exists']:
            # Step 2a: Validate the existing llms.txt and report a score.
            content = check_result.get('content', '')
            validation_result = validator.validate(content)
            response_data['validation'] = {
                'valid': validation_result['valid'],
                'score': validation_result['score'],
                'issues': validation_result['issues'],
                'warnings': validation_result['warnings'],
                'sections_found': validation_result['sections_found'],
                'links_count': len(validation_result['links_found']),
                'has_optional_section': validation_result['has_optional_section']
            }
            response_data['current_llms_txt'] = content

            # Crawl the site to produce improvement recommendations
            # alongside the validation result.
            site_info = crawler.crawl(url)
            response_data['recommendations'] = generator.generate_recommendations(
                site_info,
                validation_result
            )
        else:
            # Step 2b: No llms.txt — crawl the site and generate one.
            site_info = crawler.crawl(url)
            if 'error' in site_info:
                return {
                    'statusCode': 500,
                    'headers': headers,
                    'body': json.dumps({
                        'error': f'Failed to crawl website: {site_info["error"]}'
                    })
                }
            generated_content = generator.generate(site_info)
            response_data['generated_llms_txt'] = generated_content
            response_data['site_info'] = {
                'title': site_info.get('title'),
                'description': site_info.get('description'),
                'documentation_links_found': len(site_info.get('documentation_links', [])),
                'navigation_links_found': len(site_info.get('navigation_links', []))
            }
            response_data['recommendations'] = generator.generate_recommendations(site_info)

        return {
            'statusCode': 200,
            'headers': headers,
            'body': json.dumps(response_data, indent=2)
        }

    except Exception as e:
        # Top-level boundary: report the failure to the caller rather than
        # letting Lambda surface an unhandled invocation error.
        return {
            'statusCode': 500,
            'headers': headers,
            'body': json.dumps({
                'error': str(e),
                'type': type(e).__name__
            })
        }
# Allow running this module directly as a local smoke test.
if __name__ == '__main__':
    sample_event = {'url': 'https://fastht.ml'}
    response = lambda_handler(sample_event, None)
    # Pretty-print the JSON body for readability on the console.
    parsed_body = json.loads(response['body'])
    print(json.dumps(parsed_body, indent=2))