diff --git a/.gitignore b/.gitignore index 038e94e..a741bee 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ .venv/ -.webui_secret_key -datasets/ \ No newline at end of file +.webui_secret_key \ No newline at end of file diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..b6d8b76 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11.8 diff --git a/agents4gov/.gitignore b/agents4gov/.gitignore new file mode 100644 index 0000000..038e94e --- /dev/null +++ b/agents4gov/.gitignore @@ -0,0 +1,3 @@ +.venv/ +.webui_secret_key +datasets/ \ No newline at end of file diff --git a/agents4gov/README.md b/agents4gov/README.md new file mode 100644 index 0000000..c936675 --- /dev/null +++ b/agents4gov/README.md @@ -0,0 +1,91 @@ + +# Agents4Gov + +**Laboratory of Computational Intelligence (LABIC – ICMC/USP)** + + +## Overview + +**Agents4Gov** is a research and development project from **LABIC – Institute of Mathematics and Computer Sciences (ICMC/USP)** focused on building **LLM-based tools** to support and modernize **public sector services**. +The project emphasizes **local Large Language Models (LLMs)** for privacy, **data anonymization**, and the **development and evaluation of tools** for use in government and institutional environments. + +--- + +## Installation + +### 1. Install the Open WebUI Server + +Agents4Gov is built on top of the **[Open WebUI](https://github.com/open-webui/open-webui)** framework, which serves as the base environment for loading and running tools. + +Before starting, ensure you are using **Python 3.11** to avoid compatibility issues. + +To install and run Open WebUI: + +```bash +# Install Open WebUI +pip install open-webui + +# Start the server +open-webui serve +``` + +After starting, the Open WebUI interface will be available at: +👉 **[http://localhost:8080](http://localhost:8080)** + +--- + +### 2. Clone the Agents4Gov Repository + +In the same environment, clone the Agents4Gov repository: + +```bash +git clone https://github.com/icmc-usp/Agents4Gov.git +``` + +The `tools/` directory inside the repository contains all implemented tools. + +--- + +### 3. Import Tools into Open WebUI + +Once Open WebUI is running: + +1. Access the **Tools** module in the Open WebUI interface. +2. Use the **Import Tool** option to add any of the tools from the `Agents4Gov/tools/` directory. +3. Each tool has its own documentation and configuration guide within its folder. + +Example: + +```bash +ls Agents4Gov/tools/ +``` + +Each subdirectory corresponds to an individual tool that can be imported, executed, and evaluated directly within Open WebUI. + +--- + +## Repository Structure + +``` +Agents4Gov/ +├── tools/ # Implemented tools for public services +├── data/ # Example or anonymized datasets +├── docs/ # Documentation and evaluation reports +├── config/ # Model and system configuration files +└── README.md +``` + +--- + +## Objectives + +* Develop and evaluate **LLM-based tools** focused on **public sector innovation**. +* Ensure **privacy-preserving** AI development using local LLMs and anonymized data. +* Provide a **modular and extensible** framework for integrating intelligent tools into public service environments. + +--- + +## License + +This project is licensed under the **MIT License**. +See the [LICENSE](LICENSE) file for details. 
diff --git a/agents4gov/config/README.md b/agents4gov/config/README.md new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/agents4gov/config/README.md @@ -0,0 +1 @@ + diff --git a/agents4gov/data/README.md b/agents4gov/data/README.md new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/agents4gov/data/README.md @@ -0,0 +1 @@ + diff --git a/agents4gov/docs/README.md b/agents4gov/docs/README.md new file mode 100644 index 0000000..1daf4c1 --- /dev/null +++ b/agents4gov/docs/README.md @@ -0,0 +1,52 @@ +# Agents4Gov Documentation + +Welcome to the Agents4Gov documentation. This directory contains guides and tutorials to help you work with the framework. + +## Project Structure + +Agents4Gov is built on top of **[Open WebUI](https://github.com/open-webui/open-webui)**, a framework for running LLM-based applications with tool integration. + +``` +Agents4Gov/ +├── tools/ # Implemented tools for public services +├── data/ # Example or anonymized datasets +├── docs/ # Documentation and evaluation reports +├── config/ # Model and system configuration files +└── README.md # Main project documentation +``` + +### Key Directories + +- **`tools/`** - Contains all tool implementations that can be imported into Open WebUI. Each tool is a Python class that provides specific functionality to agents. +- **`data/`** - Stores datasets used for testing and evaluation, with privacy-preserving anonymization. +- **`docs/`** - Documentation, tutorials, and research reports. +- **`config/`** - Configuration files for models and system settings. + +## Available Documentation + +- **[How to Create a Tool](how_to_create_tool.md)** - A comprehensive step-by-step guide for creating custom tools that can be used by agents. Learn about tool structure, parameter validation, error handling, and best practices. Reference implementation: `tools/open_alex_doi.py` + +## External Resources + +### Open WebUI Documentation + +Agents4Gov tools are designed to run within Open WebUI. For understanding the underlying framework: + +- **[Open WebUI GitHub](https://github.com/open-webui/open-webui)** - Main repository and source code +- **[Open WebUI Documentation](https://docs.openwebui.com/)** - Official documentation for installation, configuration, and usage +- **[Open WebUI Tools Guide](https://docs.openwebui.com/features/plugin/tools)** - Specific documentation on how tools work within Open WebUI + +### Getting Started with Open WebUI + +1. Install Open WebUI: `pip install open-webui` +2. Start the server: `open-webui serve` +3. Access the interface at [http://localhost:8080](http://localhost:8080) +4. Import Agents4Gov tools through the Tools module in the UI + +## Contributing + +When adding new documentation: +1. Create your markdown file in this `docs/` directory +2. Update this README.md with a link to your new document +3. Use clear, descriptive titles and include practical examples +4. Follow the structure and style of existing documentation diff --git a/agents4gov/docs/how_to_create_tool.md b/agents4gov/docs/how_to_create_tool.md new file mode 100644 index 0000000..bc2f814 --- /dev/null +++ b/agents4gov/docs/how_to_create_tool.md @@ -0,0 +1,518 @@ +# How to Create a Tool for Agents4Gov + +This guide will walk you through creating a tool that can be used by agents in the Agents4Gov framework. We'll use the `tools/open_alex_doi.py` file as a reference example. + +## Table of Contents +1. [Tool Structure Overview](#tool-structure-overview) +2. 
[Step 1: Set Up Basic Class Structure](#step-1-set-up-basic-class-structure) +3. [Step 2: Define Helper Methods](#step-2-define-helper-methods) +4. [Step 3: Create the Main Tool Method](#step-3-create-the-main-tool-method) +5. [Step 4: Add Parameter Definitions with Pydantic](#step-4-add-parameter-definitions-with-pydantic) +6. [Step 5: Write Comprehensive Docstrings](#step-5-write-comprehensive-docstrings) +7. [Step 6: Implement the Core Logic](#step-6-implement-the-core-logic) +8. [Step 7: Handle Errors Gracefully](#step-7-handle-errors-gracefully) +9. [Step 8: Return Structured Data](#step-8-return-structured-data) +10. [Best Practices](#best-practices) + +--- + +## Tool Structure Overview + +A tool in Agents4Gov is a Python class that provides specific functionality to agents. Each tool: +- Lives in the `tools/` directory +- Contains a `Tools` class with methods that agents can call +- Uses Pydantic for parameter validation and description +- Returns structured data (typically JSON strings) +- Includes comprehensive error handling + +--- + +## Step 1: Set Up Basic Class Structure + +Create a new Python file in the `tools/` directory (e.g., `tools/my_tool.py`). + +Start with the basic imports and class structure: + +```python +import os +import requests +import json +from pydantic import Field + +class Tools: + def __init__(self): + pass +``` + +**Key Points:** +- Import necessary libraries (`requests` for API calls, `json` for data handling, `pydantic` for validation) +- Always name the class `Tools` +- Include an `__init__` method (even if it just passes) + +**Reference:** `tools/open_alex_doi.py:1-8` + +--- + +## Step 2: Define Helper Methods + +Helper methods are private methods (prefixed with `_`) that support your main tool functionality. + +```python +def _clean_doi(self, doi: str) -> str: + """ + Clean and normalize a DOI string by removing common prefixes. + + Args: + doi: The DOI string to clean + + Returns: + Cleaned DOI string without prefixes like 'doi:', 'https://doi.org/', etc. + """ + doi_clean = doi.strip() + + # Remove common DOI prefixes + if doi_clean.lower().startswith('doi:'): + doi_clean = doi_clean[4:].strip() + if doi_clean.startswith('https://doi.org/'): + doi_clean = doi_clean.replace('https://doi.org/', '') + if doi_clean.startswith('http://doi.org/'): + doi_clean = doi_clean.replace('http://doi.org/', '') + + return doi_clean +``` + +**Key Points:** +- Use underscore prefix (`_`) for private methods +- Add type hints for parameters and return values +- Include docstrings explaining purpose, arguments, and return values +- Keep helper methods focused on a single task + +**Reference:** `tools/open_alex_doi.py:10-30` + +--- + +## Step 3: Create the Main Tool Method + +This is the method that agents will actually call. It should be public (no underscore prefix). + +```python +def get_openalex_metadata_by_doi( + self, + doi: str = Field( + ..., + description="The DOI (Digital Object Identifier) of the publication" + ) +) -> str: + """ + Retrieve metadata for a scientific publication from OpenAlex API. 
+ + Args: + doi: The DOI of the publication to query + + Returns: + JSON string with structured publication data + """ + # Implementation here +``` + +**Key Points:** +- Use descriptive method names that clearly indicate functionality +- Method should accept `self` as first parameter +- Return type should typically be `str` (JSON string) for complex data + +**Reference:** `tools/open_alex_doi.py:32-51` + +--- + +## Step 4: Add Parameter Definitions with Pydantic + +Use Pydantic's `Field` to define parameters with descriptions that help agents understand how to use your tool. + +```python +def my_tool_method( + self, + required_param: str = Field( + ..., # The ellipsis (...) means this parameter is required + description="Clear description of what this parameter does and example values" + ), + optional_param: str = Field( + default="default_value", + description="Description of optional parameter with its default value" + ) +) -> str: +``` + +**Key Points:** +- `...` in `Field(...)` indicates a required parameter +- Always include a descriptive `description` that includes: + - What the parameter is for + - Expected format or examples + - Any constraints or special values +- Use appropriate types (str, int, bool, etc.) + +**Reference:** `tools/open_alex_doi.py:33-37` + +--- + +## Step 5: Write Comprehensive Docstrings + +Every method needs a docstring that explains what it does, its parameters, and what it returns. + +```python +def get_openalex_metadata_by_doi(self, doi: str = Field(...)) -> str: + """ + Retrieve essential metadata and impact indicators for a scientific publication from OpenAlex API. + + Returns a JSON string containing: + - Basic metadata (title, authors, venue, publication year) + - Impact indicators (citations, percentiles, FWCI) + + Args: + doi: The DOI of the publication to query + + Returns: + JSON string with structured publication data and impact metrics + """ +``` + +**Key Points:** +- Start with a one-line summary +- Add detailed description if needed +- List what data the method returns +- Document all parameters in the Args section +- Specify return type in the Returns section + +**Reference:** `tools/open_alex_doi.py:39-51` + +--- + +## Step 6: Implement the Core Logic + +Implement the main functionality of your tool with clear comments and sections. 
+ +```python +# Clean the input +doi_clean = self._clean_doi(doi) + +# Build API endpoint URL +base_url = f"https://api.openalex.org/works/doi:{doi_clean}" + +# Handle environment variables for configuration +email = os.getenv("OPENALEX_EMAIL", None) +params = {} +if email: + params['mailto'] = email + +try: + # Make API request + response = requests.get(base_url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + # ======================================== + # BASIC METADATA EXTRACTION + # ======================================== + + title = data.get('title', None) + publication_year = data.get('publication_year', None) + + # Extract and format complex nested data + authors_list = data.get('authorships', []) + authors = [ + author_info.get('author', {}).get('display_name') + for author_info in authors_list + ] +``` + +**Key Points:** +- Use clear section comments with visual separators +- Call helper methods for data cleaning/processing +- Support environment variables for API keys or configuration +- Always set timeouts on API requests +- Use `.get()` for safe dictionary access +- Handle nested data structures carefully + +**Reference:** `tools/open_alex_doi.py:53-94` + +--- + +## Step 7: Handle Errors Gracefully + +Implement comprehensive error handling to help users understand what went wrong. + +```python +try: + # Main logic here + response = requests.get(base_url, params=params, timeout=10) + response.raise_for_status() + # ... processing ... + +except requests.exceptions.HTTPError as e: + # Handle HTTP errors (e.g., 404 Not Found) + error_result = { + 'status': 'error', + 'error_type': 'http_error', + 'error_code': e.response.status_code, + 'message': f'Publication not found for DOI: {doi_clean}' if e.response.status_code == 404 else str(e), + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + +except requests.exceptions.RequestException as e: + # Handle connection errors + error_result = { + 'status': 'error', + 'error_type': 'connection_error', + 'message': f'Error connecting to API: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + +except Exception as e: + # Handle any other unexpected errors + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) +``` + +**Key Points:** +- Catch specific exceptions first, then general ones +- Return structured error information as JSON +- Include `status` field to indicate success/failure +- Include `error_type` to categorize the error +- Provide helpful error messages +- Include relevant context (e.g., the DOI that was queried) + +**Reference:** `tools/open_alex_doi.py:166-195` + +--- + +## Step 8: Return Structured Data + +Return data in a consistent, well-structured JSON format. 
+ +```python +# Build structured response +result = { + 'status': 'success', + 'doi': doi_clean, + 'openalex_id': data.get('id'), + + # Group related data into nested objects + 'metadata': { + 'title': title, + 'authors': authors, + 'venue': venue_name, + 'publication_year': publication_year, + 'publication_date': publication_date, + 'type': type_crossref + }, + + # Group impact metrics separately + 'impact_indicators': { + 'cited_by_count': cited_by_count, + 'citation_normalized_percentile': { + 'value': percentile_value, + 'is_in_top_1_percent': is_top_1_percent + }, + 'cited_by_percentile_year': { + 'min': percentile_min, + 'max': percentile_max + }, + 'fwci': fwci + }, + + # Provide useful links + 'links': { + 'doi_url': f'https://doi.org/{doi_clean}', + 'openalex_url': data.get('id') + } +} + +# Return as formatted JSON string +return json.dumps(result, ensure_ascii=False, indent=2) +``` + +**Key Points:** +- Always include a `status` field ('success' or 'error') +- Group related data into nested objects +- Use consistent naming conventions (snake_case) +- Use `ensure_ascii=False` to properly handle unicode characters +- Use `indent=2` for readable output +- Return as JSON string, not dictionary + +**Reference:** `tools/open_alex_doi.py:123-160` + +--- + +## Best Practices + +### 1. **Clear Naming** +- Use descriptive method names: `get_openalex_metadata_by_doi` (good) vs `get_data` (bad) +- Use verb + noun pattern: `get_`, `fetch_`, `create_`, `update_`, etc. + +### 2. **Input Validation** +- Clean and normalize inputs using helper methods +- Validate parameters before using them +- Use Pydantic Field descriptions to guide users + +### 3. **Environment Variables** +- Use environment variables for API keys and configuration +- Provide defaults with `os.getenv("VAR_NAME", default_value)` +- Document required environment variables in docstrings + +### 4. **API Best Practices** +- Always set timeouts on requests +- Use appropriate HTTP methods +- Handle rate limiting if applicable +- Include user agent or email for polite API access + +### 5. **Error Messages** +- Be specific about what went wrong +- Include context (what operation failed, with what input) +- Suggest solutions when possible +- Return errors as structured JSON, not by raising exceptions + +### 6. **Documentation** +- Write clear docstrings for all public methods +- Include examples in docstrings when helpful +- Comment complex logic sections +- Use visual separators for different sections + +### 7. **Testing Considerations** +- Make methods testable by isolating concerns +- Use helper methods for reusable logic +- Consider edge cases in error handling +- Test with invalid inputs + +### 8. **Return Format** +- Always return JSON strings for complex data +- Include status indicator in responses +- Group related fields into nested objects +- Use consistent field naming across tools + +--- + +## Complete Example Template + +Here's a complete template you can use as a starting point: + +```python +import os +import requests +import json +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _helper_method(self, input_data: str) -> str: + """ + Brief description of what this helper does. 
+ + Args: + input_data: Description of input + + Returns: + Description of output + """ + # Implementation + return processed_data + + def main_tool_method( + self, + required_param: str = Field( + ..., + description="Clear description with examples" + ), + optional_param: str = Field( + default="default", + description="Description of optional parameter" + ) + ) -> str: + """ + Brief description of what this tool does. + + Longer description with details about: + - What data it returns + - What operations it performs + - Any important notes + + Args: + required_param: Description of required parameter + optional_param: Description of optional parameter + + Returns: + JSON string with structured results + """ + + # Clean/validate inputs + processed_input = self._helper_method(required_param) + + # Get configuration + api_key = os.getenv("API_KEY", None) + + try: + # Main logic + response = requests.get( + "https://api.example.com/endpoint", + headers={"Authorization": f"Bearer {api_key}"} if api_key else {}, + timeout=10 + ) + response.raise_for_status() + data = response.json() + + # Extract and structure data + result = { + 'status': 'success', + 'input': processed_input, + 'data': { + 'field1': data.get('field1'), + 'field2': data.get('field2') + } + } + + return json.dumps(result, ensure_ascii=False, indent=2) + + except requests.exceptions.HTTPError as e: + error_result = { + 'status': 'error', + 'error_type': 'http_error', + 'error_code': e.response.status_code, + 'message': str(e) + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': str(e) + } + return json.dumps(error_result, ensure_ascii=False, indent=2) +``` + +--- + +## Next Steps + +1. **Create your tool file** in the `tools/` directory +2. **Implement the basic structure** following this guide +3. **Test your tool** with various inputs including edge cases +4. **Document any environment variables** needed +5. **Add your tool to the agent's configuration** so it can be discovered and used + +## Additional Resources + +- Review `tools/open_alex_doi.py` for a complete working example +- Check Pydantic documentation for advanced field validation +- See the agents configuration to understand how tools are loaded + +--- + +**Remember:** A good tool is reliable, well-documented, and handles errors gracefully. Take time to write clear code that other developers (and AI agents) can easily understand and use. diff --git a/agents4gov/models/knn_soybean_42.joblib b/agents4gov/models/knn_soybean_42.joblib new file mode 100644 index 0000000..cdb1f16 Binary files /dev/null and b/agents4gov/models/knn_soybean_42.joblib differ diff --git a/agents4gov/requirements.txt b/agents4gov/requirements.txt new file mode 100644 index 0000000..ae27f9b --- /dev/null +++ b/agents4gov/requirements.txt @@ -0,0 +1,4 @@ +requests +pydantic +open-webui +openml \ No newline at end of file diff --git a/agents4gov/tools/README.md b/agents4gov/tools/README.md new file mode 100644 index 0000000..8e00fff --- /dev/null +++ b/agents4gov/tools/README.md @@ -0,0 +1,91 @@ +# Tools + +This directory contains tools that can be used by agents in the Agents4Gov framework. Each tool provides specific functionality that agents can call to perform tasks. 
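Tools can also be exercised directly in Python as a quick local smoke test before importing them into Open WebUI (see the usage instructions below). A minimal sketch, assuming you run it from this `tools/` directory with `requests` and `pydantic` installed; the file path and example DOI are only placeholders:

```python
import importlib.util

# Load a tool module straight from its file (illustrative path and inputs).
spec = importlib.util.spec_from_file_location("open_alex_doi", "openalex/open_alex_doi.py")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

tool = module.Tools()
print(tool.get_openalex_metadata_by_doi(doi="10.1371/journal.pone.0000000"))
```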
+ +## Available Tools + +### OpenAlex +- **[openalex/open_alex_doi.py](openalex/README.md)** - Retrieves metadata and impact indicators for scientific publications using DOI + +### OpenML +- **[openml/openml_search.py](openml/README.md)** - Search for machine learning datasets using semantic similarity with embeddings +- **[openml/openml_download.py](openml/README.md)** - Download datasets from OpenML by ID and save as CSV +- **[openml/openml_knn_train.py](openml/README.md)** - Train KNN models with hyperparameter tuning via cross-validation + +## How to Use Tools in Open WebUI + +### Method 1: Import via UI + +1. Start Open WebUI server: `open-webui serve` +2. Access the web interface at [http://localhost:8080](http://localhost:8080) +3. Navigate to **Workspace → Tools** +4. Click **Import Tool** or **+ Create Tool** +5. Copy and paste the content of the tool file +6. Save and enable the tool +7. The tool will now be available for agents to use in conversations + +### Method 2: Direct File Import + +If Open WebUI supports file-based tool loading: + +1. Ensure the `tools/` directory is in the Open WebUI tools path +2. Restart Open WebUI to detect new tools +3. Enable the tool in the Tools settings + +## Tool Requirements + +All tools in this directory require: +- **Python 3.11+** +- **Open WebUI** installed and running +- **pydantic** library for parameter validation + +## Creating Your Own Tools + +Want to create a new tool? Follow our comprehensive guide: + +📖 **[How to Create a Tool Tutorial](../docs/how_to_create_tool.md)** + +The tutorial covers: +- Tool structure and class setup +- Parameter validation with Pydantic +- API integration and error handling +- Returning structured JSON data +- Best practices and examples + +## Troubleshooting + +### Tool Not Appearing in Open WebUI + +- Verify the `Tools` class name is correct +- Check for Python syntax errors +- Ensure all required dependencies are installed +- Restart Open WebUI after adding new tools + +### Tool Execution Errors + +- Check environment variables are set correctly +- Verify internet connectivity for API-based tools +- Review error messages in the JSON response +- Check Open WebUI logs for detailed error information + +### Import Errors + +- Ensure `pydantic` and other dependencies are installed +- Use Python 3.11+ for compatibility +- Check that the tool file is valid Python code + +## Contributing New Tools + +When adding a new tool to this directory: + +1. **Create the tool file** following the structure in existing tools +2. **Test thoroughly** with various inputs and edge cases +3. **Document the tool** with a README.md in its subdirectory +4. **Add it to this README** under "Available Tools" +5. **Follow best practices** outlined in the [tutorial](../docs/how_to_create_tool.md) + +## Additional Resources + +- **[Tool Creation Tutorial](../docs/how_to_create_tool.md)** - Step-by-step guide for creating tools +- **[Open WebUI Tools Guide](https://docs.openwebui.com/features/plugin/tools)** - Official Open WebUI tools documentation +- **[Project Documentation](../docs/README.md)** - Main documentation hub diff --git a/agents4gov/tools/browseragent/docs/benchmarks.md b/agents4gov/tools/browseragent/docs/benchmarks.md new file mode 100644 index 0000000..5df8aae --- /dev/null +++ b/agents4gov/tools/browseragent/docs/benchmarks.md @@ -0,0 +1,212 @@ +# MiniWoB++ Overview + +MiniWoB++ is a benchmark of small **synthetic web tasks** designed to test a browser agent’s **basic interaction skills** (clicking, typing, dragging). 
These low-level skills are fundamental for tackling more complex, real-world web tasks. + +### 1. Type + +MiniWoB++ uses **synthetic mini webpages** built with fully controlled **HTML/CSS/JS**. +This allows complete control over layout, difficulty, timing, and randomness. + +### 2. Observation Space + +Agents “observe” the webpage through two main modalities: + +#### **DOM-based observations** +- Structured HTML elements +- Attributes, classes, text content +- Tree representation of the page + +#### **Pixel-based observations** +- Screenshot of the rendered page +- Useful for vision-based agents + + +### 3. Action Space + +The agent interacts through **low-level UI actions**, similar to real browser events: + +- `click(x, y)` +- `type(text)` +- `press_key(key)` +- `drag(start → end)` +- `select_option` +- `scroll` + +These actions form the agent’s **interaction vocabulary**. + +### 4. Tasks + +MiniWoB++ contains **over 100 tasks**, grouped into two categories: + +#### **A. Low-level tasks** +Simple, atomic interactions: +- click-button +- click-checkbox +- enter-text +- drag-item +- focus-text +- scroll + +#### **B. Higher-level synthetic tasks** +More complex but still controlled: +- choose-date +- use-autocomplete +- find-matching-item +- multi-step form filling +- small “flight booking” task + +### 5. Metrics + +Each task outputs a score, typically based on: + +- **Task completion** (success / fail or 0–1 reward) +- **Time taken** +- **Number of mistakes** +- **Correctness of typed inputs** + +These metrics help evaluate fine-grained interaction performance. + +# WebArena Overview + +WebArena is a **realistic web environment** for evaluating browser agents. Unlike MiniWoB++, it does **not** use synthetic pages. + +Instead, it provides full, interactive, self-hosted websites — realistic but safely contained within a closed environment. + +### 1. Websites Included + +WebArena simulates **four functional web applications**, each representing a different real-world domain: + +- **Forum** (similar to Reddit or Discourse) +- **E-commerce platform** (similar to Amazon) +- **Wiki** (similar to Wikipedia) +- **Social media / blogging platform** + +Agents must complete tasks such as: + +- create a post +- reply to users +- search for products +- add items to cart +- edit wiki pages +- navigate categories +- manage account settings + +These tasks are **far more complex** than the small, controlled tasks of MiniWoB++. + +### 2. Observation Space + +Agents receive rich and realistic observations: + +- **DOM tree** (full HTML structure) +- **Screenshots** of the rendered page +- **URL and browser metadata** +- **Accessibility tree** (in some setups) + +This resembles MiniWoB++ but on **much larger and dynamic pages**. + +### 3. Action Space + +Agents interact through realistic browser actions: + +- click +- type +- select +- scroll +- navigate URLs +- fill and submit forms +- interact with search bars +- multi-step navigation across pages + +WebArena essentially exposes a **real browser environment**. + +### 4. Metrics + +Tasks are evaluated based on: + +- **Success / failure** +- Whether the **final webpage state** matches the goal +- **Partial credit** for progress toward multi-step tasks +- Scores aggregated across multiple tasks + +This mirrors how a human would be evaluated when completing tasks on real websites. + +# BrowserGym Overview + +BrowserGym is **not a benchmark** — it is a **framework** for training, evaluating, and standardizing browser agents. 
+ +Think of it as: **“OpenAI Gym / Gymnasium, but for web agents.”** + +It provides the infrastructure needed so researchers can plug in many different environments (e.g., MiniWoB++, WebArena) without reinventing observation formats, actions, or reward loops. + +### 1. Type + +BrowserGym is **a unified framework** that supports **multiple web environments**, both synthetic and realistic. + +Examples of environments it can load: + +- **MiniWoB++** (synthetic tasks) +- **WebArena** (realistic websites) +- Custom local websites +- HTML task collections +- BrowserRL environments +- Human-demonstration-based tasks + +### 2. Observation Space + +BrowserGym standardizes what an agent receives as input, ensuring consistency across environments: + +- **DOM tree** +- **Screenshots** +- **Accessibility Tree** +- **Browser metadata** +- **URL** +- **Element bounding boxes** +- **Extracted text content** + +Agents get a **structured and uniform API**, regardless of which environment is loaded. + +### 3. Action Space + +Just like OpenAI Gym standardizes actions, BrowserGym defines a consistent browser-interaction API: + +- `click(x, y)` +- `type(text)` +- `focus(element)` +- `keypress` +- `scroll` +- `select_option` +- `navigate(url)` +- Interact with browser tabs + +This makes agents **portable**: +Train in one environment, test in another with minimal changes. + +### 4. Tasks + +BrowserGym **does not define tasks**. Instead, tasks are loaded from whichever benchmark or dataset the user selects: + +- MiniWoB++ tasks +- WebArena tasks +- Custom scripted tasks +- Human demonstration workflows +- Recorded trajectories + +## 5. Metrics + +BrowserGym also **does not define metrics**. +It simply forwards metrics from each environment: + +- MiniWoB++ reward signals +- WebArena success criteria +- Custom environment scoring + +Metrics are determined by the underlying benchmark, not BrowserGym. + + +# Comparison Table + +| Benchmark | Type (Synthetic / Real Web) | Observation Space | Action Space | Tasks & Metrics | Setup | +|---------------|-----------------------------|-------------------|--------------|-----------------|-------| +| **MiniWoB++** | Synthetic mini web pages | DOM, element attributes, text, screenshots | Low-level browser actions (click, type, select, focus); sometimes coordinate-based | Short, single-step or few-step tasks (click a button, fill a field, choose an item). Metrics: task success, reward, completion time | Lightweight, local, deterministic HTML tasks; trivial to run and reset | +| **WebArena** | Realistic, closed-world websites (e-commerce, forums, dashboards, tools) | Full DOM, page render, text, rich element metadata | Full browser interaction (click, type, scroll, navigate, multi-step workflows) | Long-horizon, realistic tasks requiring planning (shopping, posting, searching, editing). Metrics: success, sub-goals, task score | Heavy setup; Docker environment hosting multiple real-like websites | +| **BrowserGym** | Framework hosting multiple benchmarks including MiniWoB++, WebArena, and others | Depends on selected environment; supports DOM, screenshots, accessibility tree, text | Unified, standardized browser action API across all supported benchmarks | Not a benchmark itself—aggregates many. 
Metrics depend on each integrated benchmark but share unified API, logging, and evaluation | Install BrowserGym; load any integrated environment; provides standardized APIs and wrappers \ No newline at end of file diff --git a/agents4gov/tools/browseragent/docs/osagents.md b/agents4gov/tools/browseragent/docs/osagents.md new file mode 100644 index 0000000..07d46e4 --- /dev/null +++ b/agents4gov/tools/browseragent/docs/osagents.md @@ -0,0 +1,193 @@ +# OS Agents: A Survey on MLLM-based Agents for Computer, Phone and Browser Use" + +It presents a comprehensive survey on **OS Agents**, a class of advanced AI assistants powered by **Multimodal Large Language Models (MLLMs)**. The core idea is to move beyond domain-specific AI to create general-purpose agents. + +## Key Components + +OS Agents are based on several key components +and necessitate some core capabilities discussed in +the following. + +1. **Understanding:** The ability to perceive and analyze the current state of the OS environment. This involves processing GUI screenshots, extracting relevant information, and understanding the user's goal. Techniques include visual description, semantic description, and action-oriented description, often leveraging MLLMs for deep comprehension of visual and textual elements. +2. **Planning:** The process of breaking down a complex, high-level user request into a sequence of executable, low-level actions. This often involves hierarchical planning, where MLLMs generate a high-level plan which is then refined into specific steps. Iterative planning, where the plan is adjusted after each action, is also a common approach. +3. **Grounding (Action):** The capability to translate the planned actions into concrete interactions with the OS environment. This is crucial for bridging the gap between the agent's abstract plan and the physical execution on the screen. It involves identifying the correct target element (e.g., a button or text field) and performing the corresponding action (e.g., click, type, scroll). + + + + + +## Construction of OS Agents + +Constructing OS Agents involves developing **foundation models** and **agent frameworks** that can perceive, understand, and interact with graphical user interfaces (GUIs). These models integrate **language**, **vision**, and **action** understanding through a multi-stage training pipeline composed of: + +- Architecture design +- Pre-training +- Supervised Fine-Tuning (SFT) +- Reinforcement Learning (RL) + + +### Foundation Model + +Foundation models for OS Agents combine multimodal architectures with multi-phase training to bridge the gap between natural language understanding and GUI interaction. + + +#### Architecture + +Four common architectural approaches are used in current OS Agent research: + +1. **Existing LLMs** + - Utilize open-source large language models (LLMs) capable of processing textual instructions and HTML structure. + + +2. **Existing MLLMs** + - Use multimodal large language models (MLLMs) that process both text and visual inputs, enabling direct GUI comprehension. + +3. **Concatenated MLLMs** + - Combine a separate vision encoder and language model via adapters or cross-attention modules. + +4. **Modified MLLMs** + - Extend standard MLLMs to handle **high-resolution GUI inputs**. + + +### Pre-training + +Strengthen the model’s understanding of GUIs and the correlation between visual and textual modalities. + +#### Data Sources +1. **Public Data:** Used for large-scale pre-training. +2. 
**Synthetic Data:** Complements public data to increase coverage and diversity. + +#### Tasks +- **Screen Grounding:** Extract 2D coordinates or bounding boxes for interface elements from text prompts. +- **Screen Understanding:** Capture semantic meaning and structure of entire GUI screens. +- **Optical Character Recognition (OCR):** Identify text within GUI components (e.g., using Paddle-OCR). + +### Supervised Fine-Tuning (SFT) + +Adapt pre-trained models for specific GUI navigation and grounding tasks. + +#### Data Collection Techniques +1. **Rule-Based Data Synthesis:** Use automated algorithms such as BFS to explore app functions and generate trajectories. +2. **Model-Based Data Synthesis:** Employ (M)LLMs (e.g., GPT-4V) to produce annotated samples for GUI grounding or summarization tasks (Zhang et al., 2024f). +3. **Model-Based Data Augmentation:** Generate **Chain-of-Action-Thought (CoAT)** data, containing screen descriptions, reasoning steps, and predicted actions to boost navigation and reasoning capabilities. + +### Reinforcement Learning (RL) + +Align OS Agents’ behavior with task objectives through reward-driven learning, enabling them to plan, act, and adapt dynamically within GUIs. + +Reinforcement learning enables OS Agents to: +- Learn adaptive strategies for complex GUI navigation tasks. +- Align multimodal perception with real-world action outcomes. +- Integrate hierarchical planning and in-context reasoning for better autonomy. + +## OS Agent Framework + + +An **OS Agent framework** defines how an agent perceives, plans, remembers, and acts within an operating system environment. +Each component contributes to creating agents capable of autonomously navigating, understanding, and operating GUIs in dynamic, multi-step tasks. + + +### Perception + +**Perception** enables the agent to observe its environment and extract relevant information to support planning, action, and memory. + +#### Input Modalities + +1. **Textual Description of the OS** + - Early systems relied on text-based representations of the environment (e.g., HTML, DOM, or accessibility trees) because LLMs could not process visual inputs. + +2. **GUI Screenshot Perception** + - With the rise of MLLMs, agents can now process **visual screenshots**, aligning perception with human-like understanding. + +#### Description Techniques +- **Visual Descriptions:** Use visual cues (e.g., layout, color, icons) to improve grounding. +- **Semantic Descriptions:** Incorporate textual meaning of elements. +- **Dual Descriptions:** Combine both visual and semantic information for more robust understanding. + +### Planning + +**Planning** defines how an agent generates and executes a sequence of actions to achieve a goal. It enables task decomposition and dynamic decision-making. + +#### Two Planning Approaches + +1. **Global Planning** + - Generates a one-time plan that the agent executes without modification. + - Based on **Chain-of-Thought (CoT)** reasoning (Wei et al., 2023), allowing models to break complex tasks into structured steps. + +2. **Iterative Planning** + - Continuously adapts plans based on feedback and environmental changes. + - Builds on **ReAct** (Yao et al., 2023), combining reasoning with the results of actions. + - Example systems include **Auto-GUI** (Zhang & Zhang, 2023), which iteratively refines plans using past actions and CoT reasoning. + + +### Memory + +**Memory** allows OS Agents to retain information, adapt to context, and optimize decision-making over time. 
It is essential for long-term learning, adaptation, and error correction.

#### Memory Types

1. **Internal Memory**: Stores transient data such as past actions, screenshots, and states.

2. **External Memory**: Provides long-term contextual or domain knowledge from databases, tools, or online sources.

#### Memory Optimization Strategies

1. **Management:** Abstract and condense redundant data, retaining only relevant insights.

2. **Growth Experience:** Learn from prior task attempts by revisiting successful and failed steps.

3. **Experience Retrieval:** Retrieve and reuse knowledge from similar past scenarios to reduce redundant actions.

### Action

**Action** defines how OS Agents interact with digital environments, including computers, mobile devices, and web interfaces.

#### Action Categories

1. **Input Operations**: Fundamental interactions via **keyboard**, **mouse**, or **touch** input.

2. **Navigation Operations**: Allow agents to move across applications, interfaces, or websites. Include both **basic platform navigation** and **web-specific traversal**.

3. **Extended Operations**: Provide advanced, dynamic capabilities beyond basic input and navigation:
   - **Code Execution:** Execute scripts or commands to extend control.
   - **API Integration:** Connect to third-party tools or services for specialized functionalities.

## Evaluation of OS Agents

OS Agents are evaluated through standardized **metrics** and **benchmarks** that measure accuracy, efficiency, and adaptability across platforms.

### Evaluation Metrics

Two main levels of evaluation are used:

- **Step-Level Evaluation:**
  Analyzes each individual action for correctness and grounding accuracy — how well the agent identifies and interacts with the right interface element.

- **Task-Level Evaluation:**
  Measures overall task success and efficiency.
  - **Success Rate (SR):** Percentage of fully completed tasks.
  - **Step Ratio:** Compares the agent's actions to an optimal (human) baseline — lower is better.

### Evaluation Benchmarks

Benchmarks test OS Agents in realistic digital environments using diverse **platforms** and **task types**.

#### Platforms
- **Computer:** Complex, multi-application desktop systems.
- **Phone:** Mobile GUIs requiring precise touch and gesture control.
- **Browser:** Web-based environments with dynamic content.

Some benchmarks combine platforms to test **cross-system transferability**.

#### Task Types
1. **GUI Grounding:** Match language instructions to visual interface elements.
2. **Information Retrieval:** Navigate and extract data from GUIs.
3. **Agentic Tasks:** Execute full, goal-driven workflows autonomously.

## Challenges and Future Directions

1. **Generalization and Robustness:** Agents struggle to generalize to unseen interfaces and maintain robustness against minor UI changes.
2. **Long-Horizon Planning:** Current agents often fail on tasks requiring many steps or complex, multi-stage reasoning.
3. **Efficiency and Cost:** The reliance on large MLLMs makes inference slow and computationally expensive.
4. **Multi-Agent Collaboration:** Exploring frameworks where multiple specialized agents can collaborate to solve complex tasks is a promising direction.
5. **Ethical and Safety Concerns:** As agents gain more control over user environments, ensuring their safety, security, and adherence to ethical guidelines becomes paramount.
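To make the perceive-plan-act loop described above concrete, here is a minimal, illustrative Python sketch. The `model` and `environment` interfaces are hypothetical placeholders, not APIs from any of the surveyed systems.

```python
from dataclasses import dataclass, field

@dataclass
class Memory:
    """Internal memory: a rolling log of (observation, action) pairs."""
    history: list = field(default_factory=list)

    def recall(self, k: int = 3):
        return self.history[-k:]

    def store(self, observation, action):
        self.history.append((observation, action))

def run_os_agent(model, environment, instruction, max_steps=20):
    """Iterative perceive -> plan -> act loop (hypothetical interfaces)."""
    memory = Memory()
    observation = environment.observe()            # e.g., screenshot + accessibility tree
    for _ in range(max_steps):
        context = {
            "instruction": instruction,
            "observation": observation,
            "recent_steps": memory.recall(),       # experience retrieval from memory
        }
        action = model.plan_next_action(context)   # planning + grounding to a concrete action
        if action.get("type") == "stop":           # the agent decides the task is done
            return action.get("answer")
        memory.store(observation, action)
        observation = environment.execute(action)  # act, then perceive the new GUI state
    return None
```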
diff --git a/agents4gov/tools/browseragent/docs/webvoyager.md b/agents4gov/tools/browseragent/docs/webvoyager.md new file mode 100644 index 0000000..a81af1e --- /dev/null +++ b/agents4gov/tools/browseragent/docs/webvoyager.md @@ -0,0 +1,252 @@ +# WebVoyager + +It introduces a new approach to building autonomous web agents capable of visually and textually understanding real-world websites to complete tasks end-to-end. + + +## Key Components + +**WebVoyager** is an autonomous web agent that uses **Large Multimodal Models (LMMs)** to **see, understand, and interact** with real-world websites. + +### Problems with Previous Web Agents +- **Text-only processing:** Earlier systems relied solely on HTML text and ignored visual layouts. +- **Simulated environments:** Most agents were tested in simplified web simulators rather than dynamic, real websites. + +WebVoyager bridges this gap by: +- Combining **visual (screenshots)** and **textual (HTML)** data. +- Operating directly on **live websites**. +- Emulating **human-like browsing behavior** to follow user instructions autonomously. + +## How WebVoyager Works + +WebVoyager is an **autonomous web agent** capable of browsing the **open web** in real time — understanding and interacting with webpages through both **visual** and **textual** signals to complete user-defined instructions **end-to-end**. + +Given a user instruction, WebVoyager: +1. Launches a web browser. +2. Observes the current page (via screenshot and text). +3. Predicts an appropriate action. +4. Executes that action in the browser. +5. Repeats the cycle until the task is complete. + +The system continuously updates its internal context with new observations and actions until it reaches a termination signal. + + +### Browsing Environment + +WebVoyager operates on **real-world websites** using [Selenium](https://www.selenium.dev/). + +- Unlike simulated environments such as *WebArena*, WebVoyager interacts directly with the **open internet**, facing realistic web challenges: + - Floating ads + - Pop-up windows + - Dynamic and constantly changing content + +This setup enables the agent to learn **robust, adaptive browsing behavior** closer to real-world user interaction. + +### Interaction Formulation + +WebVoyager’s browsing cycle is defined by four main components: +- **E** → Environment +- **M** → Large Multimodal Model +- **O** → Observation Space +- **A** → Action Space + +At each step **t**: +1. The model receives the **context** `ct = (o1, a1, ..., ot, I)` containing previous actions and observations. +2. It generates an **action** `at = M(ct)`, executed in the environment. +3. The environment returns the next **observation** `ot+1 = E(ot, at)`. + +The cycle continues until the agent stops or the step limit is reached. + +#### Thought-Action Prompting +- Inspired by **ReAct Prompting**, WebVoyager produces both a **thought** (`st`) and an **action code** (`at`) for each step — reasoning before acting. +- To maintain clarity, only the **three most recent observations** are kept, while all thoughts and actions are retained. + +### Observation Space + +The agent primarily observes **screenshots** instead of raw HTML. + +#### Visual Input +- Screenshots include bounding boxes and numeric labels over interactive elements, overlaid using [GPT-4V-Act](https://github.com/ddupont808/GPT-4V-Act), a lightweight, rule-based JavaScript tool. +- Labels and boxes help the model identify actionable elements precisely. +- All borders and labels use **black** for clarity and consistency. 
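For intuition, here is a rough Python/Selenium sketch of this kind of element labeling. It is illustrative only: the paper uses the GPT-4V-Act JavaScript tool, and this version assumes screenshot pixels match CSS pixels (no device-pixel-ratio scaling).

```python
# Illustrative sketch, not the GPT-4V-Act tool: draw numbered black boxes over
# interactive elements so a multimodal model can refer to them by label.
from io import BytesIO
from PIL import Image, ImageDraw
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://example.com")  # placeholder URL

screenshot = Image.open(BytesIO(driver.get_screenshot_as_png()))
draw = ImageDraw.Draw(screenshot)

# A simple heuristic for "interactive" elements; real tools use richer rules.
elements = driver.find_elements(By.CSS_SELECTOR, "a, button, input, select, textarea")
for idx, el in enumerate(elements):
    rect = el.rect  # {'x', 'y', 'width', 'height'} in CSS pixels
    box = (rect["x"], rect["y"], rect["x"] + rect["width"], rect["y"] + rect["height"])
    draw.rectangle(box, outline="black", width=2)
    draw.text((box[0], max(box[1] - 12, 0)), str(idx), fill="black")

screenshot.save("labeled_page.png")
driver.quit()
```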
+ +#### Textual Input +- Includes: + - Element text content + - Element type + - `aria-label` or comment text + +#### Additional Design Choices +- All interactions occur in **a single browser tab**. +- Execution errors trigger re-prompting with the error message included, consuming one step each retry. + +--- + +### Action Space + +WebVoyager mimics human browsing behaviors through seven key action types: + +| Action | Description | +|--------|--------------| +| **Click** | Click on buttons or links. | +| **Input** | Type into text boxes after clearing old content. | +| **Scroll** | Move vertically through a page. | +| **Wait** | Pause to allow content to load. | +| **Back** | Navigate to the previous page. | +| **Jump to Search Engine** | Restart the browsing process if stuck. | +| **Answer** | Finalize the task and produce an output. | + +Each action uses **numeric tags** from screenshots to reference specific webpage elements. + +## Benchmark for WebVoyager + +To ensure diversity, **15 representative websites** were selected to cover different aspects of daily life. + +### Data Construction + +The dataset was created using a **hybrid Self-Instruct + Human Verification** pipeline. + +#### Seed Task Creation +- Manually sampled and rewritten tasks from **Mind2Web** (Yin et al., 2023; Deng et al., 2023). +- Generated initial **seed tasks** for key websites such as Google Flights, Google Maps, Booking, and Wolfram Alpha. +- **Seed tasks are the initial**, manually created examples that start the data generation process. They act as high-quality prototypes or templates that guide further task generation. + +#### GPT-4 Task Generation +- Used seed tasks as **in-context examples** to prompt **GPT-4 Turbo**. +- Generated ~100 new tasks through **20 iterations**. +- Each generated task was **manually verified and rewritten** when necessary. +- Human-validated tasks were added back to the **Task Pool**. + +#### Iterative Expansion +- Sampled new in-context examples each iteration. +- Verified task diversity and correctness on target websites. +- Final dataset: **40+ tasks per website**, totaling **643 tasks**. + +### Annotation Process + +Each task is annotated with a verified answer, categorized as either **Golden** or **Possible**. + +| Label | Description | +|--------|-------------| +| **Golden** | Stable, exact answers. Comprehensive and unlikely to change in the short term. | +| **Possible** | Variable or open-ended answers, including:
1- Open-ended tasks (e.g., summarization)
2- Multiple valid answers
3- Time-sensitive information (e.g., flight prices). |

**Statistics:**
- **22.3%** of tasks labeled **Golden**
- **77.7%** labeled **Possible**

This reflects both the **stability** and the **real-world variability** of web data.

## Experiment

### **Datasets and Metrics**

WebVoyager is evaluated across multiple benchmarks:

| Dataset | Description | Evaluation Metric |
|----------|--------------|-------------------|
| **WebVoyager Benchmark** | Custom benchmark introduced in this work. | Task Success Rate |
| **GAIA (Mialon et al., 2023)** | 90 web browsing tasks (Level 1 & 2) with golden responses. The agent starts from Google Search since target sites aren't specified. | Task Success Rate |
| **SeeAct (Zheng et al., 2024)** | 50 online evaluation tasks; compared with SeeAct's autonomous agent results. | Task Success Rate |

**Primary Metric:**
> **Task Success Rate (TSR)** – measures whether the agent completes the task, without requiring optimal steps.

### Experimental Setup

#### **Models Used**
| Model | Type | Description |
|--------|------|-------------|
| **GPT-4 Turbo (Vision)** | Backbone | Used as the primary model (`gpt-4-vision-preview`) for strong semantic and visual reasoning. |
| **Claude 3 Opus (Anthropic, 2024)** | Backbone | Adds diversity; used for ablation. |
| **GPT-4o (Omni, 2024)** | Backbone | Multimodal baseline with enhanced context understanding. |
| **GPT-4 (All Tools)** | Baseline | Integrates vision, browsing, code, and plugins. |
| **Text-only baseline** | Baseline | Receives only accessibility tree data (no screenshots). |

### Evaluation Method

#### **Human Evaluation**
- Human judges inspect full agent trajectories (screenshots + actions).
- Binary judgment: **Success** or **Failure**.
- 300 tasks reviewed by **3 annotators** for inter-rater reliability.

#### **Automatic Evaluation**
- **GPT-4V** is used as an **auto-evaluator** (LMM-based judge); a minimal illustrative sketch of such a call is shown after the Findings below.
- Input: task prompt, agent responses, and the last *k* screenshots.
- The evaluator outputs a binary success/failure judgment.
- Increasing *k* (the number of screenshots) improves evaluation consistency.

## Results

#### **Performance Highlights**
- **WebVoyager** outperforms **text-only** and **GPT-4 (All Tools)** baselines across most sites.
- Slightly weaker on **text-heavy** websites (e.g., Allrecipes, GitHub).
- Achieves **30% success** on the **SeeAct** test set (vs **26%** by SeeAct's best agent).

| Website | GPT-4 (All Tools) | Text-only | WebVoyager | WebVoyager (GPT-4o) |
|----------|-------------------|------------|-------------|----------------------|
| **Overall** | **30.8%** | **40.1%** | **59.1%** | **55.5%** |

#### **Findings**
- **Visual + Textual modalities** are both essential:
  - Text-only fails on visually complex sites (Booking, Flights).
  - WebVoyager outperforms the text-only and GPT-4 (All Tools) baselines by large margins on most website tasks, while it is slightly lower than Text-only on Allrecipes and comparable to Text-only on GitHub, ESPN, Cambridge Dictionary, and Wolfram Alpha. These websites are more text-heavy than others, and since WebVoyager relies mostly on web screenshots for decision-making, dense text may not be easily recognizable from the image.
- **Website complexity** correlates inversely with success:
  - Sites with fewer interactive elements and shorter trajectories show higher TSR.
- **Direct interaction** (vs Bing scraping) is critical for accuracy.
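The following is a minimal sketch of such an auto-evaluator call. It assumes the OpenAI Python SDK (v1+) with an `OPENAI_API_KEY` set in the environment; the prompt wording and model name are placeholders rather than the exact setup used in the paper.

```python
import base64
from openai import OpenAI

def auto_evaluate(task: str, final_answer: str, screenshot_paths: list, k: int = 3) -> bool:
    """Ask a vision-capable model to judge task success from the last k screenshots."""
    client = OpenAI()
    content = [{
        "type": "text",
        "text": (
            f"Task: {task}\n"
            f"Agent's final answer: {final_answer}\n"
            "Based on the screenshots, reply with exactly SUCCESS or FAILURE."
        ),
    }]
    for path in screenshot_paths[-k:]:  # only the last k screenshots are sent
        with open(path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{b64}"},
        })
    response = client.chat.completions.create(
        model="gpt-4o",  # placeholder for any vision-capable model
        messages=[{"role": "user", "content": content}],
    )
    return "SUCCESS" in response.choices[0].message.content.upper()
```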
+ +### Error Analysis + +Manual labeling of 300 failed tasks reveals key failure modes: + +| Failure Type | Description | Ratio | +|---------------|--------------|-------| +| **Navigation Stuck** | Agent fails to finish task or loops endlessly (e.g., scroll errors, vague queries). | **44.4%** | +| **Visual Grounding Issue** | Misidentifies or confuses visual elements, especially small text or nearby items. | **24.8%** | +| **Hallucination** | Produces plausible but incorrect results (e.g., partial answers, wrong inputs). | **21.8%** | +| **Prompt Misalignment** | Fails to follow task structure or prematurely answers. | **9.0%** | + +--- + +#### **Examples** +- *Navigation Stuck:* Scrolls indefinitely due to small scroll area. +- *Visual Grounding:* Clicks wrong “Buy” button near a similar label. +- *Hallucination:* Answers with partial product info. +- *Prompt Misalignment:* Generates “Thought” but no executable action. + +## Conclusion + +WebVoyager is a large multimodal model (LMM)–powered web agent designed to complete real-world web tasks end-to-end by directly interacting with websites. +It combines visual and textual understanding to perform actions on web pages and significantly outperforms baseline web agents. + +it introduced an automatic evaluation framework using GPT-4V to assess agent performance objectively. +This establishes WebVoyager as a strong foundation for building more capable and intelligent web assistants in the future. + +### Limitations + +**Incomplete Action Set**: +The agent currently lacks certain human-like actions such as dragging, due to the complexity of continuous pixel interactions. +Future improvements in visual grounding could enable this. + +**Limited File Support**: +WebVoyager handles basic file types (text, PDFs) but not complex media (e.g., videos). Extending file-type support is a key area for future work. + +**Risks & Safety Concerns** + +Before real-world deployment, strong safety measures are required.Potential risks include: +- Downloading malicious content +- Exposing confidential data +- Sending unintended or harmful web requests +- Generating fake or automated user activity +- Strict ethical and security safeguards are needed for responsible use. \ No newline at end of file diff --git a/agents4gov/tools/openalex/README.md b/agents4gov/tools/openalex/README.md new file mode 100644 index 0000000..6d37caf --- /dev/null +++ b/agents4gov/tools/openalex/README.md @@ -0,0 +1,82 @@ +# OpenAlex DOI Metadata Retrieval + +**File:** `open_alex_doi.py` + +**Description:** Retrieves comprehensive metadata and impact indicators for scientific publications using their DOI (Digital Object Identifier) from the OpenAlex API. 
+ +**Main Method:** `get_openalex_metadata_by_doi(doi: str) -> str` + +## Features + +- Fetches basic publication metadata (title, authors, venue, publication year) +- Retrieves citation counts and impact metrics +- Provides normalized percentile rankings +- Calculates Field-Weighted Citation Impact (FWCI) +- Handles multiple DOI formats (with or without prefixes) +- Returns structured JSON output + +## Parameters + +- `doi` (required): The DOI of the publication (e.g., `10.1371/journal.pone.0000000`) + - Accepts formats: `10.1234/example`, `doi:10.1234/example`, `https://doi.org/10.1234/example` + +## Environment Variables + +- `OPENALEX_EMAIL` (optional): Your email for polite pool access (faster and more reliable API responses) + +## Example Output + +```json +{ + "status": "success", + "doi": "10.1371/journal.pone.0000000", + "openalex_id": "https://openalex.org/W2741809807", + "metadata": { + "title": "Example Publication Title", + "authors": ["Author One", "Author Two"], + "venue": "PLOS ONE", + "publication_year": 2020, + "publication_date": "2020-03-15", + "type": "journal-article" + }, + "impact_indicators": { + "cited_by_count": 42, + "citation_normalized_percentile": { + "value": 85.5, + "is_in_top_1_percent": false + }, + "cited_by_percentile_year": { + "min": 80, + "max": 90 + }, + "fwci": 1.5 + }, + "links": { + "doi_url": "https://doi.org/10.1371/journal.pone.0000000", + "openalex_url": "https://openalex.org/W2741809807" + } +} +``` + +## Use Cases + +- Research impact analysis +- Literature review automation +- Citation metric extraction +- Publication verification +- Academic database integration + +## Usage + +After importing this tool in Open WebUI, test it with a query like: + +``` +Can you get metadata for the publication with DOI 10.1371/journal.pone.0000000? +``` + +The agent will automatically invoke the `get_openalex_metadata_by_doi` tool and return the structured results. + +## Additional Resources + +- **[OpenAlex API Documentation](https://docs.openalex.org/)** - Official API documentation +- **[How to Create a Tool](../../docs/how_to_create_tool.md)** - Guide for creating your own tools diff --git a/agents4gov/tools/openalex/open_alex_doi.py b/agents4gov/tools/openalex/open_alex_doi.py new file mode 100644 index 0000000..136059c --- /dev/null +++ b/agents4gov/tools/openalex/open_alex_doi.py @@ -0,0 +1,195 @@ +import os +import requests +import json +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _clean_doi(self, doi: str) -> str: + """ + Clean and normalize a DOI string by removing common prefixes. + + Args: + doi: The DOI string to clean + + Returns: + Cleaned DOI string without prefixes like 'doi:', 'https://doi.org/', etc. + """ + doi_clean = doi.strip() + + # Remove common DOI prefixes + if doi_clean.lower().startswith('doi:'): + doi_clean = doi_clean[4:].strip() + if doi_clean.startswith('https://doi.org/'): + doi_clean = doi_clean.replace('https://doi.org/', '') + if doi_clean.startswith('http://doi.org/'): + doi_clean = doi_clean.replace('http://doi.org/', '') + + return doi_clean + + def get_openalex_metadata_by_doi( + self, + doi: str = Field( + ..., + description="The DOI (Digital Object Identifier) of the publication, e.g., '10.1371/journal.pone.0000000'" + ) + ) -> str: + """ + Retrieve essential metadata and impact indicators for a scientific publication from OpenAlex API. 
+ + Returns a JSON string containing: + - Basic metadata (title, authors, venue, publication year) + - Impact indicators (citations, percentiles, FWCI) + + Args: + doi: The DOI of the publication to query + + Returns: + JSON string with structured publication data and impact metrics + """ + + # Clean the DOI using the helper function + doi_clean = self._clean_doi(doi) + + # Build OpenAlex API endpoint URL + base_url = f"https://api.openalex.org/works/doi:{doi_clean}" + + # Optional: Add email for polite pool access (faster and more reliable) + # Set OPENALEX_EMAIL environment variable to use this feature + email = os.getenv("OPENALEX_EMAIL", None) + params = {} + if email: + params['mailto'] = email + + try: + # Make request to OpenAlex API + response = requests.get(base_url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + # ======================================== + # BASIC METADATA EXTRACTION + # ======================================== + + # Extract core publication information + title = data.get('title', None) + publication_year = data.get('publication_year', None) + publication_date = data.get('publication_date', None) + type_crossref = data.get('type_crossref', None) + + # Extract and format authors list + # Only include author name for simplicity + authors_list = data.get('authorships', []) + authors = [ + author_info.get('author', {}).get('display_name') + for author_info in authors_list + ] + + # Extract venue/journal information + primary_location = data.get('primary_location', {}) + source = primary_location.get('source', {}) or {} + venue_name = source.get('display_name') + + # ======================================== + # IMPACT INDICATORS EXTRACTION + # ======================================== + + # Total number of citations + cited_by_count = data.get('cited_by_count', 0) + + # Citation normalized percentile + # Compares citation count to similar publications (by year, type, field) + citation_normalized_percentile = data.get('citation_normalized_percentile', {}) or {} + percentile_value = citation_normalized_percentile.get('value') + is_top_1_percent = citation_normalized_percentile.get('is_in_top_1_percent', False) + + # Cited by percentile year + # Percentile ranking among publications from the same year + cited_by_percentile_year = data.get('cited_by_percentile_year', {}) or {} + percentile_min = cited_by_percentile_year.get('min') + percentile_max = cited_by_percentile_year.get('max') + + # Field-Weighted Citation Impact (FWCI) + # Value of 1.0 means average for the field + # >1.0 means above average, <1.0 means below average + fwci = data.get('fwci') + + # ======================================== + # BUILD STRUCTURED RESPONSE + # ======================================== + + result = { + 'status': 'success', + 'doi': doi_clean, + 'openalex_id': data.get('id'), + + # Basic publication metadata + 'metadata': { + 'title': title, + 'authors': authors, + 'venue': venue_name, + 'publication_year': publication_year, + 'publication_date': publication_date, + 'type': type_crossref + }, + + # Citation and impact metrics + 'impact_indicators': { + 'cited_by_count': cited_by_count, + 'citation_normalized_percentile': { + 'value': percentile_value, + 'is_in_top_1_percent': is_top_1_percent + }, + 'cited_by_percentile_year': { + 'min': percentile_min, + 'max': percentile_max + }, + 'fwci': fwci + }, + + # Useful links + 'links': { + 'doi_url': f'https://doi.org/{doi_clean}', + 'openalex_url': data.get('id') + } + } + + # Return as formatted JSON 
string + return json.dumps(result, ensure_ascii=False, indent=2) + + # ======================================== + # ERROR HANDLING + # ======================================== + + except requests.exceptions.HTTPError as e: + # Handle HTTP errors (e.g., 404 Not Found) + error_result = { + 'status': 'error', + 'error_type': 'http_error', + 'error_code': e.response.status_code, + 'message': f'Publication not found for DOI: {doi_clean}' if e.response.status_code == 404 else str(e), + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except requests.exceptions.RequestException as e: + # Handle connection errors + error_result = { + 'status': 'error', + 'error_type': 'connection_error', + 'message': f'Error connecting to OpenAlex API: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + # Handle any other unexpected errors + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) diff --git a/agents4gov/tools/openml/README.md b/agents4gov/tools/openml/README.md new file mode 100644 index 0000000..b1e5a1d --- /dev/null +++ b/agents4gov/tools/openml/README.md @@ -0,0 +1,422 @@ +# OpenML Tools + +This directory contains tools for working with OpenML datasets. + +## Available Tools + +### 1. OpenML Dataset Search (`openml_search.py`) + +**Description:** Search for machine learning datasets on OpenML using semantic similarity based on embeddings. This tool enables natural language queries to find relevant datasets by computing similarity between the query and dataset descriptions. Uses batch embedding processing for efficiency. + +**Main Method:** `search_openml_datasets(query: str, top_k: int = 5, max_datasets: int = 100) -> str` + +### 2. OpenML Dataset Download (`openml_download.py`) + +**Description:** Download datasets from OpenML by ID and automatically save as CSV. Returns the file path for immediate use with other tools. + +**Main Method:** `download_openml_dataset(dataset_id: int, save_dir: str = "./datasets") -> str` + +### 3. OpenML KNN Training (`openml_knn_train.py`) + +**Description:** Train a K-Nearest Neighbors model with hyperparameter tuning using cross-validation. Automatically detects task type (classification/regression) and applies appropriate metrics and CV strategy. + +**Main Method:** `train_knn_with_cv(data_path: str, target_column: str, n_neighbors_range: List[int] = [3, 5, 7, 9, 11], cv_folds: int = 5, ...) -> str` + +--- + +## OpenML Dataset Search + +### Features + +- Natural language search queries for datasets +- Semantic similarity matching using embeddings +- Configurable number of results (top-k) +- Comprehensive dataset metadata retrieval +- Cosine similarity scoring between query and datasets +- Semantic search using sentence-transformers +- Returns structured JSON output with dataset details + +### Parameters + +- `query` (required): Natural language description of the desired dataset + - Examples: + - "image classification datasets" + - "medical diagnosis data" + - "time series weather data" + - "text sentiment analysis" +- `top_k` (optional, default=5): Number of most similar datasets to return +- `max_datasets` (optional, default=100): Maximum number of datasets to search through + +## Environment Variables + +No environment variables required for embedding. 
Uses the local sentence-transformers model `paraphrase-multilingual-mpnet-base-v2` (the same model referenced in the Technical Details below and loaded in `openml_search.py`).

## Example Output

```json
{
  "status": "success",
  "query": "image classification datasets",
  "top_k": 5,
  "total_searched": 1000,
  "results": [
    {
      "dataset_id": 40927,
      "name": "mnist_784",
      "description": "The MNIST database of handwritten digits with 784 features. It is a subset of a larger set available from NIST...",
      "similarity_score": 0.8542,
      "metadata": {
        "num_instances": 70000,
        "num_features": 785,
        "num_classes": 10,
        "num_missing_values": 0,
        "format": "ARFF",
        "version": 1,
        "uploader": "Jan van Rijn",
        "status": "active"
      },
      "links": {
        "openml_url": "https://www.openml.org/d/40927",
        "api_url": "https://www.openml.org/api/v1/json/data/40927"
      }
    },
    {
      "dataset_id": 40996,
      "name": "Fashion-MNIST",
      "description": "Fashion-MNIST is a dataset of Zalando's article images consisting of a training set of 60,000 examples...",
      "similarity_score": 0.8213,
      "metadata": {
        "num_instances": 70000,
        "num_features": 785,
        "num_classes": 10,
        "num_missing_values": 0,
        "format": "ARFF",
        "version": 1,
        "uploader": "Joaquin Vanschoren",
        "status": "active"
      },
      "links": {
        "openml_url": "https://www.openml.org/d/40996",
        "api_url": "https://www.openml.org/api/v1/json/data/40996"
      }
    }
  ]
}
```

### Use Cases

- **Dataset Discovery**: Find datasets relevant to your research topic
- **Literature Review**: Identify datasets used in specific domains
- **Machine Learning Exploration**: Discover datasets for testing algorithms
- **Benchmarking**: Find standard datasets for model comparison
- **Education**: Locate datasets for teaching and learning

### How It Works

1. **Fetch Datasets**: Retrieves dataset metadata from OpenML API
2. **Batch Embedding**: Converts query and all dataset descriptions to vectors in a single batch (efficient)
3. **Similarity Computation**: Calculates cosine similarity using sklearn's optimized implementation
4. **Ranking**: Sorts datasets by similarity score
5. **Return Top-K**: Returns the most relevant datasets

### Technical Details

- Uses **sentence-transformers** with model `paraphrase-multilingual-mpnet-base-v2`
- **Batch processing** for embeddings (batch_size=32) for efficiency
- **Cosine similarity** computed via `sklearn.metrics.pairwise.cosine_similarity`
- Similarity scores fall between -1 and 1 (standard cosine similarity range)

### Usage Example

```
Can you find datasets about medical diagnosis? 
+``` + +--- + +## OpenML Dataset Download + +### Features + +- Download datasets by OpenML ID +- Automatically saves as CSV format +- Comprehensive metadata extraction +- Feature information (categorical/numeric classification) +- File size reporting +- Returns absolute file path for chaining with other tools + +### Parameters + +- `dataset_id` (required): OpenML dataset ID (e.g., 40927 for MNIST) +- `save_dir` (optional, default="./datasets"): Directory to save the CSV file + - Automatically creates directory if it doesn't exist + - Filename format: `{dataset_name}_{dataset_id}.csv` + +### Example Output + +```json +{ + "status": "success", + "dataset_id": 40927, + "dataset_path": "/absolute/path/to/datasets/mnist_784_40927.csv", + "metadata": { + "dataset_id": 40927, + "name": "mnist_784", + "description": "The MNIST database of handwritten digits...", + "version": 1, + "format": "ARFF", + "default_target_attribute": "class", + "openml_url": "https://www.openml.org/d/40927", + "num_features": 784, + "num_instances": 70000 + }, + "data_info": { + "saved_to_disk": true, + "save_path": "/absolute/path/to/datasets/mnist_784_40927.csv", + "file_size": "109.35 MB", + "file_size_bytes": 114683392, + "shape": { + "features": [70000, 784], + "target": [70000] + }, + "feature_names": ["pixel_0_0", "pixel_0_1", "..."], + "target_name": "class", + "categorical_features": [], + "numeric_features": ["pixel_0_0", "pixel_0_1", "..."] + } +} +``` + +### Usage Example + +``` +Download dataset 40927 +``` + +``` +Download dataset 31 and save to ./my_datasets directory +``` + +--- + +## OpenML KNN Training + +### Features + +- **Automatic task detection**: Classifies as regression or classification based on target variable +- **Cross-validation**: Stratified K-Fold for classification, regular K-Fold for regression +- **Hyperparameter tuning**: Grid search over k-neighbors values +- **Multiple metrics**: Comprehensive evaluation metrics for both task types +- **Pipeline-based**: Includes StandardScaler for feature normalization +- **Model persistence**: Optionally save trained model with joblib + +### Parameters + +- `data_path` (required): Path to CSV dataset file +- `target_column` (required): Name of the target column +- `n_neighbors_range` (optional, default=[3, 5, 7, 9, 11]): List of k values to test +- `cv_folds` (optional, default=5): Number of cross-validation folds +- `random_state` (optional, default=42): Random seed for reproducibility +- `metric` (optional): Distance metric ('euclidean', 'manhattan', 'minkowski', 'chebyshev') +- `weights` (optional, default='uniform'): Weight function ('uniform' or 'distance') +- `save_model_path` (optional): Path to save the trained model + +### Example Output (Classification) + +```json +{ + "status": "success", + "task_type": "classification", + "dataset_info": { + "data_path": "/path/to/dataset.csv", + "total_samples": 1000, + "num_features": 20, + "target_column": "label", + "num_classes": 3, + "cv_folds": 5 + }, + "best_parameters": { + "n_neighbors": 7, + "weights": "uniform", + "metric": "minkowski" + }, + "hyperparameter_search": { + "best_score": 0.9234, + "all_params_scores": [ + { + "params": {"n_neighbors": 3, "weights": "uniform"}, + "mean_score": 0.9123, + "std_score": 0.0234 + }, + { + "params": {"n_neighbors": 5, "weights": "uniform"}, + "mean_score": 0.9201, + "std_score": 0.0198 + }, + { + "params": {"n_neighbors": 7, "weights": "uniform"}, + "mean_score": 0.9234, + "std_score": 0.0212 + } + ] + }, + "cross_validation_metrics": { + "accuracy": { 
+ "mean": 0.9234, + "std": 0.0212 + }, + "precision": { + "mean": 0.9187, + "std": 0.0223 + }, + "recall": { + "mean": 0.9201, + "std": 0.0198 + }, + "f1_score": { + "mean": 0.9193, + "std": 0.0205 + } + }, + "model_info": { + "saved": true, + "save_path": "/path/to/model.joblib", + "file_size_bytes": 1024 + } +} +``` + +### Example Output (Regression) + +```json +{ + "status": "success", + "task_type": "regression", + "cross_validation_metrics": { + "mse": { + "mean": 12.45, + "std": 2.34 + }, + "rmse": { + "mean": 3.53, + "std": 2.34 + }, + "mae": { + "mean": 2.76, + "std": 0.89 + }, + "r2_score": { + "mean": 0.8765, + "std": 0.0234 + } + } +} +``` + +### Usage Example + +``` +Train a KNN model on the dataset at ./datasets/iris.csv with target column 'species' +``` + +``` +Train KNN with k values [5, 10, 15] on ./data/housing.csv, target 'price', and save the model +``` + +### How It Works + +1. **Load Data**: Reads CSV file into DataFrame +2. **Preprocessing**: + - Handles missing values (mean for numeric, mode for categorical) + - Encodes categorical features using LabelEncoder + - Encodes target variable if classification +3. **Task Detection**: Automatically determines classification vs regression +4. **Cross-Validation Setup**: Creates stratified or regular K-Fold based on task +5. **Grid Search**: Tests all hyperparameter combinations using CV +6. **Metric Extraction**: Extracts all metrics from single GridSearchCV run +7. **Model Training**: Trains final model on all data with best parameters +8. **Save**: Optionally saves model pipeline and encoders + +### Technical Details + +- **Pipeline**: StandardScaler → KNN (ensures proper scaling in CV) +- **Single CV Run**: Uses GridSearchCV with multiple metrics (efficient) +- **Stratified CV**: Preserves class distribution in classification tasks +- **Feature Encoding**: Automatic handling of categorical variables +- **Model Package**: Saves pipeline, encoders, and metadata together + +--- + +## General Information + +### Dependencies + +**All tools:** +```bash +pip install openml pandas numpy scikit-learn joblib sentence-transformers +``` + +**Breakdown:** +- `openml`: Dataset access +- `pandas`, `numpy`: Data manipulation +- `scikit-learn`: Machine learning algorithms and metrics +- `joblib`: Model serialization +- `sentence-transformers`: Embeddings for search + +### Environment Variables + +No environment variables required. All tools use local libraries. + +### Performance Considerations + +**Search Tool:** +- **First Run**: May take longer due to model loading +- **Embedding Cache**: Consider caching embeddings for frequently searched datasets +- **Dataset Limit**: Adjust `max_datasets` parameter to balance speed vs. 
coverage + +**Download Tool:** +- **Large Datasets**: May take time to download and save +- **Memory Usage**: Large datasets load into memory before saving +- **CSV Format**: Always saves as CSV for compatibility + +**Training Tool:** +- **Cross-Validation**: Single GridSearchCV run computes all metrics efficiently +- **Memory Usage**: Entire dataset loaded into memory +- **Parallel Processing**: Uses `n_jobs=-1` for parallel CV +- **Large K Range**: More k values = longer training time + +### Troubleshooting + +#### Missing Dependencies + +```bash +pip install openml pandas numpy scikit-learn joblib sentence-transformers +``` + +#### Slow Performance (Search) + +- Reduce `max_datasets` parameter (default is now 100) +- Batch embedding is already optimized +- First run is slower due to model loading + +#### Download Errors + +- Verify dataset ID exists on OpenML +- Ensure sufficient disk space for large datasets +- Check write permissions for save_dir directory + +#### Training Errors + +- Verify target column name exists in dataset +- Check for sufficient samples (at least 2x cv_folds) +- Ensure dataset doesn't have all missing values +- For large datasets, reduce cv_folds or k range + +## Additional Resources + +- **[OpenML Website](https://www.openml.org/)** - Browse datasets online +- **[OpenML Python API Documentation](https://openml.github.io/openml-python/)** - Official API docs +- **[Sentence Transformers](https://www.sbert.net/)** - Embedding model documentation +- **[How to Create a Tool](../../docs/how_to_create_tool.md)** - Guide for creating your own tools diff --git a/agents4gov/tools/openml/openml_download.py b/agents4gov/tools/openml/openml_download.py new file mode 100644 index 0000000..e1986d5 --- /dev/null +++ b/agents4gov/tools/openml/openml_download.py @@ -0,0 +1,229 @@ +import os +import json +from typing import Optional +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _format_bytes(self, bytes_size: int) -> str: + """ + Format bytes to human-readable string. + + Args: + bytes_size: Size in bytes + + Returns: + Formatted string (e.g., "1.5 MB") + """ + for unit in ['B', 'KB', 'MB', 'GB']: + if bytes_size < 1024.0: + return f"{bytes_size:.2f} {unit}" + bytes_size /= 1024.0 + return f"{bytes_size:.2f} TB" + + def download_openml_dataset( + self, + dataset_id: int = Field( + ..., + description="The OpenML dataset ID to download (e.g., 40927 for MNIST)" + ), + save_dir: str = Field( + default="./datasets", + description="Directory to save the dataset CSV file (default: ./datasets)" + ) + ) -> str: + """ + Download a dataset from OpenML by its ID and save as CSV. + + This tool: + 1. Fetches dataset from OpenML + 2. Saves as CSV file with features (X) and target (y) + 3. 
Returns the saved file path and metadata + + Args: + dataset_id: OpenML dataset ID + save_dir: Directory to save the CSV file + + Returns: + JSON string with saved file path and metadata + """ + + try: + import openml + import pandas as pd + + # ======================================== + # STEP 1: FETCH DATASET METADATA + # ======================================== + + try: + dataset = openml.datasets.get_dataset(dataset_id, download_data=True) + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'dataset_not_found', + 'message': f'Dataset with ID {dataset_id} not found: {str(e)}', + 'dataset_id': dataset_id + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + # ======================================== + # STEP 2: EXTRACT METADATA + # ======================================== + + metadata = { + 'dataset_id': dataset.dataset_id, + 'name': dataset.name, + 'description': dataset.description, + 'version': dataset.version, + 'format': dataset.format, + 'upload_date': dataset.upload_date, + 'default_target_attribute': dataset.default_target_attribute, + 'row_id_attribute': dataset.row_id_attribute, + 'ignore_attributes': dataset.ignore_attribute, + 'language': dataset.language, + 'licence': dataset.licence, + 'url': dataset.url, + 'openml_url': f"https://www.openml.org/d/{dataset_id}" + } + + # Extract features information + if hasattr(dataset, 'features'): + features_info = [] + for feature_name, feature_data in dataset.features.items(): + features_info.append({ + 'name': feature_name, + 'data_type': feature_data.data_type, + 'is_target': feature_data.name == dataset.default_target_attribute, + 'is_ignore': feature_data.name in (dataset.ignore_attribute or []), + 'is_row_identifier': feature_data.name == dataset.row_id_attribute, + 'number_missing_values': feature_data.number_missing_values + }) + metadata['features'] = features_info + metadata['num_features'] = len([f for f in features_info if not f['is_target'] and not f['is_ignore']]) + metadata['num_instances'] = dataset.qualities.get('NumberOfInstances', 'unknown') + + # Extract qualities (statistics) + if hasattr(dataset, 'qualities') and dataset.qualities: + qualities = { + 'num_instances': dataset.qualities.get('NumberOfInstances'), + 'num_features': dataset.qualities.get('NumberOfFeatures'), + 'num_classes': dataset.qualities.get('NumberOfClasses'), + 'num_missing_values': dataset.qualities.get('NumberOfMissingValues'), + 'num_instances_with_missing_values': dataset.qualities.get('NumberOfInstancesWithMissingValues'), + 'num_numeric_features': dataset.qualities.get('NumberOfNumericFeatures'), + 'num_symbolic_features': dataset.qualities.get('NumberOfSymbolicFeatures') + } + metadata['qualities'] = qualities + + # ======================================== + # STEP 3: DOWNLOAD DATA + # ======================================== + + try: + # Get the data + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=dataset.default_target_attribute, + dataset_format='dataframe' + ) + + # Convert to DataFrames if not already + if not isinstance(X, pd.DataFrame): + X = pd.DataFrame(X, columns=attribute_names) + if not isinstance(y, pd.Series): + y = pd.Series(y, name=dataset.default_target_attribute) + + # Data information + data_info = { + 'shape': { + 'features': list(X.shape), + 'target': list(y.shape) + }, + 'feature_names': list(X.columns), + 'target_name': dataset.default_target_attribute, + 'categorical_features': [attr for attr, is_cat in zip(attribute_names, 
categorical_indicator) if is_cat], + 'numeric_features': [attr for attr, is_cat in zip(attribute_names, categorical_indicator) if not is_cat] + } + + # ======================================== + # STEP 4: SAVE TO DISK + # ======================================== + + try: + # Create directory if it doesn't exist + os.makedirs(save_dir, exist_ok=True) + + # Create filename from dataset name + safe_name = "".join(c if c.isalnum() or c in ('-', '_') else '_' for c in dataset.name) + filename = f"{safe_name}_{dataset_id}.csv" + save_path = os.path.join(save_dir, filename) + + # Combine X and y into single dataframe + data_df = X.copy() + data_df[dataset.default_target_attribute] = y + + data_df.to_csv(save_path, index=False) + + # Get file size + file_size = os.path.getsize(save_path) + + data_info['saved_to_disk'] = True + data_info['save_path'] = os.path.abspath(save_path) + data_info['file_size'] = self._format_bytes(file_size) + data_info['file_size_bytes'] = file_size + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'save_error', + 'message': f'Error saving dataset to disk: {str(e)}', + 'dataset_id': dataset_id + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'data_download_error', + 'message': f'Error downloading dataset data: {str(e)}', + 'dataset_id': dataset_id + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + # ======================================== + # BUILD RESPONSE + # ======================================== + + result = { + 'status': 'success', + 'dataset_id': dataset_id, + 'dataset_path': data_info['save_path'], + 'metadata': metadata, + 'data_info': data_info + } + + return json.dumps(result, ensure_ascii=False, indent=2) + + # ======================================== + # ERROR HANDLING + # ======================================== + + except ImportError as e: + error_result = { + 'status': 'error', + 'error_type': 'missing_dependency', + 'message': f'Required package not installed: {str(e)}. Please install with: pip install openml pandas', + 'dataset_id': dataset_id + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error: {str(e)}', + 'dataset_id': dataset_id + } + return json.dumps(error_result, ensure_ascii=False, indent=2) diff --git a/agents4gov/tools/openml/openml_knn_train.py b/agents4gov/tools/openml/openml_knn_train.py new file mode 100644 index 0000000..bee917b --- /dev/null +++ b/agents4gov/tools/openml/openml_knn_train.py @@ -0,0 +1,404 @@ +import os +import json +import numpy as np +from typing import Optional, List +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _determine_task_type(self, y) -> str: + """ + Determine if the task is classification or regression. 
+ + Args: + y: Target variable + + Returns: + 'classification' or 'regression' + """ + # Check if target is numeric + if y.dtype == 'object' or y.dtype.name == 'category': + return 'classification' + + return 'regression' + + def train_knn_with_cv( + self, + data_path: str = Field( + ..., + description="Path to the dataset file (CSV, Parquet, or JSON) downloaded using openml_download.py" + ), + target_column: str = Field( + ..., + description="Name of the target column in the dataset" + ), + n_neighbors_range: List[int] = Field( + default=[3, 5, 7, 9, 11], + description="List of k values to test for KNN (default: [3, 5, 7, 9, 11])" + ), + cv_folds: int = Field( + default=5, + description="Number of cross-validation folds (default: 5)" + ), + random_state: int = Field( + default=42, + description="Random seed for reproducibility (default: 42)" + ), + metric: Optional[str] = Field( + default=None, + description="Distance metric for KNN (default: 'minkowski' for both tasks). Options: 'euclidean', 'manhattan', 'minkowski', 'chebyshev'" + ), + weights: str = Field( + default='uniform', + description="Weight function for KNN (default: 'uniform'). Options: 'uniform', 'distance'" + ), + save_model_path: Optional[str] = Field( + default=None, + description="Optional path to save the trained model using joblib" + ) + ) -> str: + """ + Train a KNN model with hyperparameter tuning using cross-validation. + + This tool: + 1. Loads the dataset from the specified path + 2. Automatically detects if it's a classification or regression task + 3. Performs cross-validation with hyperparameter tuning: + - Stratified K-Fold for classification + - Regular K-Fold for regression + 4. Tunes the number of neighbors (k) + 5. Returns mean metrics across all folds + 6. Optionally saves the best model trained on all data + + Args: + data_path: Path to the dataset file + target_column: Name of the target variable + n_neighbors_range: List of k values to test + cv_folds: Number of cross-validation folds + random_state: Random seed + metric: Distance metric for KNN + weights: Weight function for KNN + save_model_path: Path to save the model + + Returns: + JSON string with cross-validation results, best parameters, and mean metrics + """ + + try: + import pandas as pd + import numpy as np + from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV + from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor + from sklearn.preprocessing import StandardScaler, LabelEncoder + from sklearn.pipeline import Pipeline + import joblib + + # ======================================== + # STEP 1: LOAD DATASET + # ======================================== + + if not os.path.exists(data_path): + error_result = { + 'status': 'error', + 'error_type': 'file_not_found', + 'message': f'Dataset file not found: {data_path}', + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + try: + # Load based on file extension + if data_path.endswith('.csv'): + df = pd.read_csv(data_path) + elif data_path.endswith('.parquet'): + df = pd.read_parquet(data_path) + elif data_path.endswith('.json'): + df = pd.read_json(data_path) + else: + error_result = { + 'status': 'error', + 'error_type': 'unsupported_format', + 'message': f'Unsupported file format. 
Please use CSV, Parquet, or JSON.', + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'load_error', + 'message': f'Error loading dataset: {str(e)}', + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + # ======================================== + # STEP 2: VALIDATE TARGET COLUMN + # ======================================== + + if target_column not in df.columns: + error_result = { + 'status': 'error', + 'error_type': 'column_not_found', + 'message': f'Target column "{target_column}" not found in dataset', + 'available_columns': list(df.columns), + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + # ======================================== + # STEP 3: PREPARE DATA + # ======================================== + + # Separate features and target + X = df.drop(columns=[target_column]) + y = df[target_column] + + # Handle missing values + if X.isnull().any().any(): + # Simple imputation: fill numeric with mean, categorical with mode + for col in X.columns: + if np.issubdtype(X[col].dtype, np.number): + X[col].fillna(X[col].mean(), inplace=True) + else: + X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else 'missing', inplace=True) + + if y.isnull().any(): + y.fillna(y.mode()[0] if not y.mode().empty else 0, inplace=True) + + # Encode categorical features + label_encoders = {} + for col in X.columns: + if X[col].dtype == 'object' or X[col].dtype.name == 'category': + le = LabelEncoder() + X[col] = le.fit_transform(X[col].astype(str)) + label_encoders[col] = le + + # ======================================== + # STEP 4: DETERMINE TASK TYPE + # ======================================== + + task_type = self._determine_task_type(y) + + # Encode target if classification + target_encoder = None + if task_type == 'classification': + if y.dtype == 'object' or y.dtype.name == 'category': + target_encoder = LabelEncoder() + y = target_encoder.fit_transform(y.astype(str)) + + # ======================================== + # STEP 5: SETUP CROSS-VALIDATION AND PIPELINE + # ======================================== + + if task_type == 'classification': + cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state) + model = KNeighborsClassifier() + scoring = { + 'accuracy': 'accuracy', + 'precision': 'precision_weighted', + 'recall': 'recall_weighted', + 'f1': 'f1_weighted' + } + refit_metric = 'accuracy' + else: + cv = KFold(n_splits=cv_folds, shuffle=True, random_state=random_state) + model = KNeighborsRegressor() + scoring = { + 'neg_mse': 'neg_mean_squared_error', + 'neg_mae': 'neg_mean_absolute_error', + 'r2': 'r2' + } + refit_metric = 'neg_mse' + + # Create pipeline with scaler and model + pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('knn', model) + ]) + + # ======================================== + # STEP 6: HYPERPARAMETER TUNING WITH CV + # ======================================== + + # Create parameter grid (add 'knn__' prefix for pipeline) + param_grid = { + 'knn__n_neighbors': n_neighbors_range, + 'knn__weights': [weights] + } + + if metric: + param_grid['knn__metric'] = [metric] + + # Perform grid search with cross-validation (computes all metrics) + grid_search = GridSearchCV( + estimator=pipeline, + param_grid=param_grid, + cv=cv, + scoring=scoring, + refit=refit_metric, + n_jobs=-1, + verbose=0, + return_train_score=False + ) + + # Convert all 
data to numpy arrays + X_array = X.values if hasattr(X, 'values') else X + y_array = y.values if hasattr(y, 'values') else y + + grid_search.fit(X_array, y_array) + + # Get best model and parameters + best_pipeline = grid_search.best_estimator_ + best_params = grid_search.best_params_ + + # Remove 'knn__' prefix from params for cleaner output + best_params_clean = {k.replace('knn__', ''): v for k, v in best_params.items()} + + # ======================================== + # STEP 7: EXTRACT METRICS FROM GRID SEARCH CV + # ======================================== + + # Get the index of the best estimator + best_index = grid_search.best_index_ + + # Extract CV scores for the best model + metrics = {} + + if task_type == 'classification': + metrics['accuracy'] = { + 'mean': float(grid_search.cv_results_[f'mean_test_accuracy'][best_index]), + 'std': float(grid_search.cv_results_[f'std_test_accuracy'][best_index]) + } + metrics['precision'] = { + 'mean': float(grid_search.cv_results_[f'mean_test_precision'][best_index]), + 'std': float(grid_search.cv_results_[f'std_test_precision'][best_index]) + } + metrics['recall'] = { + 'mean': float(grid_search.cv_results_[f'mean_test_recall'][best_index]), + 'std': float(grid_search.cv_results_[f'std_test_recall'][best_index]) + } + metrics['f1_score'] = { + 'mean': float(grid_search.cv_results_[f'mean_test_f1'][best_index]), + 'std': float(grid_search.cv_results_[f'std_test_f1'][best_index]) + } + else: # regression + mse_mean = -float(grid_search.cv_results_[f'mean_test_neg_mse'][best_index]) + mse_std = float(grid_search.cv_results_[f'std_test_neg_mse'][best_index]) + mae_mean = -float(grid_search.cv_results_[f'mean_test_neg_mae'][best_index]) + mae_std = float(grid_search.cv_results_[f'std_test_neg_mae'][best_index]) + r2_mean = float(grid_search.cv_results_[f'mean_test_r2'][best_index]) + r2_std = float(grid_search.cv_results_[f'std_test_r2'][best_index]) + + metrics['mse'] = { + 'mean': mse_mean, + 'std': mse_std + } + metrics['rmse'] = { + 'mean': float(np.sqrt(mse_mean)), + 'std': mse_std # Approximate std for RMSE + } + metrics['mae'] = { + 'mean': mae_mean, + 'std': mae_std + } + metrics['r2_score'] = { + 'mean': r2_mean, + 'std': r2_std + } + + # ======================================== + # STEP 8: HYPERPARAMETER SEARCH RESULTS + # ======================================== + + cv_results = { + 'best_score': float(grid_search.best_score_), + 'all_params_scores': [ + { + 'params': {k.replace('knn__', ''): v for k, v in params.items()}, + 'mean_score': float(score), + 'std_score': float(std) + } + for params, score, std in zip( + grid_search.cv_results_['params'], + grid_search.cv_results_[f'mean_test_{refit_metric}'], + grid_search.cv_results_[f'std_test_{refit_metric}'] + ) + ] + } + + # ======================================== + # STEP 9: SAVE MODEL (if requested) + # ======================================== + + model_info = {} + if save_model_path: + try: + # Create directory if needed + os.makedirs(os.path.dirname(save_model_path) if os.path.dirname(save_model_path) else '.', exist_ok=True) + + # Save pipeline and encoders + model_package = { + 'pipeline': best_pipeline, # Already includes scaler + 'label_encoders': label_encoders, + 'target_encoder': target_encoder, + 'task_type': task_type, + 'feature_names': list(X.columns), + 'best_params': best_params_clean + } + + joblib.dump(model_package, save_model_path) + + model_info['saved'] = True + model_info['save_path'] = save_model_path + model_info['file_size_bytes'] = 
os.path.getsize(save_model_path) + + except Exception as e: + model_info['save_error'] = str(e) + + # ======================================== + # BUILD RESPONSE + # ======================================== + + result = { + 'status': 'success', + 'task_type': task_type, + 'dataset_info': { + 'data_path': data_path, + 'total_samples': len(df), + 'num_features': X.shape[1], + 'target_column': target_column, + 'num_classes': int(len(np.unique(y))) if task_type == 'classification' else None, + 'cv_folds': cv_folds + }, + 'best_parameters': best_params_clean, + 'hyperparameter_search': cv_results, + 'cross_validation_metrics': metrics, + 'model_info': model_info if model_info else None + } + + return json.dumps(result, ensure_ascii=False, indent=2) + + # ======================================== + # ERROR HANDLING + # ======================================== + + except ImportError as e: + error_result = { + 'status': 'error', + 'error_type': 'missing_dependency', + 'message': f'Required package not installed: {str(e)}. Please install with: pip install scikit-learn pandas joblib', + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error during training: {str(e)}', + 'data_path': data_path + } + return json.dumps(error_result, ensure_ascii=False, indent=2) diff --git a/agents4gov/tools/openml/openml_search.py b/agents4gov/tools/openml/openml_search.py new file mode 100644 index 0000000..3b27a66 --- /dev/null +++ b/agents4gov/tools/openml/openml_search.py @@ -0,0 +1,279 @@ +import json +import numpy as np +from typing import List, Dict, Any +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _compute_cosine_similarity(self, query_vec: List[float], dataset_vecs: List[List[float]]) -> np.ndarray: + """ + Compute cosine similarity between query vector and multiple dataset vectors. + + Args: + query_vec: Query vector + dataset_vecs: List of dataset vectors + + Returns: + Array of cosine similarity scores + """ + from sklearn.metrics.pairwise import cosine_similarity + + query_array = np.array(query_vec).reshape(1, -1) + dataset_array = np.array(dataset_vecs) + + similarities = cosine_similarity(query_array, dataset_array) + return similarities[0] + + def _get_embeddings_batch(self, texts: List[str]) -> List[List[float]]: + """ + Get text embeddings for a batch of texts using a local embedding model. + + Args: + texts: List of texts to embed + + Returns: + List of embedding vectors + """ + try: + from sentence_transformers import SentenceTransformer + + # Load model (you can cache this in __init__ for better performance) + model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2') + + # Encode batch (much faster than encoding one by one) + embeddings = model.encode(texts, show_progress_bar=False, batch_size=32) + return embeddings.tolist() + + except Exception as e: + raise RuntimeError( + f"No embedding service available. Please install sentence-transformers " + f"(pip install sentence-transformers) Error: {str(e)}" + ) + + def _fetch_openml_datasets(self, limit: int = 1000) -> List[Dict[str, Any]]: + """ + Fetch datasets from OpenML API. 
+ + Args: + limit: Maximum number of datasets to fetch + + Returns: + List of dataset dictionaries with metadata + """ + try: + import openml + + # List datasets with relevant metadata + datasets_df = openml.datasets.list_datasets(output_format='dataframe') + + # Limit the number of datasets + datasets_df = datasets_df.head(limit) + + # Convert to list of dictionaries + datasets = [] + for idx, row in datasets_df.iterrows(): + dataset_dict = { + 'did': int(row['did']), + 'name': row.get('name', ''), + 'description': row.get('description', ''), + 'format': row.get('format', ''), + 'uploader': row.get('uploader', ''), + 'version': row.get('version', 1), + 'status': row.get('status', ''), + 'NumberOfInstances': row.get('NumberOfInstances', 0), + 'NumberOfFeatures': row.get('NumberOfFeatures', 0), + 'NumberOfClasses': row.get('NumberOfClasses', 0), + 'NumberOfMissingValues': row.get('NumberOfMissingValues', 0), + } + datasets.append(dataset_dict) + + return datasets + + except ImportError: + raise RuntimeError( + "OpenML package not installed. Please install it with: pip install openml" + ) + except Exception as e: + raise RuntimeError(f"Error fetching OpenML datasets: {str(e)}") + + def _create_dataset_text(self, dataset: Dict[str, Any]) -> str: + """ + Create a text representation of a dataset for embedding. + + Args: + dataset: Dataset dictionary + + Returns: + Text representation combining name and description + """ + name = dataset.get('name', '') + description = dataset.get('description', '') + + # Combine name and description for richer semantic matching + text = f"{name}. {description}" + + # Truncate if too long (optional, depends on embedding model limits) + max_length = 512 + if len(text) > max_length: + text = text[:max_length] + + return text + + def search_openml_datasets( + self, + query: str = Field( + ..., + description="Natural language query to search for datasets (e.g., 'image classification datasets', 'medical diagnosis data', 'time series weather')" + ), + top_k: int = Field( + default=5, + description="Number of top similar datasets to return (default: 5)" + ), + max_datasets: int = Field( + default=100, + description="Maximum number of datasets to search through (default: 100)" + ) + ) -> str: + """ + Search for OpenML datasets using semantic similarity based on embeddings. + + This tool: + 1. Fetches datasets from OpenML + 2. Embeds the user query + 3. Embeds dataset names and descriptions + 4. Computes cosine similarity between query and datasets + 5. 
Returns top-k most similar datasets + + Args: + query: Natural language search query + top_k: Number of top results to return + max_datasets: Maximum number of datasets to search + + Returns: + JSON string with top-k most similar datasets and their metadata + """ + + try: + # ======================================== + # STEP 1: FETCH DATASETS + # ======================================== + + datasets = self._fetch_openml_datasets(limit=max_datasets) + + if not datasets: + return json.dumps({ + 'status': 'error', + 'message': 'No datasets found in OpenML' + }, ensure_ascii=False, indent=2) + + # ======================================== + # STEP 2: CREATE DATASET TEXTS + # ======================================== + + dataset_texts = [self._create_dataset_text(dataset) for dataset in datasets] + + # ======================================== + # STEP 3: EMBED QUERY AND DATASETS (BATCH) + # ======================================== + + # Combine query with dataset texts for batch embedding + all_texts = [query] + dataset_texts + all_embeddings = self._get_embeddings_batch(all_texts) + + # Extract query embedding and dataset embeddings + query_embedding = all_embeddings[0] + dataset_embeddings = all_embeddings[1:] + + # ======================================== + # STEP 4: COMPUTE SIMILARITIES + # ======================================== + + similarity_scores = self._compute_cosine_similarity(query_embedding, dataset_embeddings) + + similarities = [ + { + 'dataset': dataset, + 'similarity': float(score) + } + for dataset, score in zip(datasets, similarity_scores) + ] + + # ======================================== + # STEP 5: SORT AND SELECT TOP-K + # ======================================== + + # Sort by similarity (descending) + similarities.sort(key=lambda x: x['similarity'], reverse=True) + + # Select top-k + top_results = similarities[:top_k] + + # ======================================== + # STEP 6: FORMAT RESULTS + # ======================================== + + results = [] + for item in top_results: + dataset = item['dataset'] + similarity = item['similarity'] + + results.append({ + 'dataset_id': dataset['did'], + 'name': dataset['name'], + 'description': dataset['description'][:200] + '...' 
if len(dataset.get('description', '')) > 200 else dataset.get('description', ''), + 'similarity_score': round(similarity, 4), + 'metadata': { + 'num_instances': dataset.get('NumberOfInstances', 0), + 'num_features': dataset.get('NumberOfFeatures', 0), + 'num_classes': dataset.get('NumberOfClasses', 0), + 'num_missing_values': dataset.get('NumberOfMissingValues', 0), + 'format': dataset.get('format', ''), + 'version': dataset.get('version', 1), + 'uploader': dataset.get('uploader', ''), + 'status': dataset.get('status', '') + }, + 'links': { + 'openml_url': f"https://www.openml.org/d/{dataset['did']}", + 'api_url': f"https://www.openml.org/api/v1/json/data/{dataset['did']}" + } + }) + + # ======================================== + # RETURN STRUCTURED RESPONSE + # ======================================== + + response = { + 'status': 'success', + 'query': query, + 'top_k': top_k, + 'total_searched': len(similarities), + 'results': results + } + + return json.dumps(response, ensure_ascii=False, indent=2) + + # ======================================== + # ERROR HANDLING + # ======================================== + + except RuntimeError as e: + # Handle specific runtime errors (missing dependencies, API issues) + error_result = { + 'status': 'error', + 'error_type': 'runtime_error', + 'message': str(e), + 'query': query + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + # Handle any other unexpected errors + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error during search: {str(e)}', + 'query': query + } + return json.dumps(error_result, ensure_ascii=False, indent=2) diff --git a/logs.txt b/logs.txt new file mode 100644 index 0000000..b0f972c --- /dev/null +++ b/logs.txt @@ -0,0 +1,111 @@ +WARNING [cdp_use.client] Received duplicate response for request 691 - ignoring +INFO [Agent] +INFO [Agent] 📍 Step 19: +WARNING [tools] ⚠️ Element index 17440 not available - page may have changed. Try refreshing browser state. +INFO [Agent] 👍 Eval: Successfully clicked the 'Buscar' button again after re-entering the name. Verdict: Success. +INFO [Agent] 🧠 Memory: Currently on the CNPq search page after initiating a new search for Paulo Roberto Mann Marques Junior. I have inputted 'Paulo Roberto Mann Marques Junior' again into the search field and am ready to check for additional results. +INFO [Agent] 🎯 Next goal: Click the 'Buscar' button to execute the new search for Paulo Roberto Mann Marques Junior. +INFO [Agent] ▶️ click: index: 17440, coordinate_x: None, coordinate_y: None, force: False +WARNING [bubus] ⚠️ EventBus_08319417🟢(⏳ 0 | ▶️ 2 | ✅ 48 ➡️ 31 👂) handler browser_use.browser.watchdog_base.DOMWatchdog.on_BrowserStateRequestEvent() has been running for >15s on event. Possible slow processing or deadlock. +(handler could be trying to await its own result or could be blocked by another async task). +browser_use.browser.watchdog_base.DOMWatchdog.on_BrowserStateRequestEvent(?▶ BrowserStateRequestEvent#8312 🏃) +WARNING [bubus] ⚠️ EventBus_08319417🟢(⏳ 0 | ▶️ 2 | ✅ 48 ➡️ 31 👂) handler browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent() has been running for >15s on event. Possible slow processing or deadlock. +(handler could be trying to await its own result or could be blocked by another async task). 
+browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent(?▶ ScreenshotEvent#2d0e 🏃) +WARNING [bubus] ➡️ browser_use.browser.watchdog_base.DOMWatchdog.on_BrowserStateRequestEvent(#8312) ⏳ 15s/30s +WARNING [bubus] 📣 ScreenshotEvent#2d0e 15s +WARNING [bubus] ⏰ browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent(#2d0e) ⌛️ 15s/15s ⬅️ TIMEOUT HERE ⏰ +WARNING [bubus] +WARNING [bubus] ================================================================================ +================================================================================ +WARNING [bubus] ⏱️ TIMEOUT ERROR - Handling took more than 15.0s for EventBus_08319417.browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent(?▶ ScreenshotEvent#2d0e ✅) +WARNING [bubus] ================================================================================ +WARNING [bubus] 📣 BrowserStateRequestEvent#8312 15s +WARNING [bubus] ☑️ browser_use.browser.watchdog_base.DownloadsWatchdog.on_BrowserStateRequestEvent(#8312) 0s/30s ✓ +WARNING [bubus] 📣 NavigationCompleteEvent#27fa 15s +WARNING [bubus] ☑️ browser_use.browser.watchdog_base.DownloadsWatchdog.on_NavigationCompleteEvent(#27fa) 0s/30s ✓ +WARNING [bubus] ☑️ browser_use.browser.watchdog_base.SecurityWatchdog.on_NavigationCompleteEvent(#27fa) 0s/30s ✓ +WARNING [BrowserSession] 📸 Clean screenshot timed out after 6 seconds - no handler registered or slow page? +ERROR [BrowserSession] Exception in background task [capture_screenshot]: TimeoutError: Event handler browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent#1520(?▶ ScreenshotEvent#2d0e 🏃) timed out after 15.0s +Traceback (most recent call last): + File "/usr/local/lib/python3.11/asyncio/tasks.py", line 500, in wait_for + return fut.result() + ^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/browser_use/browser/watchdog_base.py", line 108, in unique_handler + result = await actual_handler(event) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/browser_use/observability.py", line 73, in async_wrapper + return await func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/browser_use/browser/watchdogs/screenshot_watchdog.py", line 60, in on_ScreenshotEvent + result = await cdp_session.cdp_client.send.Page.captureScreenshot(params=params, session_id=cdp_session.session_id) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/cdp_use/cdp/page/library.py", line 123, in captureScreenshot + return cast("CaptureScreenshotReturns", await self._client.send_raw( +The above exception was the direct cause of the following exception: + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Traceback (most recent call last): + File "/usr/local/lib/python3.11/site-packages/cdp_use/client.py", line 398, in send_raw + File "/usr/local/lib/python3.11/site-packages/browser_use/utils.py", line 363, in wrapper + return await future + result = await func(*args, **kwargs) + ^^^^^^^^^^^^ + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +asyncio.exceptions.CancelledError +The above exception was the direct cause of the following exception: +Traceback (most recent call last): + File "/usr/local/lib/python3.11/site-packages/bubus/service.py", line 1128, in execute_handler + result_value: Any = await asyncio.wait_for(handler_task, timeout=event_result.timeout) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File 
"/usr/local/lib/python3.11/asyncio/tasks.py", line 502, in wait_for + raise exceptions.TimeoutError() from exc +TimeoutError + File "/usr/local/lib/python3.11/site-packages/bubus/models.py", line 490, in event_results_filtered + await event_result + File "/usr/local/lib/python3.11/site-packages/bubus/models.py", line 889, in wait_for_handler_to_complete_and_return_result + raise self.error if isinstance(self.error, BaseException) else Exception(self.error) # pyright: ignore[reportUnnecessaryIsInstance] + File "/usr/local/lib/python3.11/site-packages/browser_use/observability.py", line 73, in async_wrapper + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/bubus/service.py", line 1070, in _execute_handlers + return await func(*args, **kwargs) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/browser_use/browser/watchdogs/dom_watchdog.py", line 696, in _capture_clean_screenshot + screenshot_b64 = await screenshot_event.event_result(raise_if_any=True, raise_if_none=True) + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/bubus/models.py", line 574, in event_result + valid_results = await self.event_results_filtered( + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + File "/usr/local/lib/python3.11/site-packages/bubus/models.py", line 516, in event_results_filtered + raise original_error + await self.execute_handler(event, handler, timeout=timeout) + File "/usr/local/lib/python3.11/site-packages/bubus/service.py", line 1194, in execute_handler + raise handler_timeout_error from e +TimeoutError: Event handler browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent#1520(?▶ ScreenshotEvent#2d0e 🏃) timed out after 15.0s +WARNING [BrowserSession] 🔍 DOMWatchdog.on_BrowserStateRequestEvent: Clean screenshot failed: Event handler browser_use.browser.watchdog_base.ScreenshotWatchdog.on_ScreenshotEvent#1520(?▶ ScreenshotEvent#2d0e 🏃) timed out after 15.0s +INFO [Agent] +INFO [Agent] 📍 Step 20: +INFO [Agent] 👍 Eval: The task of extracting academic data was not completed successfully as the specified Lattes ID was not found. Verdict: Failure. +INFO [Agent] 🧠 Memory: I navigated through the CNPq search page and attempted to find the Lattes CV for Paulo Roberto Mann Marques Junior multiple times but could not locate the specified Lattes ID. The search yielded no matching results, leading to a conclusion of 'profile_not_found'. +INFO [Agent] 🎯 Next goal: Prepare to call done with a 'profile_not_found' error response. +INFO [Agent] ▶️ done: text: {"warnings": ["profile_not_found"], "publications": [], "projects": [], "advising": [], "affiliations": [], "coauthors": [], "last_update": null}, success: False, files_to_display: None +INFO [Agent] +📄 Final Result: +INFO [Agent] +⚖️ Judge Verdict: ❌ FAIL + Failure Reason: The agent could not locate a profile matching the provided Lattes ID after searching multiple times, resulting in a 'profile_not_found' warning. Additionally, there were instances of clicking on unavailable elements and encountering a rate limit error, which hindered progress. These issues contributed to an incomplete task outcome. + The agent attempted to extract academic data for Paulo Roberto Mann Marques Junior but ultimately returned a 'profile_not_found' warning. The agent followed the navigation steps correctly, including waiting for the page to stabilize after each action. 
However, it failed to find a matching CV with the specified Lattes ID. The agent's trajectory included unnecessary repeated clicks and an error due to rate limiting, which indicates potential inefficiencies in tool usage. Overall, while the agent executed the steps as instructed, it did not achieve the task's goal of retrieving the required data. +INFO [Agent] +INFO [Agent] Did the Agent not work as expected? Let us fix this! +INFO [Agent] Open a short issue on GitHub: https://github.com/browser-use/browser-use/issues +INFO [cloud] 🌤️ Stopping cloud browser session: 622efe0a-e249-4c3f-86a8-a366fefffda1 +WARNING [BrowserSession] [SessionManager] Agent focus target 166C6017... detached! Current focus: None (already cleared). Auto-recovering by switching to another target... +WARNING [BrowserSession] [SessionManager] No tabs remain! Creating new tab for agent... +ERROR [BrowserSession] [SessionManager] ❌ Error during agent_focus recovery: RuntimeError: {'code': -32000, 'message': 'Failed to open new tab - no browser is open'} +INFO [BrowserSession] 📢 on_BrowserStopEvent - Calling reset() (force=True, keep_alive=None) +INFO [BrowserSession] [SessionManager] Cleared all owned data (targets, sessions, mappings) +INFO [BrowserSession] ✅ Browser session reset complete +WARNING [BrowserSession] Cannot navigate - browser not connected +WARNING [BrowserSession] Cannot navigate - browser not connected +WARNING [BrowserSession] Cannot navigate - browser not connected +INFO [BrowserSession] ✅ Browser session reset complete +INFO: 100.64.0.5:24514 - "POST /analyze HTTP/1.1" 200 OK \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ae27f9b..55482d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ requests pydantic open-webui -openml \ No newline at end of file diff --git a/tools/README.md b/tools/README.md index 8e00fff..01f1236 100644 --- a/tools/README.md +++ b/tools/README.md @@ -1,16 +1,73 @@ -# Tools +# Agents4Gov Tools -This directory contains tools that can be used by agents in the Agents4Gov framework. Each tool provides specific functionality that agents can call to perform tasks. +This directory contains all tool implementations for the Agents4Gov project. Each tool is designed to be imported and used within Open WebUI to provide specific functionality to LLM agents. ## Available Tools -### OpenAlex -- **[openalex/open_alex_doi.py](openalex/README.md)** - Retrieves metadata and impact indicators for scientific publications using DOI - -### OpenML -- **[openml/openml_search.py](openml/README.md)** - Search for machine learning datasets using semantic similarity with embeddings -- **[openml/openml_download.py](openml/README.md)** - Download datasets from OpenML by ID and save as CSV -- **[openml/openml_knn_train.py](openml/README.md)** - Train KNN models with hyperparameter tuning via cross-validation +### 1. OpenAlex DOI Metadata Retrieval + +**File:** `open_alex_doi.py` + +**Description:** Retrieves comprehensive metadata and impact indicators for scientific publications using their DOI (Digital Object Identifier) from the OpenAlex API. 
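The tool is normally invoked by the agent inside Open WebUI, but for a quick local sanity check the `Tools` class can also be called directly from Python. A minimal sketch is shown below; the import path is an assumption (adjust it to wherever `open_alex_doi.py` lives in your checkout) and the DOI is the placeholder example used throughout this README:

```python
# Hypothetical local smoke test for the OpenAlex tool.
# Assumes `requests` and `pydantic` are installed (see requirements.txt) and
# that open_alex_doi.py is importable from the current directory.
from open_alex_doi import Tools

tool = Tools()
result_json = tool.get_openalex_metadata_by_doi(doi="10.1371/journal.pone.0000000")
print(result_json)  # JSON string with "status", "metadata", and "impact_indicators"
```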
+ +**Main Method:** `get_openalex_metadata_by_doi(doi: str) -> str` + +**Features:** +- Fetches basic publication metadata (title, authors, venue, publication year) +- Retrieves citation counts and impact metrics +- Provides normalized percentile rankings +- Calculates Field-Weighted Citation Impact (FWCI) +- Handles multiple DOI formats (with or without prefixes) +- Returns structured JSON output + +**Parameters:** +- `doi` (required): The DOI of the publication (e.g., `10.1371/journal.pone.0000000`) + - Accepts formats: `10.1234/example`, `doi:10.1234/example`, `https://doi.org/10.1234/example` + +**Environment Variables:** +- `OPENALEX_EMAIL` (optional): Your email for polite pool access (faster and more reliable API responses) + +**Example Output:** +```json +{ + "status": "success", + "doi": "10.1371/journal.pone.0000000", + "openalex_id": "https://openalex.org/W2741809807", + "metadata": { + "title": "Example Publication Title", + "authors": ["Author One", "Author Two"], + "venue": "PLOS ONE", + "publication_year": 2020, + "publication_date": "2020-03-15", + "type": "journal-article" + }, + "impact_indicators": { + "cited_by_count": 42, + "citation_normalized_percentile": { + "value": 85.5, + "is_in_top_1_percent": false + }, + "cited_by_percentile_year": { + "min": 80, + "max": 90 + }, + "fwci": 1.5 + }, + "links": { + "doi_url": "https://doi.org/10.1371/journal.pone.0000000", + "openalex_url": "https://openalex.org/W2741809807" + } +} +``` + +**Use Cases:** +- Research impact analysis +- Literature review automation +- Citation metric extraction +- Publication verification +- Academic database integration + +--- ## How to Use Tools in Open WebUI @@ -20,7 +77,7 @@ This directory contains tools that can be used by agents in the Agents4Gov frame 2. Access the web interface at [http://localhost:8080](http://localhost:8080) 3. Navigate to **Workspace → Tools** 4. Click **Import Tool** or **+ Create Tool** -5. Copy and paste the content of the tool file +5. Copy and paste the content of the tool file (e.g., `open_alex_doi.py`) 6. Save and enable the tool 7. The tool will now be available for agents to use in conversations @@ -32,13 +89,29 @@ If Open WebUI supports file-based tool loading: 2. Restart Open WebUI to detect new tools 3. Enable the tool in the Tools settings +### Testing a Tool + +After importing, test the tool with a simple query: + +``` +Can you get metadata for the publication with DOI 10.1371/journal.pone.0000000? +``` + +The agent should automatically invoke the `get_openalex_metadata_by_doi` tool and return the structured results. + +--- + ## Tool Requirements +### General Requirements + All tools in this directory require: - **Python 3.11+** - **Open WebUI** installed and running - **pydantic** library for parameter validation +--- + ## Creating Your Own Tools Want to create a new tool? Follow our comprehensive guide: @@ -52,6 +125,62 @@ The tutorial covers: - Returning structured JSON data - Best practices and examples +**Quick Start Template:** + +```python +import json +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def my_tool_method( + self, + param: str = Field( + ..., + description="Description of parameter" + ) + ) -> str: + """ + Tool description. 
+ + Args: + param: Parameter description + + Returns: + JSON string with results + """ + try: + # Your logic here + result = { + 'status': 'success', + 'data': 'your data' + } + return json.dumps(result, ensure_ascii=False, indent=2) + except Exception as e: + error_result = { + 'status': 'error', + 'message': str(e) + } + return json.dumps(error_result, ensure_ascii=False, indent=2) +``` + +--- + +## Tool Development Best Practices + +1. **Clear Documentation**: Include comprehensive docstrings and parameter descriptions +2. **Error Handling**: Always catch and return structured errors as JSON +3. **Type Hints**: Use Python type hints for all parameters and return values +4. **Structured Output**: Return JSON strings with consistent `status` fields +5. **Environment Variables**: Use env vars for API keys and configuration +6. **Timeouts**: Set timeouts on external API calls +7. **Validation**: Validate and clean input data before processing +8. **Testing**: Test with various inputs including edge cases + +--- + ## Troubleshooting ### Tool Not Appearing in Open WebUI @@ -74,18 +203,30 @@ The tutorial covers: - Use Python 3.11+ for compatibility - Check that the tool file is valid Python code +--- + ## Contributing New Tools When adding a new tool to this directory: -1. **Create the tool file** following the structure in existing tools +1. **Create the tool file** following the structure in `open_alex_doi.py` 2. **Test thoroughly** with various inputs and edge cases -3. **Document the tool** with a README.md in its subdirectory -4. **Add it to this README** under "Available Tools" -5. **Follow best practices** outlined in the [tutorial](../docs/how_to_create_tool.md) +3. **Document the tool** in this README.md under "Available Tools" +4. **Add requirements** if the tool needs specific dependencies +5. **Include examples** showing expected input and output +6. **Follow best practices** outlined in the tutorial + +--- ## Additional Resources -- **[Tool Creation Tutorial](../docs/how_to_create_tool.md)** - Step-by-step guide for creating tools -- **[Open WebUI Tools Guide](https://docs.openwebui.com/features/plugin/tools)** - Official Open WebUI tools documentation - **[Project Documentation](../docs/README.md)** - Main documentation hub +- **[Tool Creation Tutorial](../docs/how_to_create_tool.md)** - Step-by-step guide +- **[Open WebUI Tools Guide](https://docs.openwebui.com/features/plugin/tools)** - Official Open WebUI tools documentation +- **[OpenAlex API Documentation](https://docs.openalex.org/)** - For the OpenAlex tool specifically + +--- + +## License + +All tools in this directory are part of the Agents4Gov project and are licensed under the **MIT License**. diff --git a/tools/cnpq_lattes_navigator/README.md b/tools/cnpq_lattes_navigator/README.md new file mode 100644 index 0000000..8184d03 --- /dev/null +++ b/tools/cnpq_lattes_navigator/README.md @@ -0,0 +1,252 @@ +# CNPq/Lattes Navigator + +Detects Conflicts of Interest (COI) and summarizes academic production from public CNPq/Lattes profiles using browser automation. 
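The rule checks listed under "COI Rules" below ultimately compare normalized researcher names and publication titles across profiles. A condensed sketch of that matching heuristic (mirroring `_normalize_name` / `_names_match` in `api/lattes_navigator.py`, which carries the full accent table):

```python
# Condensed name-matching heuristic used by the COI rules (illustrative).
import re

def normalize(name: str) -> str:
    name = re.sub(r"\s+", " ", name.lower().strip())
    for accented, plain in [("á", "a"), ("ã", "a"), ("é", "e"), ("í", "i"), ("ó", "o"), ("ç", "c")]:
        name = name.replace(accented, plain)  # strip common Portuguese accents
    return name

def names_match(n1: str, n2: str) -> tuple[bool, str]:
    a, b = normalize(n1), normalize(n2)
    if a == b:
        return True, "high"      # exact match after normalization
    if a in b or b in a:
        return True, "medium"    # one form contained in the other (e.g. missing middle names)
    pa, pb = a.split(), b.split()
    if pa and pb and pa[-1] == pb[-1]:
        return True, "medium"    # same surname
    return False, "low"
```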
+ +## Structure + +``` +cnpq_lattes_navigator/ +├── api/ # FastAPI service +│ ├── Dockerfile +│ ├── main.py +│ ├── lattes_navigator.py +│ └── requirements.txt +├── tool/ # Open WebUI tool module +│ ├── Dockerfile +│ ├── lattes_navigator.py +│ └── requirements.txt +├── schema.json +├── TESTING.md +└── examples/ +``` + +## Railway Deployment + +### Environment Variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| OPENAI_API_KEY | Yes | - | OpenAI API key for LLM | +| OPENAI_MODEL | No | gpt-4o-mini | Model to use | +| PORT | No | 8000 | Server port (auto-set by Railway) | +| BROWSER_USE_API_KEY | Yes | - | Browser-Use Cloud API key (get from cloud.browser-use.com) | +| BROWSER_USE_CLOUD | No | true | Use cloud browser for stealth mode | + +### Deploy + +Point Railway to `tools/cnpq_lattes_navigator/api/` directory. + +## API Endpoints + +### GET /health + +Health check with system status. + +```bash +curl https://lattes-navigator-api-production.up.railway.app/health +``` + +Response: +```json +{ + "status": "ok", + "browser_available": true, + "api_key_set": true, + "import_error": null +} +``` + +### GET /debug + +Import diagnostics. + +```bash +curl https://lattes-navigator-api-production.up.railway.app/debug +``` + +### POST /analyze + +Analyze researchers for COI (pairwise analysis). + +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/analyze \ + -H "Content-Type: application/json" \ + -d '{ + "researchers": [ + {"name": "Ricardo Marcacini", "lattes_id": "3272611282260295"}, + {"name": "Matheus Yasuo", "lattes_id": "6191612710855387"} + ], + "time_window": 5, + "coi_rules": {"R1": true, "R2": true, "R3": true, "R4": true, "R5": true, "R6": true, "R7": true} + }' +``` + +### POST /validate-committee + +Validate academic committee for conflicts of interest. Analyzes COI only between student and non-advisor committee members. 
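Beyond `curl`, the endpoint can be exercised from Python. A minimal client sketch (it posts the request body shown next, taken from the example file used by the test commands further down; the long timeout is an assumption, since extraction drives a real browser):

```python
# Minimal client sketch for POST /validate-committee (mirrors the curl examples in this README).
import json
import requests

API_URL = "https://lattes-navigator-api-production.up.railway.app"

with open("tools/cnpq_lattes_navigator/examples/valid_committee.json", encoding="utf-8") as f:
    payload = json.load(f)

resp = requests.post(f"{API_URL}/validate-committee", json=payload, timeout=600)
resp.raise_for_status()
report = resp.json()

print(report["status"])                        # "valid" or "invalid"
for conflict in report["conflicts"]:
    print(conflict["member_name"], conflict["rules_triggered"])
```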
+ +**Request Body:** +```json +{ + "student": { + "name": "Matheus Yasuo Ribeiro Utino", + "lattes_id": "6191612710855387" + }, + "advisor": { + "name": "Ricardo Marcondes Marcacini", + "lattes_id": "3272611282260295" + }, + "committee_members": [ + { + "name": "Solange Oliveira Rezende", + "lattes_id": "8526960535874806", + "email": "solange@icmc.usp.br", + "institution": "ICMC-USP", + "role": "internal", + "is_president": false + }, + { + "name": "Paulo Roberto Mann Marques Júnior", + "lattes_id": "3571577377652346", + "email": "paulomann@ufrj.br", + "institution": "UFRJ", + "role": "external", + "is_president": false + } + ], + "thesis_title": "Unstructured Text Mining in the Era of Large Language Models", + "committee_type": "qualification", + "time_window": 5 +} +``` + +**Test Valid Committee:** +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/validate-committee \ + -H "Content-Type: application/json" \ + -d @tools/cnpq_lattes_navigator/examples/valid_committee.json +``` + +**Test Invalid Committee (with COI):** +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/validate-committee \ + -H "Content-Type: application/json" \ + -d @tools/cnpq_lattes_navigator/examples/invalid_committee.json +``` + +**Response (Valid Committee):** +```json +{ + "status": "valid", + "student": {...}, + "advisor": {...}, + "members_analysis": [ + { + "member": {...}, + "coi_detected": false, + "coi_details": [] + } + ], + "conflicts": [], + "collection_log": [ + "Extracting 1/5: Matheus Yasuo Ribeiro Utino (student)", + "Extracting 2/5: Ricardo Marcondes Marcacini (advisor)", + ... + ], + "summary": "Committee valid. Analyzed 4 members against student. No conflicts detected." +} +``` + +**Response (Invalid Committee):** +```json +{ + "status": "invalid", + "conflicts": [ + { + "student_name": "Matheus Yasuo Ribeiro Utino", + "member_name": "Paulo Roberto Mann Marques Júnior", + "member_role": "external", + "rules_triggered": ["R1"], + "confidence": "high", + "evidence": ["Shared: Paper Title (2024)"] + } + ], + "summary": "Committee INVALID. 1 conflict(s) detected with: Paulo Roberto Mann Marques Júnior." +} +``` + +## Test Procedures + +### 1. Verify Deployment + +```bash +# Health check +curl https://lattes-navigator-api-production.up.railway.app/health + +# Expected: browser_available: true, api_key_set: true +``` + +### 2. Check Imports + +```bash +# Debug imports +curl https://lattes-navigator-api-production.up.railway.app/debug + +# Expected: browser_use.Agent: OK, browser_use.ChatOpenAI: OK +``` + +### 3. Single Researcher Test + +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/analyze \ + -H "Content-Type: application/json" \ + -d '{"researchers": [{"name": "Test Name", "lattes_id": "0000000000000000"}], "time_window": 5}' +``` + +### 4. COI Detection Test + +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/analyze \ + -H "Content-Type: application/json" \ + -d '{"researchers": [{"name": "Researcher A", "lattes_id": "ID_A"}, {"name": "Researcher B", "lattes_id": "ID_B"}], "time_window": 5}' +``` + +### 5. 
Committee Validation Test + +**Test Valid Committee (no conflicts expected):** +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/validate-committee \ + -H "Content-Type: application/json" \ + -d @tools/cnpq_lattes_navigator/examples/valid_committee.json +``` + +**Test Invalid Committee (conflict expected with Paulo Mann):** +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/validate-committee \ + -H "Content-Type: application/json" \ + -d @tools/cnpq_lattes_navigator/examples/invalid_committee.json +``` + +**Expected Results:** +- Valid committee: `"status": "valid"`, `"conflicts": []` +- Invalid committee: `"status": "invalid"`, conflicts with Paulo Roberto Mann Marques Júnior + +## COI Rules + +| Rule | Description | +|------|-------------| +| R1 | Co-authorship (1+ shared publication) | +| R2 | Advisor-advisee relationship | +| R3 | Institutional overlap | +| R4 | Project overlap | +| R5 | Committee/event overlap | +| R6 | Frequent co-authorship (3+ publications) | +| R7 | Same lab/group | + +## Open WebUI Integration + +Copy `tool/lattes_navigator.py` content to Open WebUI Tools interface. + +## Test Results + +See [TESTING.md](TESTING.md) for detailed test documentation and results. diff --git a/tools/cnpq_lattes_navigator/TESTING.md b/tools/cnpq_lattes_navigator/TESTING.md new file mode 100644 index 0000000..efed754 --- /dev/null +++ b/tools/cnpq_lattes_navigator/TESTING.md @@ -0,0 +1,80 @@ +# CNPq/Lattes Navigator - Test Documentation + +### 1. Single Researcher Analysis + +**Endpoint**: `POST /analyze` + +**Command**: +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/analyze \ + -H "Content-Type: application/json" \ + -d '{ + "researchers": [{"name": "Ricardo Marcacini", "lattes_id": "4003190744770195"}], + "time_window": 5 + }' +``` + +**Result**: PASS (with expected limitation) +```json +{ + "status": "success", + "execution_metadata": { + "browser_use_available": true, + "num_researchers": 1, + "time_window_years": 5 + }, + "researchers": [{ + "person": { + "name": "Ricardo Marcacini", + "lattes_id": "4003190744770195", + "profile_url": "http://lattes.cnpq.br/4003190744770195" + }, + "warnings": ["captcha_blocked"], + "production_5y": {"publications": {"total": 0}} + }], + "summary_text": "Analyzed 1 researchers over 5 years. No COI detected." +} +``` + +**Notes**: +- Browser automation executes correctly +- JSON response parsing works +- Lattes platform blocks automated access with captcha + +--- + +### 2. COI Detection Test (Two Researchers) + +**Command**: +```bash +curl -X POST https://lattes-navigator-api-production.up.railway.app/analyze \ + -H "Content-Type: application/json" \ + -d '{ + "researchers": [ + {"name": "Ricardo Marcacini", "lattes_id": "4003190744770195"}, + {"name": "Solange Rezende", "lattes_id": "1458324546544936"} + ], + "time_window": 5 + }' +``` + +**Expected**: Both researchers return `captcha_blocked` warning due to platform protection. + +--- + +## Working Components + +- API deployment on Railway +- browser-use integration +- Agent execution +- JSON response parsing +- Error handling with fallback responses + +## Known Limitation + +**Captcha Protection**: The CNPq/Lattes platform has anti-bot protection that blocks automated browser access. This is a platform-level restriction, not a tool issue. + +Potential workarounds: +1. Use official CNPq API (if available) +2. Manual data entry +3. 
Request institutional API access diff --git a/tools/cnpq_lattes_navigator/api/Dockerfile b/tools/cnpq_lattes_navigator/api/Dockerfile new file mode 100644 index 0000000..1fd0475 --- /dev/null +++ b/tools/cnpq_lattes_navigator/api/Dockerfile @@ -0,0 +1,28 @@ +FROM python:3.11-slim + +ENV PYTHONUNBUFFERED=1 \ + PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \ + BROWSER_USE_CLOUD=true + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \ + libdbus-1-3 libxkbcommon0 libatspi2.0-0 libxcomposite1 libxdamage1 \ + libxfixes3 libxrandr2 libgbm1 libasound2 libpango-1.0-0 libcairo2 \ + libx11-6 libx11-xcb1 libxcb1 libxext6 libxcursor1 libxi6 libxtst6 \ + fonts-liberation wget ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --upgrade pip && pip install -r requirements.txt + +RUN mkdir -p /ms-playwright && \ + playwright install chromium && \ + playwright install-deps chromium + +COPY . . + +EXPOSE 8000 + +CMD ["python", "main.py"] diff --git a/tools/cnpq_lattes_navigator/api/__init__.py b/tools/cnpq_lattes_navigator/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tools/cnpq_lattes_navigator/api/lattes_navigator.py b/tools/cnpq_lattes_navigator/api/lattes_navigator.py new file mode 100644 index 0000000..1995aab --- /dev/null +++ b/tools/cnpq_lattes_navigator/api/lattes_navigator.py @@ -0,0 +1,818 @@ +import os +import json +import asyncio +import re +import time +from datetime import datetime, timedelta +from typing import List, Dict, Any, Optional, Tuple +from collections import defaultdict +from pydantic import Field + +BROWSER_USE_AVAILABLE = False +BROWSER_IMPORT_ERROR = None + +try: + from browser_use import Agent, Browser, ChatOpenAI + BROWSER_USE_AVAILABLE = True +except Exception as e: + BROWSER_IMPORT_ERROR = str(e) + + +class Tools: + def __init__(self): + self.start_url = "https://buscatextual.cnpq.br/buscatextual/busca.do?metodo=apresentar" + self.current_year = datetime.now().year + self.browser_available = BROWSER_USE_AVAILABLE + self.rate_limit_delay = 2.0 + self.openai_api_key = os.getenv("OPENAI_API_KEY") + self.openai_model = os.getenv("OPENAI_MODEL", "gpt-4o-mini") + self.use_cloud_browser = os.getenv("BROWSER_USE_CLOUD", "true").lower() == "true" + + def analyze_researchers_coi( + self, + researchers_json: str = Field(..., description='JSON list: [{"name": "...", "lattes_id": "..."}]'), + time_window: int = Field(default=5, description="Years to analyze"), + coi_rules_config: str = Field( + default='{"R1": true, "R2": true, "R3": true, "R4": true, "R5": true, "R6": true, "R7": true}', + description='JSON to enable/disable COI rules' + ) + ) -> str: + try: + researchers = json.loads(researchers_json) + coi_config = json.loads(coi_rules_config) + + if not isinstance(researchers, list) or len(researchers) == 0: + return self._error_response("invalid_input", "researchers_json must be a non-empty list") + + cutoff_date = datetime.now() - timedelta(days=time_window * 365) + + results = { + 'status': 'success', + 'execution_metadata': { + 'execution_date': datetime.now().isoformat(), + 'time_window_years': time_window, + 'cutoff_date': cutoff_date.isoformat(), + 'num_researchers': len(researchers), + 'coi_rules_active': coi_config, + 'browser_use_available': self.browser_available + }, + 'researchers': [], + 'coi_matrix': {'pairs': []}, + 'summary_text': '' + } + + researcher_data = [] + for researcher in researchers: + name = 
researcher.get('name', '') + lattes_id = researcher.get('lattes_id', '') + + if not name or not lattes_id: + results['researchers'].append({ + 'person': {'name': name, 'lattes_id': lattes_id}, + 'warnings': ['Missing name or lattes_id'], + 'production_5y': {}, + 'coauthors_5y': [], + 'evidence_urls': [] + }) + continue + + profile_data = self._extract_researcher_profile(name, lattes_id, cutoff_date) + researcher_data.append(profile_data) + results['researchers'].append(profile_data) + + coi_pairs = self._analyze_coi_pairwise(researcher_data, coi_config, cutoff_date) + results['coi_matrix']['pairs'] = coi_pairs + results['summary_text'] = self._generate_summary(results) + + return json.dumps(results, ensure_ascii=False, indent=2) + + except json.JSONDecodeError as e: + return self._error_response('json_parse_error', f'Invalid JSON: {str(e)}') + except Exception as e: + return self._error_response('unexpected_error', str(e)) + + def _extract_researcher_profile(self, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Dict[str, Any]: + profile_url = f"http://lattes.cnpq.br/{lattes_id}" + warnings = [] + + if not self.browser_available: + warnings.append("browser-use not installed") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + if not self.openai_api_key: + warnings.append("OPENAI_API_KEY not set") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + time.sleep(self.rate_limit_delay) + + try: + extracted_data = self._run_browser_extraction(profile_url, name, lattes_id, cutoff_date, is_student) + + if extracted_data is None: + warnings.append("Extraction failed") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + # Check for error warnings in extracted data + data_warnings = extracted_data.get('warnings', []) + if any(w in data_warnings for w in ['profile_not_found', 'captcha_blocked', 'page_error']): + warnings.extend(data_warnings) + return self._mock_profile(name, lattes_id, profile_url, warnings, extracted_data.get('agent_logs', [])) + + production = self._process_production(extracted_data, cutoff_date) + coauthors = production.pop('coauthors_extracted', []) or extracted_data.get('coauthors', []) + + return { + 'person': { + 'name': name, + 'lattes_id': lattes_id, + 'profile_url': profile_url, + 'last_update': extracted_data.get('last_update') + }, + 'production_5y': production, + 'affiliations_5y': extracted_data.get('affiliations', []), + 'coauthors_5y': coauthors, + 'warnings': warnings + data_warnings, + 'evidence_urls': [profile_url], + 'agent_logs': extracted_data.get('agent_logs', []) + } + except Exception as e: + warnings.append(f"Error: {str(e)}") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + def _mock_profile(self, name: str, lattes_id: str, profile_url: str, warnings: List[str], agent_logs: List[Dict] = None) -> Dict[str, Any]: + return { + 'person': {'name': name, 'lattes_id': lattes_id, 'profile_url': profile_url, 'last_update': None}, + 'production_5y': { + 'publications': {'total': 0, 'by_type': {}, 'top_items': []}, + 'projects': {'total': 0, 'active': [], 'concluded': []}, + 'advising': {'total': 0, 'ongoing': [], 'concluded': []}, + 'activities': [] + }, + 'affiliations_5y': [], + 'coauthors_5y': [], + 'warnings': warnings, + 'evidence_urls': [profile_url], + 'agent_logs': agent_logs or [] + } + + def _run_browser_extraction(self, profile_url: str, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Optional[Dict[str, Any]]: + 
try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return loop.run_until_complete(self._async_extraction(profile_url, name, lattes_id, cutoff_date, is_student)) + finally: + loop.close() + except Exception as e: + return {'warnings': [str(e)], 'publications': [], 'projects': [], 'advising': [], 'affiliations': [], 'coauthors': [], 'last_update': None} + + async def _async_extraction(self, profile_url: str, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Dict[str, Any]: + cutoff_year = cutoff_date.year + current_year = datetime.now().year + + llm = ChatOpenAI(model=self.openai_model) + + # Build checkbox step only for students + checkbox_step = "" + if is_student: + checkbox_step = """ +3. CHECK the checkbox with CSS selector "#buscarDemais" - REQUIRED for student search +4. """ + else: + checkbox_step = """ +3. """ + + task = f""" +TASK: Find and extract Lattes CV data for "{name}" (Lattes ID: {lattes_id}). + +TARGET LATTES ID: {lattes_id} + +NAVIGATION: +1. Go to: https://buscatextual.cnpq.br/buscatextual/busca.do?metodo=apresentar +2. Type "{name}" in the search field +{checkbox_step}CLICK button "#botaoBuscaFiltros" +{"5" if is_student else "4"}. CLICK link containing "{name}" in results +{"6" if is_student else "5"}. CLICK button "#idbtnabrircurriculo" +{"7" if is_student else "6"}. VERIFY ID: Look at top of CV for "ID Lattes:" text followed by a number. + The ID must be exactly "{lattes_id}". + If the ID shown is DIFFERENT, go BACK and try the NEXT result in the list. + +CSS SELECTORS: +- Checkbox: #buscarDemais (use CHECK action) +- Search button: #botaoBuscaFiltros (use CLICK action) +- Open CV button: #idbtnabrircurriculo (use CLICK action) + +ID LATTES LOCATION (in CV page): +The ID appears at top of CV like: "ID Lattes: {lattes_id}" +HTML:
  • ID Lattes: {lattes_id}
  • + +EXTRACT (years {cutoff_year}-{current_year}): +- Institution, publications, projects, advising, coauthors + +OUTPUT JSON: +```json +{{ + "last_update": null, + "affiliations": [{{"institution": "...", "department": "..."}}], + "publications": [{{"title": "...", "year": 2024, "type": "journal", "coauthors": ["..."]}}], + "projects": [{{"title": "...", "start_year": 2022}}], + "advising": [{{"name": "...", "level": "PhD", "year": 2023}}], + "coauthors": [{{"name": "...", "count": 1}}], + "warnings": [] +}} +``` + +ERRORS (only if NO data found): +- {{"warnings": ["profile_not_found"], ...}} if ID {lattes_id} not found in any result +- {{"warnings": ["captcha_blocked"], ...}} if completely blocked +""" + + browser = None + if self.use_cloud_browser: + browser = Browser( + use_cloud=True, + cloud_proxy_country_code='br', + cloud_timeout=15, + wait_between_actions=3.0, + wait_for_network_idle_page_load_time=5.0, + minimum_wait_page_load_time=3.0, + ) + + agent = Agent( + task=task, + llm=llm, + browser=browser, + max_actions_per_step=1 + ) + + max_retries = 1 + last_error = None + + for attempt in range(max_retries + 1): + try: + history = await agent.run(max_steps=50) # Increased to allow iteration through search results + break + except Exception as retry_error: + last_error = retry_error + if attempt < max_retries: + await asyncio.sleep(5) + continue + else: + return { + 'warnings': [f'Failed after {max_retries + 1} attempts: {str(last_error)}'], + 'publications': [], 'projects': [], 'advising': [], + 'affiliations': [], 'coauthors': [], 'last_update': None, + 'agent_logs': [{'error': str(last_error)}] + } + + try: + agent_logs = [] + all_content = [] + + if hasattr(history, 'all_results'): + for i, r in enumerate(history.all_results): + step_log = {'step': i + 1} + if hasattr(r, 'extracted_content') and r.extracted_content: + all_content.append(str(r.extracted_content)) + step_log['content'] = str(r.extracted_content)[:200] + if hasattr(r, 'long_term_memory') and r.long_term_memory: + step_log['memory'] = str(r.long_term_memory)[:200] + if hasattr(r, 'error') and r.error: + step_log['error'] = str(r.error) + agent_logs.append(step_log) + + if hasattr(history, 'final_result') and history.final_result: + all_content.append(str(history.final_result)) + + full_text = '\n'.join(all_content) + + json_block = re.search(r'```json\s*([\s\S]*?)\s*```', full_text) + if json_block: + try: + result = json.loads(json_block.group(1)) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + json_match = re.search(r'\{[^{}]*"warnings"[^{}]*\}', full_text) + if json_match: + try: + result = json.loads(json_match.group()) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + json_match = re.search(r'\{[\s\S]*\}', full_text) + if json_match: + try: + result = json.loads(json_match.group()) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + return { + 'warnings': [f'No JSON in response'], + 'publications': [], 'projects': [], 'advising': [], + 'affiliations': [], 'coauthors': [], 'last_update': None, + 'agent_logs': agent_logs, + 'raw_content': full_text[:1000] + } + except Exception as e: + return {'warnings': [f'Error: {str(e)}'], 'publications': [], 'projects': [], 'advising': [], 'affiliations': [], 'coauthors': [], 'last_update': None, 'agent_logs': []} + + def _deduplicate_publications(self, pubs: List[Dict]) -> List[Dict]: + seen = set() + unique = [] + for pub in pubs: + doi = 
pub.get('doi') + if doi: + key = doi.lower() + else: + title = self._normalize_name(pub.get('title', '')) + year = pub.get('year', '') + key = f"{title}_{year}" + if key and key not in seen: + seen.add(key) + unique.append(pub) + return unique + + def _extract_coauthors(self, pubs: List[Dict]) -> List[Dict]: + coauthor_count = defaultdict(int) + for pub in pubs: + for coauthor in pub.get('coauthors', []): + if coauthor: + norm_name = self._normalize_name(coauthor) + coauthor_count[coauthor] += 1 + return [{'name': name, 'count': count} for name, count in sorted(coauthor_count.items(), key=lambda x: -x[1])[:20]] + + def _process_production(self, data: Dict[str, Any], cutoff_date: datetime) -> Dict[str, Any]: + pub_by_type = defaultdict(int) + filtered_pubs = [] + + for pub in data.get('publications', []): + year = self._parse_year(pub.get('year')) + if self._in_window(year, cutoff_date): + filtered_pubs.append(pub) + pub_by_type[pub.get('type', 'other')] += 1 + + filtered_pubs = self._deduplicate_publications(filtered_pubs) + coauthors = self._extract_coauthors(filtered_pubs) + + active_proj, concluded_proj = [], [] + for proj in data.get('projects', []): + if self._in_window(self._parse_year(proj.get('start_year')), cutoff_date): + (active_proj if proj.get('status') == 'active' else concluded_proj).append(proj) + + ongoing_adv, concluded_adv = [], [] + for adv in data.get('advising', []): + if self._in_window(self._parse_year(adv.get('year')), cutoff_date): + (ongoing_adv if adv.get('status') == 'ongoing' else concluded_adv).append(adv) + + activities = [] + for act in data.get('activities', []): + if self._in_window(self._parse_year(act.get('year')), cutoff_date): + activities.append(act) + + return { + 'publications': {'total': len(filtered_pubs), 'by_type': dict(pub_by_type), 'top_items': filtered_pubs[:10]}, + 'projects': {'total': len(active_proj) + len(concluded_proj), 'active': active_proj, 'concluded': concluded_proj}, + 'advising': {'total': len(ongoing_adv) + len(concluded_adv), 'ongoing': ongoing_adv, 'concluded': concluded_adv}, + 'activities': activities, + 'coauthors_extracted': coauthors + } + + def _normalize_name(self, name: str) -> str: + if not name: + return "" + normalized = re.sub(r'\s+', ' ', name.lower().strip()) + for a, p in [('á','a'),('à','a'),('â','a'),('ã','a'),('é','e'),('ê','e'),('í','i'),('ó','o'),('ô','o'),('õ','o'),('ú','u'),('ç','c')]: + normalized = normalized.replace(a, p) + return normalized + + def _names_match(self, n1: str, n2: str) -> Tuple[bool, str]: + norm1, norm2 = self._normalize_name(n1), self._normalize_name(n2) + if norm1 == norm2: + return True, 'high' + if norm1 in norm2 or norm2 in norm1: + return True, 'medium' + p1, p2 = norm1.split(), norm2.split() + if p1 and p2 and p1[-1] == p2[-1]: + return True, 'medium' + return False, 'low' + + def _parse_year(self, val: Any) -> Optional[int]: + if val is None: + return None + if isinstance(val, int): + return val if 1900 <= val <= 2100 else None + match = re.search(r'\b(19|20)\d{2}\b', str(val)) + return int(match.group(0)) if match else None + + def _in_window(self, year: Optional[int], cutoff: datetime) -> bool: + return year is not None and year >= cutoff.year + + def _check_r1(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + pubs_a = a.get('production_5y', {}).get('publications', {}).get('top_items', []) + pubs_b = b.get('production_5y', {}).get('publications', {}).get('top_items', []) + evidence = [] + + for pa in pubs_a: + ta = 
self._normalize_name(pa.get('title', '')) + for pb in pubs_b: + if ta and ta == self._normalize_name(pb.get('title', '')): + evidence.append(f"Shared: {pa.get('title')} ({pa.get('year')})") + + name_b = b.get('person', {}).get('name', '') + for co in a.get('coauthors_5y', []): + if self._names_match(co.get('name', ''), name_b)[0]: + evidence.append(f"Coauthor: {co.get('name')} ({co.get('count', 1)}x)") + + return (True, 'high', evidence) if evidence else (False, 'low', []) + + def _check_r2(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for src, tgt, src_name in [(a, b, a), (b, a, b)]: + adv = src.get('production_5y', {}).get('advising', {}) + name = tgt.get('person', {}).get('name', '') + for advisee in adv.get('ongoing', []) + adv.get('concluded', []): + match, conf = self._names_match(name, advisee.get('name', '')) + if match: + return True, conf, [f"{src_name.get('person', {}).get('name')} advised {advisee.get('name')}"] + return False, 'low', [] + + def _check_r3(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('affiliations_5y', []): + ia = self._normalize_name(aa.get('institution', '')) + da = self._normalize_name(aa.get('department', '')) + for ab in b.get('affiliations_5y', []): + ib = self._normalize_name(ab.get('institution', '')) + if ia and ia == ib: + if da and da == self._normalize_name(ab.get('department', '')): + return True, 'high', [f"Same dept: {aa.get('institution')} - {aa.get('department')}"] + return True, 'medium', [f"Same inst: {aa.get('institution')}"] + return False, 'low', [] + + def _check_r4(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + pa = a.get('production_5y', {}).get('projects', {}) + pb = b.get('production_5y', {}).get('projects', {}) + all_a = pa.get('active', []) + pa.get('concluded', []) + all_b = pb.get('active', []) + pb.get('concluded', []) + for p1 in all_a: + t1 = self._normalize_name(p1.get('title', '')) + for p2 in all_b: + if t1 and t1 == self._normalize_name(p2.get('title', '')): + return True, 'high', [f"Shared project: {p1.get('title')}"] + return False, 'low', [] + + def _check_r5(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('production_5y', {}).get('activities', []): + na = self._normalize_name(aa.get('name', '')) + for ab in b.get('production_5y', {}).get('activities', []): + if na and na == self._normalize_name(ab.get('name', '')): + return True, 'medium', [f"Shared activity: {aa.get('name')}"] + return False, 'low', [] + + def _check_r6(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + _, _, evidence = self._check_r1(a, b, cutoff) + return (True, 'high', evidence) if len(evidence) >= 3 else (False, 'low', []) + + def _check_r7(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('affiliations_5y', []): + la = self._normalize_name(aa.get('lab_group', '')) + for ab in b.get('affiliations_5y', []): + if la and la == self._normalize_name(ab.get('lab_group', '')): + return True, 'high', [f"Same lab: {aa.get('lab_group')}"] + return False, 'low', [] + + def _analyze_coi_pairwise(self, data: List[Dict], config: Dict[str, bool], cutoff: datetime) -> List[Dict]: + pairs = [] + checks = {'R1': self._check_r1, 'R2': self._check_r2, 'R3': self._check_r3, 'R4': self._check_r4, 'R5': self._check_r5, 'R6': self._check_r6, 'R7': self._check_r7} + rule_descriptions = { + 'R1': 'Co-authorship (shared publication)', + 'R2': 'Advisor-advisee 
relationship', + 'R3': 'Institutional overlap', + 'R4': 'Project overlap', + 'R5': 'Committee/event overlap', + 'R6': 'Frequent co-authorship (3+ publications)', + 'R7': 'Same lab/research group' + } + + for i in range(len(data)): + for j in range(i + 1, len(data)): + a, b = data[i], data[j] + rules_detail = [] + all_evidence = [] + levels = [] + + for rule, fn in checks.items(): + if config.get(rule, True): + triggered, conf, ev = fn(a, b, cutoff) + if triggered: + rules_detail.append({ + 'rule': rule, + 'description': rule_descriptions[rule], + 'confidence': conf, + 'evidence': ev + }) + all_evidence.extend(ev) + levels.append(conf) + + if rules_detail: + pairs.append({ + 'a_lattes_id': a.get('person', {}).get('lattes_id'), + 'b_lattes_id': b.get('person', {}).get('lattes_id'), + 'a_name': a.get('person', {}).get('name'), + 'b_name': b.get('person', {}).get('name'), + 'a_profile_url': a.get('person', {}).get('profile_url'), + 'b_profile_url': b.get('person', {}).get('profile_url'), + 'rules_triggered': [r['rule'] for r in rules_detail], + 'rules_detail': rules_detail, + 'confidence': 'high' if 'high' in levels else ('medium' if 'medium' in levels else 'low'), + 'evidence_summary': all_evidence + }) + return pairs + + def _generate_summary(self, results: Dict) -> str: + n = results['execution_metadata']['num_researchers'] + w = results['execution_metadata']['time_window_years'] + p = len(results['coi_matrix']['pairs']) + + if p == 0: + return f"Analyzed {n} researchers over {w} years. No COI detected." + + h = sum(1 for x in results['coi_matrix']['pairs'] if x['confidence'] == 'high') + m = sum(1 for x in results['coi_matrix']['pairs'] if x['confidence'] == 'medium') + l = p - h - m + return f"Analyzed {n} researchers over {w} years. {p} COI found ({h} high, {m} medium, {l} low)." + + def _error_response(self, error_type: str, message: str) -> str: + return json.dumps({'status': 'error', 'error_type': error_type, 'message': message, 'timestamp': datetime.now().isoformat()}, ensure_ascii=False, indent=2) + + def _collect_all_profiles( + self, + student: Dict[str, str], + advisor: Dict[str, str], + committee_members: List[Dict[str, Any]], + cutoff_date: datetime + ) -> Dict[str, Any]: + """ + Browser Tool: Collect all profiles from Lattes platform. + Returns dict with student_data, advisor_data, and members_data. + + Note: The checkbox "Demais pesquisadores" is only needed for student search. + If student extraction fails, the entire collection is aborted. + """ + collection_log = [] + total = 2 + len([m for m in committee_members if m.get('lattes_id') != advisor.get('lattes_id') and m.get('role') != 'advisor']) + current = 0 + + # Extract student FIRST (requires checkbox "Demais pesquisadores") + current += 1 + collection_log.append(f"Extracting {current}/{total}: {student.get('name', 'Unknown')} (student - requires checkbox)") + student_data = self._extract_researcher_profile( + student.get('name', ''), + student.get('lattes_id', ''), + cutoff_date, + is_student=True # This enables checkbox verification in the prompt + ) + + # Check if student extraction failed - if so, abort the entire collection + student_warnings = student_data.get('warnings', []) + student_failed = any(w in student_warnings for w in ['profile_not_found', 'captcha_blocked', 'page_error', 'Extraction failed']) + + if student_failed: + collection_log.append(f"ABORTED: Student extraction failed. 
Warnings: {student_warnings}") + return { + 'student_data': student_data, + 'advisor_data': None, + 'members_data': [], + 'collection_log': collection_log, + 'aborted': True, + 'abort_reason': f"Student extraction failed: {student_warnings}" + } + + # Extract advisor (no checkbox needed - established researchers appear in default search) + current += 1 + collection_log.append(f"Extracting {current}/{total}: {advisor.get('name', 'Unknown')} (advisor)") + advisor_data = self._extract_researcher_profile( + advisor.get('name', ''), + advisor.get('lattes_id', ''), + cutoff_date, + is_student=False + ) + + # Extract committee members (excluding advisor) - no checkbox needed + members_data = [] + for member in committee_members: + member_role = member.get('role', 'unknown') + if member_role == 'advisor' or member.get('lattes_id') == advisor.get('lattes_id'): + continue + + current += 1 + collection_log.append(f"Extracting {current}/{total}: {member.get('name', 'Unknown')} ({member_role})") + member_data = self._extract_researcher_profile( + member.get('name', ''), + member.get('lattes_id', ''), + cutoff_date, + is_student=False + ) + members_data.append({ + 'member_info': member, + 'profile_data': member_data + }) + + return { + 'student_data': student_data, + 'advisor_data': advisor_data, + 'members_data': members_data, + 'collection_log': collection_log, + 'aborted': False + } + + def _judge_committee( + self, + student_data: Dict[str, Any], + members_data: List[Dict[str, Any]], + coi_config: Dict[str, bool], + cutoff_date: datetime + ) -> Dict[str, Any]: + """ + Judge Tool: Analyze COI between student and each committee member. + No browser operations - pure data analysis. + """ + members_analysis = [] + conflicts = [] + + for member_entry in members_data: + member_info = member_entry['member_info'] + member_profile = member_entry['profile_data'] + member_role = member_info.get('role', 'unknown') + + # Analyze COI between student and this member + coi_result = self._analyze_coi_pair(student_data, member_profile, coi_config, cutoff_date) + + member_analysis = { + 'member': { + 'name': member_info.get('name'), + 'lattes_id': member_info.get('lattes_id'), + 'role': member_role, + 'institution': member_info.get('institution'), + 'profile_url': member_profile.get('person', {}).get('profile_url') + }, + 'extraction_warnings': member_profile.get('warnings', []), + 'coi_detected': coi_result['has_coi'], + 'coi_details': coi_result['details'] + } + + members_analysis.append(member_analysis) + + if coi_result['has_coi']: + conflicts.append({ + 'student_name': student_data.get('person', {}).get('name'), + 'member_name': member_info.get('name'), + 'member_role': member_role, + 'rules_triggered': coi_result['rules_triggered'], + 'confidence': coi_result['confidence'], + 'evidence': coi_result['evidence'] + }) + + return { + 'members_analysis': members_analysis, + 'conflicts': conflicts, + 'has_conflicts': len(conflicts) > 0 + } + + def validate_committee( + self, + student: Dict[str, str], + advisor: Dict[str, str], + committee_members: List[Dict[str, Any]], + time_window: int = 5, + coi_rules_config: Dict[str, bool] = None + ) -> str: + """ + Validate academic committee for conflicts of interest. + + Architecture: + 1. _collect_all_profiles() - Browser Tool: extracts all Lattes profiles + 2. _judge_committee() - Judge Tool: analyzes COI (no browser) + + Analyzes COI only between student and non-advisor committee members. + Advisor-student COI is expected and excluded from analysis. 
+ Member-member COI is not relevant for committee validation. + """ + try: + coi_config = coi_rules_config or {"R1": True, "R2": True, "R3": True, "R4": True, "R5": True, "R6": True, "R7": True} + cutoff_date = datetime.now() - timedelta(days=time_window * 365) + + results = { + 'status': 'valid', + 'execution_metadata': { + 'execution_date': datetime.now().isoformat(), + 'time_window_years': time_window, + 'cutoff_date': cutoff_date.isoformat(), + 'coi_rules_active': coi_config, + 'browser_use_available': self.browser_available + }, + 'student': None, + 'advisor': None, + 'members_analysis': [], + 'conflicts': [], + 'collection_log': [], + 'summary': '' + } + + # STEP 1: Browser Tool - Collect all profiles + collected = self._collect_all_profiles(student, advisor, committee_members, cutoff_date) + + results['student'] = collected['student_data'] + results['advisor'] = collected.get('advisor_data') + results['collection_log'] = collected['collection_log'] + + # Check if collection was aborted (student extraction failed) + if collected.get('aborted'): + results['status'] = 'error' + results['summary'] = f"Collection aborted: {collected.get('abort_reason', 'Student extraction failed')}" + return json.dumps(results, ensure_ascii=False, indent=2) + + # STEP 2: Judge Tool - Analyze COI (no browser operations) + judgment = self._judge_committee( + collected['student_data'], + collected['members_data'], + coi_config, + cutoff_date + ) + + results['members_analysis'] = judgment['members_analysis'] + results['conflicts'] = judgment['conflicts'] + + if judgment['has_conflicts']: + results['status'] = 'invalid' + + # Generate summary + num_members = len(results['members_analysis']) + num_conflicts = len(results['conflicts']) + + if num_conflicts == 0: + results['summary'] = f"Committee valid. Analyzed {num_members} members against student. No conflicts detected." + else: + conflict_names = [c['member_name'] for c in results['conflicts']] + results['summary'] = f"Committee INVALID. {num_conflicts} conflict(s) detected with: {', '.join(conflict_names)}." 
+ + return json.dumps(results, ensure_ascii=False, indent=2) + + except Exception as e: + return self._error_response('unexpected_error', str(e)) + + def _analyze_coi_pair(self, a: Dict, b: Dict, config: Dict[str, bool], cutoff: datetime) -> Dict[str, Any]: + """Analyze COI between two researchers (student vs member).""" + checks = { + 'R1': self._check_r1, + 'R2': self._check_r2, + 'R3': self._check_r3, + 'R4': self._check_r4, + 'R5': self._check_r5, + 'R6': self._check_r6, + 'R7': self._check_r7 + } + rule_descriptions = { + 'R1': 'Co-authorship (shared publication)', + 'R2': 'Advisor-advisee relationship', + 'R3': 'Institutional overlap', + 'R4': 'Project overlap', + 'R5': 'Committee/event overlap', + 'R6': 'Frequent co-authorship (3+ publications)', + 'R7': 'Same lab/research group' + } + + details = [] + all_evidence = [] + rules_triggered = [] + levels = [] + + for rule, fn in checks.items(): + if config.get(rule, True): + triggered, conf, ev = fn(a, b, cutoff) + if triggered: + rules_triggered.append(rule) + details.append({ + 'rule': rule, + 'description': rule_descriptions[rule], + 'confidence': conf, + 'evidence': ev + }) + all_evidence.extend(ev) + levels.append(conf) + + has_coi = len(rules_triggered) > 0 + confidence = 'high' if 'high' in levels else ('medium' if 'medium' in levels else 'low') + + return { + 'has_coi': has_coi, + 'rules_triggered': rules_triggered, + 'confidence': confidence if has_coi else None, + 'evidence': all_evidence, + 'details': details + } \ No newline at end of file diff --git a/tools/cnpq_lattes_navigator/api/main.py b/tools/cnpq_lattes_navigator/api/main.py new file mode 100644 index 0000000..0a3418a --- /dev/null +++ b/tools/cnpq_lattes_navigator/api/main.py @@ -0,0 +1,137 @@ +import os +import sys +from typing import List, Optional +from fastapi import FastAPI +from pydantic import BaseModel +import json + +app = FastAPI(title="CNPq/Lattes Navigator API", version="1.0.0") + +# Capture import error for diagnostics +browser_import_error = None +try: + from lattes_navigator import Tools, BROWSER_USE_AVAILABLE, BROWSER_IMPORT_ERROR + tool = Tools() + browser_import_error = BROWSER_IMPORT_ERROR +except Exception as e: + browser_import_error = str(e) + BROWSER_USE_AVAILABLE = False + tool = None + + +class Researcher(BaseModel): + name: str + lattes_id: str + + +class AnalysisRequest(BaseModel): + researchers: List[Researcher] + time_window: int = 5 + coi_rules: Optional[dict] = None + + +class CommitteeMember(BaseModel): + name: str + lattes_id: str + email: Optional[str] = None + institution: Optional[str] = None + role: str # "advisor", "internal", "external", "substitute" + is_president: bool = False + + +class CommitteeValidationRequest(BaseModel): + student: Researcher + advisor: Researcher + committee_members: List[CommitteeMember] + thesis_title: Optional[str] = None + committee_type: str = "qualification" # "qualification", "defense" + time_window: int = 5 + coi_rules: Optional[dict] = None + + +class HealthResponse(BaseModel): + status: str + browser_available: bool + api_key_set: bool + import_error: Optional[str] = None + python_version: str + + +@app.get("/health", response_model=HealthResponse) +def health(): + return HealthResponse( + status="ok" if tool else "error", + browser_available=BROWSER_USE_AVAILABLE if tool else False, + api_key_set=bool(os.getenv("OPENAI_API_KEY")), + import_error=browser_import_error, + python_version=sys.version + ) + + +@app.get("/debug") +def debug(): + errors = [] + + try: + from browser_use import Agent 
+ errors.append({"browser_use.Agent": "OK"}) + except Exception as e: + errors.append({"browser_use.Agent": str(e)}) + + try: + from browser_use import ChatOpenAI + errors.append({"browser_use.ChatOpenAI": "OK"}) + except Exception as e: + errors.append({"browser_use.ChatOpenAI": str(e)}) + + try: + import playwright + errors.append({"playwright": "OK", "version": playwright.__version__}) + except Exception as e: + errors.append({"playwright": str(e)}) + + return {"imports": errors, "python": sys.version} + + +@app.post("/analyze") +def analyze(request: AnalysisRequest): + if not tool: + return {"status": "error", "message": "Tool not initialized", "import_error": browser_import_error} + + researchers_json = json.dumps([r.model_dump() for r in request.researchers]) + coi_config = json.dumps(request.coi_rules or {"R1": True, "R2": True, "R3": True, "R4": True, "R5": True, "R6": True, "R7": True}) + + result = tool.analyze_researchers_coi( + researchers_json=researchers_json, + time_window=request.time_window, + coi_rules_config=coi_config + ) + + return json.loads(result) + + +@app.post("/validate-committee") +def validate_committee(request: CommitteeValidationRequest): + if not tool: + return {"status": "error", "message": "Tool not initialized", "import_error": browser_import_error} + + student_data = request.student.model_dump() + advisor_data = request.advisor.model_dump() + members_data = [m.model_dump() for m in request.committee_members] + coi_config = request.coi_rules or {"R1": True, "R2": True, "R3": True, "R4": True, "R5": True, "R6": True, "R7": True} + + result = tool.validate_committee( + student=student_data, + advisor=advisor_data, + committee_members=members_data, + time_window=request.time_window, + coi_rules_config=coi_config + ) + + return json.loads(result) + + +if __name__ == "__main__": + import uvicorn + port = int(os.getenv("PORT", 8000)) + uvicorn.run(app, host="0.0.0.0", port=port) diff --git a/tools/cnpq_lattes_navigator/api/requirements.txt b/tools/cnpq_lattes_navigator/api/requirements.txt new file mode 100644 index 0000000..caa9f88 --- /dev/null +++ b/tools/cnpq_lattes_navigator/api/requirements.txt @@ -0,0 +1,7 @@ +fastapi>=0.100.0 +uvicorn>=0.23.0 +pydantic>=2.0.0 +python-dateutil>=2.8.0 +browser-use +playwright + diff --git a/tools/cnpq_lattes_navigator/demo/README.md b/tools/cnpq_lattes_navigator/demo/README.md new file mode 100644 index 0000000..f7e5cd0 --- /dev/null +++ b/tools/cnpq_lattes_navigator/demo/README.md @@ -0,0 +1,46 @@ +# Demo - Local Browser Testing + +Test browser-use navigation locally with visible browser. + +## Setup + +```bash +pip install -r requirements.txt +playwright install chromium +export OPENAI_API_KEY="sk-..." +``` + +## Tests + +### 1. Navigation Test (Debug) + +Isolates navigation issues with minimal tasks: + +```bash +python test_navigation.py +``` + +Options: +- Test 1: Direct URL to profile +- Test 2: Search portal with ID parameter +- Test 3: Search form interaction + +### 2. Full Extraction Test + +Complete extraction task matching API behavior: + +```bash +python test_browser.py +python test_browser.py --lattes-id 4003190744770195 --name "Ricardo Marcacini" +python test_browser.py --headless # Run without visible browser +``` + +## Observed Issues + +From Railway logs: +- Captcha challenges on Lattes pages +- CDP timeout errors +- Agent falling back to DuckDuckGo search + +Use these tests to validate navigation paths before deploying. 
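All three navigation tests follow the same pattern. A condensed version of the direct-URL check from `test_navigation.py`, shown here for reference (same imports as the demo scripts; the exact `Browser`/`BrowserConfig` options depend on the installed browser-use version):

```python
# Condensed direct-URL navigation check (see test_navigation.py for the full version).
# Assumes OPENAI_API_KEY is set and `playwright install chromium` has been run.
import asyncio
from browser_use import Agent, Browser, BrowserConfig, ChatOpenAI

async def main():
    browser = Browser(config=BrowserConfig(headless=False))  # visible browser to watch navigation
    llm = ChatOpenAI(model="gpt-4o-mini")
    task = (
        "Go directly to http://lattes.cnpq.br/4003190744770195. "
        "Wait for the page to load and report whether a CV or a captcha is shown. "
        'Return: {"success": true/false, "page_title": "...", "error": null}'
    )
    agent = Agent(task=task, llm=llm, browser=browser)
    try:
        print(await agent.run(max_steps=10))
    finally:
        await browser.close()

asyncio.run(main())
```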
+ diff --git a/tools/cnpq_lattes_navigator/demo/requirements.txt b/tools/cnpq_lattes_navigator/demo/requirements.txt new file mode 100644 index 0000000..0b9c21f --- /dev/null +++ b/tools/cnpq_lattes_navigator/demo/requirements.txt @@ -0,0 +1,3 @@ +browser-use +playwright + diff --git a/tools/cnpq_lattes_navigator/demo/test_browser.py b/tools/cnpq_lattes_navigator/demo/test_browser.py new file mode 100644 index 0000000..687e88b --- /dev/null +++ b/tools/cnpq_lattes_navigator/demo/test_browser.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +""" +Local browser test for CNPq/Lattes Navigator. +Runs with visible browser to observe AI agent navigation. + +Usage: + export OPENAI_API_KEY="sk-..." + python test_browser.py + python test_browser.py --lattes-id 4003190744770195 +""" +import os +import sys +import asyncio +import argparse + +def check_deps(): + if not os.getenv("OPENAI_API_KEY"): + print("Error: OPENAI_API_KEY not set") + sys.exit(1) + try: + from browser_use import Agent, Browser, BrowserConfig, ChatOpenAI + return Agent, Browser, BrowserConfig, ChatOpenAI + except ImportError as e: + print(f"Error: {e}") + print("Install: pip install browser-use playwright && playwright install chromium") + sys.exit(1) + + +async def run_test(lattes_id: str, name: str, headless: bool = False): + Agent, Browser, BrowserConfig, ChatOpenAI = check_deps() + + print(f"\nTesting Lattes ID: {lattes_id}") + print(f"Researcher: {name}") + print(f"Headless: {headless}") + print("-" * 50) + + browser = Browser(config=BrowserConfig(headless=headless)) + llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL", "gpt-4o-mini")) + + profile_url = f"http://lattes.cnpq.br/{lattes_id}" + + task = f""" +TASK: Extract academic data from Brazilian Lattes CV. + +DO NOT use search engines. Navigate DIRECTLY to these URLs: + +STEP 1: Go to https://buscatextual.cnpq.br/buscatextual/visualizacv.do?id={lattes_id} +STEP 2: If that fails, try: {profile_url} +STEP 3: Wait for researcher name "{name}" to appear on page +STEP 4: Scroll down and look for sections (in Portuguese): + - "Artigos completos publicados" = journal articles + - "Projetos de pesquisa" = projects + - "Orientacoes" = supervisions +STEP 5: Extract data from years 2020-2025 only + +STEP 6: Return ONLY this JSON (no other text): +```json +{{ + "last_update": null, + "affiliations": [], + "publications": [{{"title": "...", "year": 2024, "type": "journal"}}], + "projects": [{{"title": "...", "start_year": 2022}}], + "advising": [{{"name": "...", "level": "PhD", "year": 2023}}], + "coauthors": [], + "warnings": [] +}} +``` + +If page blocked or captcha, return: {{"warnings": ["captcha_blocked"], "publications": [], "projects": [], "advising": [], "affiliations": [], "coauthors": [], "last_update": null}} +""" + + agent = Agent(task=task, llm=llm, browser=browser) + + print("\nStarting browser agent...") + print("Watch the browser window to see navigation.\n") + + try: + result = await agent.run(max_steps=25) + print("\n" + "=" * 50) + print("RESULT:") + print("=" * 50) + print(result) + except Exception as e: + print(f"\nError: {e}") + finally: + if not headless: + print("\nKeeping browser open for 10s...") + await asyncio.sleep(10) + await browser.close() + + +def main(): + parser = argparse.ArgumentParser(description="Test browser-use with Lattes") + parser.add_argument("--lattes-id", default="4003190744770195", help="Lattes ID to test") + parser.add_argument("--name", default="Ricardo Marcacini", help="Researcher name") + parser.add_argument("--headless", 
action="store_true", help="Run headless (no visible browser)") + args = parser.parse_args() + + asyncio.run(run_test(args.lattes_id, args.name, args.headless)) + + +if __name__ == "__main__": + main() + diff --git a/tools/cnpq_lattes_navigator/demo/test_navigation.py b/tools/cnpq_lattes_navigator/demo/test_navigation.py new file mode 100644 index 0000000..d9699bb --- /dev/null +++ b/tools/cnpq_lattes_navigator/demo/test_navigation.py @@ -0,0 +1,143 @@ +#!/usr/bin/env python3 +""" +Simple navigation test - just checks if Lattes page loads. +No extraction, minimal task to debug navigation issues. + +Usage: + export OPENAI_API_KEY="sk-..." + python test_navigation.py +""" +import os +import sys +import asyncio + +def check_deps(): + if not os.getenv("OPENAI_API_KEY"): + print("Error: OPENAI_API_KEY not set") + sys.exit(1) + try: + from browser_use import Agent, Browser, BrowserConfig, ChatOpenAI + return Agent, Browser, BrowserConfig, ChatOpenAI + except ImportError as e: + print(f"Error: {e}") + sys.exit(1) + + +async def test_direct_url(): + """Test 1: Direct URL navigation""" + Agent, Browser, BrowserConfig, ChatOpenAI = check_deps() + + print("\n" + "=" * 50) + print("TEST 1: Direct URL Navigation") + print("=" * 50) + + browser = Browser(config=BrowserConfig(headless=False)) + llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL", "gpt-4o-mini")) + + task = """ +Go directly to http://lattes.cnpq.br/4003190744770195 +Wait for the page to load. +Tell me what you see on the page. +Return: {"success": true/false, "page_title": "...", "error": null} +""" + + agent = Agent(task=task, llm=llm, browser=browser) + + try: + result = await agent.run(max_steps=10) + print(f"Result: {result}") + finally: + await asyncio.sleep(5) + await browser.close() + + +async def test_search_portal(): + """Test 2: Search portal navigation""" + Agent, Browser, BrowserConfig, ChatOpenAI = check_deps() + + print("\n" + "=" * 50) + print("TEST 2: Search Portal Navigation") + print("=" * 50) + + browser = Browser(config=BrowserConfig(headless=False)) + llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL", "gpt-4o-mini")) + + task = """ +Go directly to https://buscatextual.cnpq.br/buscatextual/visualizacv.do?id=4003190744770195 +Wait for the page to load. +Tell me what you see on the page. +Return: {"success": true/false, "page_title": "...", "error": null} +""" + + agent = Agent(task=task, llm=llm, browser=browser) + + try: + result = await agent.run(max_steps=10) + print(f"Result: {result}") + finally: + await asyncio.sleep(5) + await browser.close() + + +async def test_search_form(): + """Test 3: Use search form""" + Agent, Browser, BrowserConfig, ChatOpenAI = check_deps() + + print("\n" + "=" * 50) + print("TEST 3: Search Form") + print("=" * 50) + + browser = Browser(config=BrowserConfig(headless=False)) + llm = ChatOpenAI(model=os.getenv("OPENAI_MODEL", "gpt-4o-mini")) + + task = """ +1. Go to https://buscatextual.cnpq.br/buscatextual/busca.do?metodo=apresentar +2. Wait for search form to load +3. Find input field for "Nome" (name) +4. Type "Ricardo Marcacini" +5. Click search button +6. Wait for results +7. Click first result +8. 
Tell me what you see +Return: {"success": true/false, "found_profile": true/false, "error": null} +""" + + agent = Agent(task=task, llm=llm, browser=browser) + + try: + result = await agent.run(max_steps=15) + print(f"Result: {result}") + finally: + await asyncio.sleep(5) + await browser.close() + + +async def main(): + print("Lattes Navigation Tests") + print("Watch the browser window to see what happens.\n") + + tests = [ + ("Direct URL", test_direct_url), + ("Search Portal", test_search_portal), + ("Search Form", test_search_form), + ] + + print("Available tests:") + for i, (name, _) in enumerate(tests, 1): + print(f" {i}. {name}") + + choice = input("\nRun test (1-3, or 'all'): ").strip() + + if choice == "all": + for name, test in tests: + await test() + print("\n") + elif choice in ["1", "2", "3"]: + await tests[int(choice) - 1][1]() + else: + print("Invalid choice") + + +if __name__ == "__main__": + asyncio.run(main()) + diff --git a/tools/cnpq_lattes_navigator/examples/README.md b/tools/cnpq_lattes_navigator/examples/README.md new file mode 100644 index 0000000..dd76d70 --- /dev/null +++ b/tools/cnpq_lattes_navigator/examples/README.md @@ -0,0 +1,64 @@ +# CNPq/Lattes Navigator - Examples + +This directory contains example input and output files for the CNPq/Lattes Navigator tool. + +## Files + +### input_example.json + +Example input showing how to structure the researchers list and configuration parameters. + +**Key fields:** +- `researchers`: Array of objects with `name` and `lattes_id` +- `time_window`: Number of years to analyze (default: 5) +- `coi_rules_config`: Configuration object to enable/disable specific COI rules + +### output_example.json + +Example output showing the complete structure of the tool's response when COI is detected. + +**Key sections:** +- `execution_metadata`: Information about the analysis run +- `researchers`: Per-researcher profile data with production summaries +- `coi_matrix`: Pairwise conflict of interest detections with evidence +- `summary_text`: Human-readable summary + +## Usage in Open WebUI + +When using the tool in Open WebUI, you would provide the researchers data as a JSON string: + +``` +Can you analyze these researchers for conflicts of interest: +[ + {"name": "Ana Silva Santos", "lattes_id": "1234567890123456"}, + {"name": "Carlos Oliveira Lima", "lattes_id": "2345678901234567"} +] +``` + +The agent will automatically invoke the tool and return structured results. + +## Important Notes + +1. **Anonymized Data**: The examples use anonymized/fictional data to protect privacy. +2. **Mock Data Warning**: Without browser-use properly configured, the tool will return mock data with warnings. +3. **Evidence**: All COI detections include evidence URLs and specific details. +4. **Confidence Levels**: Each COI detection includes a confidence level (high/medium/low). + +## COI Rules Summary + +- **R1**: Co-authorship (≥1 shared publication) +- **R2**: Advisor-advisee relationship +- **R3**: Institutional overlap (same department/program) +- **R4**: Project team overlap +- **R5**: Committee/board/event overlap +- **R6**: Frequent co-authorship (≥3 publications) +- **R7**: Strong institutional proximity (same lab/group) + +## Testing + +To test the tool with the example input: + +1. Import the tool into Open WebUI +2. Use the researchers from `input_example.json` in your query +3. 
Compare the output structure with `output_example.json` + diff --git a/tools/cnpq_lattes_navigator/examples/input_example.json b/tools/cnpq_lattes_navigator/examples/input_example.json new file mode 100644 index 0000000..dae40b4 --- /dev/null +++ b/tools/cnpq_lattes_navigator/examples/input_example.json @@ -0,0 +1,28 @@ +{ + "description": "Example input for CNPq/Lattes Navigator tool", + "researchers": [ + { + "name": "Ana Silva Santos", + "lattes_id": "1234567890123456" + }, + { + "name": "Carlos Oliveira Lima", + "lattes_id": "2345678901234567" + }, + { + "name": "Beatriz Costa Ferreira", + "lattes_id": "3456789012345678" + } + ], + "time_window": 5, + "coi_rules_config": { + "R1": true, + "R2": true, + "R3": true, + "R4": true, + "R5": true, + "R6": true, + "R7": true + } +} + diff --git a/tools/cnpq_lattes_navigator/examples/invalid_committee.json b/tools/cnpq_lattes_navigator/examples/invalid_committee.json new file mode 100644 index 0000000..3740a32 --- /dev/null +++ b/tools/cnpq_lattes_navigator/examples/invalid_committee.json @@ -0,0 +1,63 @@ +{ + "student": { + "name": "Matheus Yasuo Ribeiro Utino", + "lattes_id": "6191612710855387" + }, + "advisor": { + "name": "Ricardo Marcondes Marcacini", + "lattes_id": "3272611282260295" + }, + "committee_members": [ + { + "name": "Ricardo Marcondes Marcacini", + "lattes_id": "3272611282260295", + "email": "ricardo.marcacini@icmc.usp.br", + "institution": "ICMC-USP", + "role": "advisor", + "is_president": true + }, + { + "name": "Solange Oliveira Rezende", + "lattes_id": "8526960535874806", + "email": "solange@icmc.usp.br", + "institution": "ICMC-USP", + "role": "internal", + "is_president": false + }, + { + "name": "Paulo Roberto Mann Marques Júnior", + "lattes_id": "3571577377652346", + "email": "paulomann@ufrj.br", + "institution": "UFRJ", + "role": "external", + "is_president": false + }, + { + "name": "Ricardo Cerri", + "lattes_id": "6266519868438512", + "email": "cerri@icmc.usp.br", + "institution": "ICMC-USP", + "role": "substitute", + "is_president": false + }, + { + "name": "Renato Tinós", + "lattes_id": "1273134370963830", + "email": "rtinos@ffclrp.usp.br", + "institution": "FFCLRP", + "role": "substitute", + "is_president": false + }, + { + "name": "Jônata Tyska Carvalho", + "lattes_id": "9494364044256921", + "email": "jonata.tyska@ufsc.br", + "institution": "UFSC", + "role": "substitute", + "is_president": false + } + ], + "thesis_title": "Unstructured Text Mining in the Era of Large Language Models", + "committee_type": "qualification", + "time_window": 5 +} \ No newline at end of file diff --git a/tools/cnpq_lattes_navigator/examples/output_example.json b/tools/cnpq_lattes_navigator/examples/output_example.json new file mode 100644 index 0000000..1d79975 --- /dev/null +++ b/tools/cnpq_lattes_navigator/examples/output_example.json @@ -0,0 +1,408 @@ +{ + "status": "success", + "execution_metadata": { + "execution_date": "2024-11-13T10:30:00.000Z", + "time_window_years": 5, + "cutoff_date": "2019-11-13T10:30:00.000Z", + "num_researchers": 3, + "coi_rules_active": { + "R1": true, + "R2": true, + "R3": true, + "R4": true, + "R5": true, + "R6": true, + "R7": true + } + }, + "researchers": [ + { + "person": { + "name": "Ana Silva Santos", + "lattes_id": "1234567890123456", + "profile_url": "http://lattes.cnpq.br/1234567890123456", + "last_update": "2024-10-15" + }, + "production_5y": { + "publications": { + "total": 12, + "by_type": { + "journal": 8, + "conference": 3, + "chapter": 1 + }, + "top_items": [ + { + "title": "Machine 
Learning Applications in Public Health Systems", + "year": 2023, + "venue": "Journal of Health Informatics", + "type": "journal", + "authors": ["Ana Silva Santos", "Carlos Oliveira Lima", "Maria Souza"] + }, + { + "title": "Data-Driven Decision Making in Government Services", + "year": 2022, + "venue": "International Conference on E-Government", + "type": "conference", + "authors": ["Ana Silva Santos", "João Pedro", "Beatriz Costa Ferreira"] + }, + { + "title": "Privacy Preserving Techniques for Public Data", + "year": 2021, + "venue": "Information Security Journal", + "type": "journal", + "authors": ["Ana Silva Santos", "Ricardo Alves"] + } + ] + }, + "projects": { + "total": 3, + "active": [ + { + "title": "AI for Public Services Modernization", + "role": "Coordinator", + "sponsor": "CNPq", + "start_year": 2022, + "end_year": null + } + ], + "concluded": [ + { + "title": "Digital Transformation in Healthcare", + "role": "Researcher", + "sponsor": "FAPESP", + "start_year": 2019, + "end_year": 2022 + }, + { + "title": "Open Data Platform Development", + "role": "Co-coordinator", + "sponsor": "Ministry of Planning", + "start_year": 2020, + "end_year": 2021 + } + ] + }, + "advising": { + "total": 5, + "ongoing": [ + { + "name": "Pedro Martins", + "level": "PhD", + "start_year": 2022 + }, + { + "name": "Julia Rodrigues", + "level": "MS", + "start_year": 2023 + } + ], + "concluded": [ + { + "name": "Lucas Fernandes", + "level": "MS", + "year": 2021 + }, + { + "name": "Camila Rocha", + "level": "MS", + "year": 2020 + }, + { + "name": "Rafael Dias", + "level": "IC", + "year": 2022 + } + ] + }, + "activities": [ + { + "name": "Brazilian Conference on Artificial Intelligence - Program Committee", + "role": "PC Member", + "year": 2023 + }, + { + "name": "National E-Government Workshop - Organization", + "role": "Organizing Committee", + "year": 2022 + } + ] + }, + "affiliations_5y": [ + { + "institution": "Universidade de São Paulo", + "department": "Instituto de Ciências Matemáticas e de Computação", + "lab_group": "Laboratório de Inteligência Computacional", + "start_year": 2018, + "end_year": null + } + ], + "coauthors_5y": [ + { + "name": "Carlos Oliveira Lima", + "count": 4 + }, + { + "name": "Beatriz Costa Ferreira", + "count": 2 + }, + { + "name": "Maria Souza", + "count": 3 + }, + { + "name": "João Pedro", + "count": 2 + }, + { + "name": "Ricardo Alves", + "count": 1 + } + ], + "warnings": [ + "browser-use library not installed - using mock data. 
Install with: pip install browser-use" + ], + "evidence_urls": [ + "http://lattes.cnpq.br/1234567890123456" + ] + }, + { + "person": { + "name": "Carlos Oliveira Lima", + "lattes_id": "2345678901234567", + "profile_url": "http://lattes.cnpq.br/2345678901234567", + "last_update": "2024-09-20" + }, + "production_5y": { + "publications": { + "total": 15, + "by_type": { + "journal": 10, + "conference": 4, + "book": 1 + }, + "top_items": [ + { + "title": "Machine Learning Applications in Public Health Systems", + "year": 2023, + "venue": "Journal of Health Informatics", + "type": "journal", + "authors": ["Ana Silva Santos", "Carlos Oliveira Lima", "Maria Souza"] + }, + { + "title": "Deep Learning for Medical Image Analysis", + "year": 2023, + "venue": "Medical Imaging Conference", + "type": "conference", + "authors": ["Carlos Oliveira Lima", "Patricia Mendes"] + } + ] + }, + "projects": { + "total": 2, + "active": [ + { + "title": "AI for Public Services Modernization", + "role": "Researcher", + "sponsor": "CNPq", + "start_year": 2022, + "end_year": null + } + ], + "concluded": [ + { + "title": "Digital Transformation in Healthcare", + "role": "Researcher", + "sponsor": "FAPESP", + "start_year": 2019, + "end_year": 2022 + } + ] + }, + "advising": { + "total": 3, + "ongoing": [ + { + "name": "Marcos Silva", + "level": "PhD", + "start_year": 2021 + } + ], + "concluded": [ + { + "name": "Fernanda Costa", + "level": "MS", + "year": 2020 + }, + { + "name": "Gabriel Santos", + "level": "MS", + "year": 2022 + } + ] + }, + "activities": [] + }, + "affiliations_5y": [ + { + "institution": "Universidade de São Paulo", + "department": "Instituto de Ciências Matemáticas e de Computação", + "lab_group": "Laboratório de Inteligência Computacional", + "start_year": 2017, + "end_year": null + } + ], + "coauthors_5y": [ + { + "name": "Ana Silva Santos", + "count": 4 + }, + { + "name": "Maria Souza", + "count": 3 + }, + { + "name": "Patricia Mendes", + "count": 2 + } + ], + "warnings": [ + "browser-use library not installed - using mock data. 
Install with: pip install browser-use" + ], + "evidence_urls": [ + "http://lattes.cnpq.br/2345678901234567" + ] + }, + { + "person": { + "name": "Beatriz Costa Ferreira", + "lattes_id": "3456789012345678", + "profile_url": "http://lattes.cnpq.br/3456789012345678", + "last_update": "2024-11-01" + }, + "production_5y": { + "publications": { + "total": 8, + "by_type": { + "journal": 5, + "conference": 3 + }, + "top_items": [ + { + "title": "Data-Driven Decision Making in Government Services", + "year": 2022, + "venue": "International Conference on E-Government", + "type": "conference", + "authors": ["Ana Silva Santos", "João Pedro", "Beatriz Costa Ferreira"] + }, + { + "title": "Blockchain Applications in Public Administration", + "year": 2021, + "venue": "Government Information Quarterly", + "type": "journal", + "authors": ["Beatriz Costa Ferreira", "Roberto Nunes"] + } + ] + }, + "projects": { + "total": 1, + "active": [], + "concluded": [ + { + "title": "Open Data Platform Development", + "role": "Researcher", + "sponsor": "Ministry of Planning", + "start_year": 2020, + "end_year": 2021 + } + ] + }, + "advising": { + "total": 1, + "ongoing": [ + { + "name": "Sofia Almeida", + "level": "MS", + "start_year": 2023 + } + ], + "concluded": [] + }, + "activities": [ + { + "name": "National E-Government Workshop - Organization", + "role": "Organizing Committee", + "year": 2022 + } + ] + }, + "affiliations_5y": [ + { + "institution": "Universidade Federal do Rio de Janeiro", + "department": "Instituto de Computação", + "lab_group": null, + "start_year": 2019, + "end_year": null + } + ], + "coauthors_5y": [ + { + "name": "Ana Silva Santos", + "count": 2 + }, + { + "name": "João Pedro", + "count": 1 + }, + { + "name": "Roberto Nunes", + "count": 3 + } + ], + "warnings": [ + "browser-use library not installed - using mock data. Install with: pip install browser-use" + ], + "evidence_urls": [ + "http://lattes.cnpq.br/3456789012345678" + ] + } + ], + "coi_matrix": { + "pairs": [ + { + "a_lattes_id": "1234567890123456", + "b_lattes_id": "2345678901234567", + "a_name": "Ana Silva Santos", + "b_name": "Carlos Oliveira Lima", + "rules_triggered": ["R1", "R3", "R4", "R6", "R7"], + "confidence": "high", + "evidence": [ + "Shared publication: Machine Learning Applications in Public Health Systems (2023)", + "Shared publication: Digital Health Systems Analysis (2022)", + "Shared publication: AI in Healthcare Applications (2021)", + "Shared publication: Public Health Data Mining (2020)", + "Same affiliation: Universidade de São Paulo - Instituto de Ciências Matemáticas e de Computação", + "Same lab/group: Laboratório de Inteligência Computacional", + "Shared project: AI for Public Services Modernization", + "Shared project: Digital Transformation in Healthcare" + ] + }, + { + "a_lattes_id": "1234567890123456", + "b_lattes_id": "3456789012345678", + "a_name": "Ana Silva Santos", + "b_name": "Beatriz Costa Ferreira", + "rules_triggered": ["R1", "R4", "R5"], + "confidence": "medium", + "evidence": [ + "Shared publication: Data-Driven Decision Making in Government Services (2022)", + "Shared publication: E-Government Implementation Challenges (2021)", + "Shared project: Open Data Platform Development", + "Shared activity: National E-Government Workshop - Organization" + ] + } + ] + }, + "summary_text": "Analysis of 3 researchers over the last 5 years. Detected 2 potential conflict(s) of interest. Confidence levels: 1 high, 1 medium, 0 low." 
+} + diff --git a/tools/cnpq_lattes_navigator/examples/valid_committee.json b/tools/cnpq_lattes_navigator/examples/valid_committee.json new file mode 100644 index 0000000..de8bfac --- /dev/null +++ b/tools/cnpq_lattes_navigator/examples/valid_committee.json @@ -0,0 +1,63 @@ +{ + "student": { + "name": "Matheus Yasuo Ribeiro Utino", + "lattes_id": "6191612710855387" + }, + "advisor": { + "name": "Ricardo Marcondes Marcacini", + "lattes_id": "3272611282260295" + }, + "committee_members": [ + { + "name": "Ricardo Marcondes Marcacini", + "lattes_id": "3272611282260295", + "email": "ricardo.marcacini@icmc.usp.br", + "institution": "ICMC-USP", + "role": "advisor", + "is_president": true + }, + { + "name": "Solange Oliveira Rezende", + "lattes_id": "8526960535874806", + "email": "solange@icmc.usp.br", + "institution": "ICMC-USP", + "role": "internal", + "is_president": false + }, + { + "name": "Bruno Magalhães Nogueira", + "lattes_id": "0544106600515308", + "email": "bruno.nogueira@ufms.br", + "institution": "UFMS", + "role": "external", + "is_president": false + }, + { + "name": "Ricardo Cerri", + "lattes_id": "6266519868438512", + "email": "cerri@icmc.usp.br", + "institution": "ICMC-USP", + "role": "substitute", + "is_president": false + }, + { + "name": "Renato Tinós", + "lattes_id": "1273134370963830", + "email": "rtinos@ffclrp.usp.br", + "institution": "FFCLRP", + "role": "substitute", + "is_president": false + }, + { + "name": "Jônata Tyska Carvalho", + "lattes_id": "9494364044256921", + "email": "jonata.tyska@ufsc.br", + "institution": "UFSC", + "role": "substitute", + "is_president": false + } + ], + "thesis_title": "Unstructured Text Mining in the Era of Large Language Models", + "committee_type": "qualification", + "time_window": 5 +} \ No newline at end of file diff --git a/tools/cnpq_lattes_navigator/schema.json b/tools/cnpq_lattes_navigator/schema.json new file mode 100644 index 0000000..de2d879 --- /dev/null +++ b/tools/cnpq_lattes_navigator/schema.json @@ -0,0 +1,497 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "CNPq/Lattes COI Analysis Output Schema", + "description": "Schema for the output of CNPq/Lattes Navigator tool - Conflict of Interest detection and 5-year production summary", + "type": "object", + "properties": { + "status": { + "type": "string", + "enum": [ + "success", + "error" + ], + "description": "Status of the analysis execution" + }, + "execution_metadata": { + "type": "object", + "description": "Metadata about the analysis execution", + "properties": { + "execution_date": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 timestamp of when the analysis was run" + }, + "time_window_years": { + "type": "integer", + "description": "Number of years analyzed" + }, + "cutoff_date": { + "type": "string", + "format": "date-time", + "description": "ISO 8601 date threshold for filtering data" + }, + "num_researchers": { + "type": "integer", + "description": "Total number of researchers analyzed" + }, + "coi_rules_active": { + "type": "object", + "description": "Configuration of which COI rules were enabled", + "properties": { + "R1": { + "type": "boolean", + "description": "Co-authorship (≥1 publication)" + }, + "R2": { + "type": "boolean", + "description": "Advisor-advisee relationship" + }, + "R3": { + "type": "boolean", + "description": "Institutional overlap" + }, + "R4": { + "type": "boolean", + "description": "Project team overlap" + }, + "R5": { + "type": "boolean", + "description": "Committee/board/event overlap" + 
}, + "R6": { + "type": "boolean", + "description": "Frequent co-authorship (≥3 publications)" + }, + "R7": { + "type": "boolean", + "description": "Strong institutional proximity (same lab/group)" + } + } + } + }, + "required": [ + "execution_date", + "time_window_years", + "num_researchers" + ] + }, + "researchers": { + "type": "array", + "description": "Array of researcher profiles with production data", + "items": { + "type": "object", + "properties": { + "person": { + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Researcher name" + }, + "lattes_id": { + "type": "string", + "description": "CNPq Lattes ID" + }, + "profile_url": { + "type": "string", + "format": "uri", + "description": "URL to public Lattes profile" + }, + "last_update": { + "type": [ + "string", + "null" + ], + "description": "Last profile update date if available" + } + }, + "required": [ + "name", + "lattes_id", + "profile_url" + ] + }, + "production_5y": { + "type": "object", + "description": "Academic production within the time window", + "properties": { + "publications": { + "type": "object", + "properties": { + "total": { + "type": "integer", + "description": "Total number of publications" + }, + "by_type": { + "type": "object", + "description": "Count of publications by type", + "additionalProperties": { + "type": "integer" + } + }, + "top_items": { + "type": "array", + "description": "Top/most recent publications", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "year": { + "type": "integer" + }, + "venue": { + "type": "string" + }, + "type": { + "type": "string" + }, + "authors": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + } + } + }, + "projects": { + "type": "object", + "properties": { + "total": { + "type": "integer", + "description": "Total number of projects" + }, + "active": { + "type": "array", + "description": "Currently active projects", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "role": { + "type": "string" + }, + "sponsor": { + "type": "string" + }, + "start_year": { + "type": "integer" + }, + "end_year": { + "type": [ + "integer", + "null" + ] + } + } + } + }, + "concluded": { + "type": "array", + "description": "Concluded projects", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string" + }, + "role": { + "type": "string" + }, + "sponsor": { + "type": "string" + }, + "start_year": { + "type": "integer" + }, + "end_year": { + "type": "integer" + } + } + } + } + } + }, + "advising": { + "type": "object", + "properties": { + "total": { + "type": "integer", + "description": "Total advisees" + }, + "ongoing": { + "type": "array", + "description": "Ongoing advising", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "level": { + "type": "string", + "enum": [ + "MS", + "PhD", + "Postdoc", + "IC" + ] + }, + "start_year": { + "type": "integer" + } + } + } + }, + "concluded": { + "type": "array", + "description": "Concluded advising", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "level": { + "type": "string", + "enum": [ + "MS", + "PhD", + "Postdoc", + "IC" + ] + }, + "year": { + "type": "integer" + } + } + } + } + } + }, + "activities": { + "type": "array", + "description": "Committee, board, and event participation", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "role": { + "type": "string" + }, + 
"year": { + "type": "integer" + } + } + } + } + } + }, + "affiliations_5y": { + "type": "array", + "description": "Institutional affiliations within time window", + "items": { + "type": "object", + "properties": { + "institution": { + "type": "string" + }, + "department": { + "type": "string" + }, + "lab_group": { + "type": [ + "string", + "null" + ] + }, + "start_year": { + "type": [ + "integer", + "null" + ] + }, + "end_year": { + "type": [ + "integer", + "null" + ] + } + } + } + }, + "coauthors_5y": { + "type": "array", + "description": "Unique coauthors with publication counts", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "count": { + "type": "integer", + "description": "Number of co-authored publications" + } + } + } + }, + "warnings": { + "type": "array", + "description": "Warnings or issues during data extraction", + "items": { + "type": "string" + } + }, + "evidence_urls": { + "type": "array", + "description": "URLs used as evidence for extracted data", + "items": { + "type": "string", + "format": "uri" + } + } + }, + "required": [ + "person", + "production_5y", + "affiliations_5y", + "coauthors_5y", + "warnings", + "evidence_urls" + ] + } + }, + "coi_matrix": { + "type": "object", + "description": "Pairwise conflict of interest analysis", + "properties": { + "pairs": { + "type": "array", + "description": "Detected COI pairs", + "items": { + "type": "object", + "properties": { + "a_lattes_id": { + "type": "string", + "description": "Lattes ID of first researcher" + }, + "b_lattes_id": { + "type": "string", + "description": "Lattes ID of second researcher" + }, + "a_name": { + "type": "string", + "description": "Name of first researcher" + }, + "b_name": { + "type": "string", + "description": "Name of second researcher" + }, + "rules_triggered": { + "type": "array", + "description": "List of COI rules that were triggered", + "items": { + "type": "string", + "enum": [ + "R1", + "R2", + "R3", + "R4", + "R5", + "R6", + "R7" + ] + } + }, + "confidence": { + "type": "string", + "enum": [ + "high", + "medium", + "low" + ], + "description": "Overall confidence level for the COI detection" + }, + "evidence": { + "type": "array", + "description": "Evidence supporting the COI detection", + "items": { + "type": "string" + } + } + }, + "required": [ + "a_lattes_id", + "b_lattes_id", + "a_name", + "b_name", + "rules_triggered", + "confidence", + "evidence" + ] + } + } + }, + "required": [ + "pairs" + ] + }, + "summary_text": { + "type": "string", + "description": "Human-readable summary of the analysis results" + }, + "error_type": { + "type": "string", + "description": "Type of error if status is 'error'" + }, + "message": { + "type": "string", + "description": "Error message if status is 'error'" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "Timestamp of error if status is 'error'" + } + }, + "required": [ + "status" + ], + "oneOf": [ + { + "properties": { + "status": { + "const": "success" + } + }, + "required": [ + "status", + "execution_metadata", + "researchers", + "coi_matrix", + "summary_text" + ] + }, + { + "properties": { + "status": { + "const": "error" + } + }, + "required": [ + "status", + "error_type", + "message" + ] + } + ] +} \ No newline at end of file diff --git a/tools/cnpq_lattes_navigator/tool/Dockerfile b/tools/cnpq_lattes_navigator/tool/Dockerfile new file mode 100644 index 0000000..3fcc503 --- /dev/null +++ b/tools/cnpq_lattes_navigator/tool/Dockerfile @@ -0,0 +1,26 @@ +FROM 
python:3.11-slim + +ENV PYTHONUNBUFFERED=1 \ + PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \ + BROWSER_USE_CLOUD=true + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libnss3 libnspr4 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 \ + libdbus-1-3 libxkbcommon0 libatspi2.0-0 libxcomposite1 libxdamage1 \ + libxfixes3 libxrandr2 libgbm1 libasound2 libpango-1.0-0 libcairo2 \ + fonts-liberation wget ca-certificates \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --upgrade pip && pip install -r requirements.txt +RUN playwright install chromium && playwright install-deps chromium + +COPY . . + +RUN useradd -m -u 1000 app && chown -R app:app /app /ms-playwright +USER app + +CMD ["python", "-c", "from lattes_navigator import Tools; print('Tool ready')"] + diff --git a/tools/cnpq_lattes_navigator/tool/__init__.py b/tools/cnpq_lattes_navigator/tool/__init__.py new file mode 100644 index 0000000..6845c3a --- /dev/null +++ b/tools/cnpq_lattes_navigator/tool/__init__.py @@ -0,0 +1,4 @@ +from .lattes_navigator import Tools + +__all__ = ["Tools"] + diff --git a/tools/cnpq_lattes_navigator/tool/lattes_navigator.py b/tools/cnpq_lattes_navigator/tool/lattes_navigator.py new file mode 100644 index 0000000..d3c6685 --- /dev/null +++ b/tools/cnpq_lattes_navigator/tool/lattes_navigator.py @@ -0,0 +1,819 @@ +import os +import json +import asyncio +import re +import time +from datetime import datetime, timedelta +from typing import List, Dict, Any, Optional, Tuple +from collections import defaultdict +from pydantic import Field + +BROWSER_USE_AVAILABLE = False +BROWSER_IMPORT_ERROR = None + +try: + from browser_use import Agent, Browser, ChatOpenAI + BROWSER_USE_AVAILABLE = True +except Exception as e: + BROWSER_IMPORT_ERROR = str(e) + + +class Tools: + def __init__(self): + self.start_url = "https://buscatextual.cnpq.br/buscatextual/busca.do?metodo=apresentar" + self.current_year = datetime.now().year + self.browser_available = BROWSER_USE_AVAILABLE + self.rate_limit_delay = 2.0 + self.openai_api_key = os.getenv("OPENAI_API_KEY") + self.openai_model = os.getenv("OPENAI_MODEL", "gpt-4o-mini") + self.use_cloud_browser = os.getenv("BROWSER_USE_CLOUD", "true").lower() == "true" + + def analyze_researchers_coi( + self, + researchers_json: str = Field(..., description='JSON list: [{"name": "...", "lattes_id": "..."}]'), + time_window: int = Field(default=5, description="Years to analyze"), + coi_rules_config: str = Field( + default='{"R1": true, "R2": true, "R3": true, "R4": true, "R5": true, "R6": true, "R7": true}', + description='JSON to enable/disable COI rules' + ) + ) -> str: + try: + researchers = json.loads(researchers_json) + coi_config = json.loads(coi_rules_config) + + if not isinstance(researchers, list) or len(researchers) == 0: + return self._error_response("invalid_input", "researchers_json must be a non-empty list") + + cutoff_date = datetime.now() - timedelta(days=time_window * 365) + + results = { + 'status': 'success', + 'execution_metadata': { + 'execution_date': datetime.now().isoformat(), + 'time_window_years': time_window, + 'cutoff_date': cutoff_date.isoformat(), + 'num_researchers': len(researchers), + 'coi_rules_active': coi_config, + 'browser_use_available': self.browser_available + }, + 'researchers': [], + 'coi_matrix': {'pairs': []}, + 'summary_text': '' + } + + researcher_data = [] + for researcher in researchers: + name = researcher.get('name', '') + lattes_id = researcher.get('lattes_id', '') + + if not name or not 
lattes_id: + results['researchers'].append({ + 'person': {'name': name, 'lattes_id': lattes_id}, + 'warnings': ['Missing name or lattes_id'], + 'production_5y': {}, + 'coauthors_5y': [], + 'evidence_urls': [] + }) + continue + + profile_data = self._extract_researcher_profile(name, lattes_id, cutoff_date) + researcher_data.append(profile_data) + results['researchers'].append(profile_data) + + coi_pairs = self._analyze_coi_pairwise(researcher_data, coi_config, cutoff_date) + results['coi_matrix']['pairs'] = coi_pairs + results['summary_text'] = self._generate_summary(results) + + return json.dumps(results, ensure_ascii=False, indent=2) + + except json.JSONDecodeError as e: + return self._error_response('json_parse_error', f'Invalid JSON: {str(e)}') + except Exception as e: + return self._error_response('unexpected_error', str(e)) + + def _extract_researcher_profile(self, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Dict[str, Any]: + profile_url = f"http://lattes.cnpq.br/{lattes_id}" + warnings = [] + + if not self.browser_available: + warnings.append("browser-use not installed") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + if not self.openai_api_key: + warnings.append("OPENAI_API_KEY not set") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + time.sleep(self.rate_limit_delay) + + try: + extracted_data = self._run_browser_extraction(profile_url, name, lattes_id, cutoff_date, is_student) + + if extracted_data is None: + warnings.append("Extraction failed") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + # Check for error warnings in extracted data + data_warnings = extracted_data.get('warnings', []) + if any(w in data_warnings for w in ['profile_not_found', 'captcha_blocked', 'page_error']): + warnings.extend(data_warnings) + return self._mock_profile(name, lattes_id, profile_url, warnings, extracted_data.get('agent_logs', [])) + + production = self._process_production(extracted_data, cutoff_date) + coauthors = production.pop('coauthors_extracted', []) or extracted_data.get('coauthors', []) + + return { + 'person': { + 'name': name, + 'lattes_id': lattes_id, + 'profile_url': profile_url, + 'last_update': extracted_data.get('last_update') + }, + 'production_5y': production, + 'affiliations_5y': extracted_data.get('affiliations', []), + 'coauthors_5y': coauthors, + 'warnings': warnings + data_warnings, + 'evidence_urls': [profile_url], + 'agent_logs': extracted_data.get('agent_logs', []) + } + except Exception as e: + warnings.append(f"Error: {str(e)}") + return self._mock_profile(name, lattes_id, profile_url, warnings) + + def _mock_profile(self, name: str, lattes_id: str, profile_url: str, warnings: List[str], agent_logs: List[Dict] = None) -> Dict[str, Any]: + return { + 'person': {'name': name, 'lattes_id': lattes_id, 'profile_url': profile_url, 'last_update': None}, + 'production_5y': { + 'publications': {'total': 0, 'by_type': {}, 'top_items': []}, + 'projects': {'total': 0, 'active': [], 'concluded': []}, + 'advising': {'total': 0, 'ongoing': [], 'concluded': []}, + 'activities': [] + }, + 'affiliations_5y': [], + 'coauthors_5y': [], + 'warnings': warnings, + 'evidence_urls': [profile_url], + 'agent_logs': agent_logs or [] + } + + def _run_browser_extraction(self, profile_url: str, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Optional[Dict[str, Any]]: + try: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + try: + return 
loop.run_until_complete(self._async_extraction(profile_url, name, lattes_id, cutoff_date, is_student)) + finally: + loop.close() + except Exception as e: + return {'warnings': [str(e)], 'publications': [], 'projects': [], 'advising': [], 'affiliations': [], 'coauthors': [], 'last_update': None} + + async def _async_extraction(self, profile_url: str, name: str, lattes_id: str, cutoff_date: datetime, is_student: bool = False) -> Dict[str, Any]: + cutoff_year = cutoff_date.year + current_year = datetime.now().year + + llm = ChatOpenAI(model=self.openai_model) + + # Build checkbox step only for students + checkbox_step = "" + if is_student: + checkbox_step = """ +3. CHECK the checkbox with CSS selector "#buscarDemais" - REQUIRED for student search +4. """ + else: + checkbox_step = """ +3. """ + + task = f""" +TASK: Find and extract Lattes CV data for "{name}" (Lattes ID: {lattes_id}). + +TARGET LATTES ID: {lattes_id} + +NAVIGATION: +1. Go to: https://buscatextual.cnpq.br/buscatextual/busca.do?metodo=apresentar +2. Type "{name}" in the search field +{checkbox_step}CLICK button "#botaoBuscaFiltros" +{"5" if is_student else "4"}. CLICK link containing "{name}" in results +{"6" if is_student else "5"}. CLICK button "#idbtnabrircurriculo" +{"7" if is_student else "6"}. VERIFY ID: Look at top of CV for "ID Lattes:" text followed by a number. + The ID must be exactly "{lattes_id}". + If the ID shown is DIFFERENT, go BACK and try the NEXT result in the list. + +CSS SELECTORS: +- Checkbox: #buscarDemais (use CHECK action) +- Search button: #botaoBuscaFiltros (use CLICK action) +- Open CV button: #idbtnabrircurriculo (use CLICK action) + +ID LATTES LOCATION (in CV page): +The ID appears at top of CV like: "ID Lattes: {lattes_id}" +HTML:
  • ID Lattes: {lattes_id}
  • + +EXTRACT (years {cutoff_year}-{current_year}): +- Institution, publications, projects, advising, coauthors + +OUTPUT JSON: +```json +{{ + "last_update": null, + "affiliations": [{{"institution": "...", "department": "..."}}], + "publications": [{{"title": "...", "year": 2024, "type": "journal", "coauthors": ["..."]}}], + "projects": [{{"title": "...", "start_year": 2022}}], + "advising": [{{"name": "...", "level": "PhD", "year": 2023}}], + "coauthors": [{{"name": "...", "count": 1}}], + "warnings": [] +}} +``` + +ERRORS (only if NO data found): +- {{"warnings": ["profile_not_found"], ...}} if ID {lattes_id} not found in any result +- {{"warnings": ["captcha_blocked"], ...}} if completely blocked +""" + + # Create browser with cloud stealth mode if enabled + browser = None + if self.use_cloud_browser: + browser = Browser( + use_cloud=True, # CRITICAL: Enable cloud browser + cloud_proxy_country_code='br', # Brazil proxy for CNPq + cloud_timeout=15, # 15 min session (free tier max) + wait_between_actions=3.0, + wait_for_network_idle_page_load_time=5.0, + minimum_wait_page_load_time=3.0, + ) + + agent = Agent( + task=task, + llm=llm, + browser=browser, + max_actions_per_step=1 + ) + + max_retries = 1 + last_error = None + + for attempt in range(max_retries + 1): + try: + history = await agent.run(max_steps=50) # Increased to allow iteration through search results + break + except Exception as retry_error: + last_error = retry_error + if attempt < max_retries: + await asyncio.sleep(5) + continue + else: + return { + 'warnings': [f'Failed after {max_retries + 1} attempts: {str(last_error)}'], + 'publications': [], 'projects': [], 'advising': [], + 'affiliations': [], 'coauthors': [], 'last_update': None, + 'agent_logs': [{'error': str(last_error)}] + } + + try: + agent_logs = [] + all_content = [] + + if hasattr(history, 'all_results'): + for i, r in enumerate(history.all_results): + step_log = {'step': i + 1} + if hasattr(r, 'extracted_content') and r.extracted_content: + all_content.append(str(r.extracted_content)) + step_log['content'] = str(r.extracted_content)[:200] + if hasattr(r, 'long_term_memory') and r.long_term_memory: + step_log['memory'] = str(r.long_term_memory)[:200] + if hasattr(r, 'error') and r.error: + step_log['error'] = str(r.error) + agent_logs.append(step_log) + + if hasattr(history, 'final_result') and history.final_result: + all_content.append(str(history.final_result)) + + full_text = '\n'.join(all_content) + + json_block = re.search(r'```json\s*([\s\S]*?)\s*```', full_text) + if json_block: + try: + result = json.loads(json_block.group(1)) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + json_match = re.search(r'\{[^{}]*"warnings"[^{}]*\}', full_text) + if json_match: + try: + result = json.loads(json_match.group()) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + json_match = re.search(r'\{[\s\S]*\}', full_text) + if json_match: + try: + result = json.loads(json_match.group()) + result['agent_logs'] = agent_logs + return result + except json.JSONDecodeError: + pass + + return { + 'warnings': [f'No JSON in response'], + 'publications': [], 'projects': [], 'advising': [], + 'affiliations': [], 'coauthors': [], 'last_update': None, + 'agent_logs': agent_logs, + 'raw_content': full_text[:1000] + } + except Exception as e: + return {'warnings': [f'Error: {str(e)}'], 'publications': [], 'projects': [], 'advising': [], 'affiliations': [], 'coauthors': [], 'last_update': None, 
'agent_logs': []} + + def _deduplicate_publications(self, pubs: List[Dict]) -> List[Dict]: + seen = set() + unique = [] + for pub in pubs: + doi = pub.get('doi') + if doi: + key = doi.lower() + else: + title = self._normalize_name(pub.get('title', '')) + year = pub.get('year', '') + key = f"{title}_{year}" + if key and key not in seen: + seen.add(key) + unique.append(pub) + return unique + + def _extract_coauthors(self, pubs: List[Dict]) -> List[Dict]: + coauthor_count = defaultdict(int) + for pub in pubs: + for coauthor in pub.get('coauthors', []): + if coauthor: + norm_name = self._normalize_name(coauthor) + coauthor_count[coauthor] += 1 + return [{'name': name, 'count': count} for name, count in sorted(coauthor_count.items(), key=lambda x: -x[1])[:20]] + + def _process_production(self, data: Dict[str, Any], cutoff_date: datetime) -> Dict[str, Any]: + pub_by_type = defaultdict(int) + filtered_pubs = [] + + for pub in data.get('publications', []): + year = self._parse_year(pub.get('year')) + if self._in_window(year, cutoff_date): + filtered_pubs.append(pub) + pub_by_type[pub.get('type', 'other')] += 1 + + filtered_pubs = self._deduplicate_publications(filtered_pubs) + coauthors = self._extract_coauthors(filtered_pubs) + + active_proj, concluded_proj = [], [] + for proj in data.get('projects', []): + if self._in_window(self._parse_year(proj.get('start_year')), cutoff_date): + (active_proj if proj.get('status') == 'active' else concluded_proj).append(proj) + + ongoing_adv, concluded_adv = [], [] + for adv in data.get('advising', []): + if self._in_window(self._parse_year(adv.get('year')), cutoff_date): + (ongoing_adv if adv.get('status') == 'ongoing' else concluded_adv).append(adv) + + activities = [] + for act in data.get('activities', []): + if self._in_window(self._parse_year(act.get('year')), cutoff_date): + activities.append(act) + + return { + 'publications': {'total': len(filtered_pubs), 'by_type': dict(pub_by_type), 'top_items': filtered_pubs[:10]}, + 'projects': {'total': len(active_proj) + len(concluded_proj), 'active': active_proj, 'concluded': concluded_proj}, + 'advising': {'total': len(ongoing_adv) + len(concluded_adv), 'ongoing': ongoing_adv, 'concluded': concluded_adv}, + 'activities': activities, + 'coauthors_extracted': coauthors + } + + def _normalize_name(self, name: str) -> str: + if not name: + return "" + normalized = re.sub(r'\s+', ' ', name.lower().strip()) + for a, p in [('á','a'),('à','a'),('â','a'),('ã','a'),('é','e'),('ê','e'),('í','i'),('ó','o'),('ô','o'),('õ','o'),('ú','u'),('ç','c')]: + normalized = normalized.replace(a, p) + return normalized + + def _names_match(self, n1: str, n2: str) -> Tuple[bool, str]: + norm1, norm2 = self._normalize_name(n1), self._normalize_name(n2) + if norm1 == norm2: + return True, 'high' + if norm1 in norm2 or norm2 in norm1: + return True, 'medium' + p1, p2 = norm1.split(), norm2.split() + if p1 and p2 and p1[-1] == p2[-1]: + return True, 'medium' + return False, 'low' + + def _parse_year(self, val: Any) -> Optional[int]: + if val is None: + return None + if isinstance(val, int): + return val if 1900 <= val <= 2100 else None + match = re.search(r'\b(19|20)\d{2}\b', str(val)) + return int(match.group(0)) if match else None + + def _in_window(self, year: Optional[int], cutoff: datetime) -> bool: + return year is not None and year >= cutoff.year + + def _check_r1(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + pubs_a = a.get('production_5y', {}).get('publications', {}).get('top_items', []) + pubs_b = 
b.get('production_5y', {}).get('publications', {}).get('top_items', []) + evidence = [] + + for pa in pubs_a: + ta = self._normalize_name(pa.get('title', '')) + for pb in pubs_b: + if ta and ta == self._normalize_name(pb.get('title', '')): + evidence.append(f"Shared: {pa.get('title')} ({pa.get('year')})") + + name_b = b.get('person', {}).get('name', '') + for co in a.get('coauthors_5y', []): + if self._names_match(co.get('name', ''), name_b)[0]: + evidence.append(f"Coauthor: {co.get('name')} ({co.get('count', 1)}x)") + + return (True, 'high', evidence) if evidence else (False, 'low', []) + + def _check_r2(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for src, tgt, src_name in [(a, b, a), (b, a, b)]: + adv = src.get('production_5y', {}).get('advising', {}) + name = tgt.get('person', {}).get('name', '') + for advisee in adv.get('ongoing', []) + adv.get('concluded', []): + match, conf = self._names_match(name, advisee.get('name', '')) + if match: + return True, conf, [f"{src_name.get('person', {}).get('name')} advised {advisee.get('name')}"] + return False, 'low', [] + + def _check_r3(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('affiliations_5y', []): + ia = self._normalize_name(aa.get('institution', '')) + da = self._normalize_name(aa.get('department', '')) + for ab in b.get('affiliations_5y', []): + ib = self._normalize_name(ab.get('institution', '')) + if ia and ia == ib: + if da and da == self._normalize_name(ab.get('department', '')): + return True, 'high', [f"Same dept: {aa.get('institution')} - {aa.get('department')}"] + return True, 'medium', [f"Same inst: {aa.get('institution')}"] + return False, 'low', [] + + def _check_r4(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + pa = a.get('production_5y', {}).get('projects', {}) + pb = b.get('production_5y', {}).get('projects', {}) + all_a = pa.get('active', []) + pa.get('concluded', []) + all_b = pb.get('active', []) + pb.get('concluded', []) + for p1 in all_a: + t1 = self._normalize_name(p1.get('title', '')) + for p2 in all_b: + if t1 and t1 == self._normalize_name(p2.get('title', '')): + return True, 'high', [f"Shared project: {p1.get('title')}"] + return False, 'low', [] + + def _check_r5(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('production_5y', {}).get('activities', []): + na = self._normalize_name(aa.get('name', '')) + for ab in b.get('production_5y', {}).get('activities', []): + if na and na == self._normalize_name(ab.get('name', '')): + return True, 'medium', [f"Shared activity: {aa.get('name')}"] + return False, 'low', [] + + def _check_r6(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + _, _, evidence = self._check_r1(a, b, cutoff) + return (True, 'high', evidence) if len(evidence) >= 3 else (False, 'low', []) + + def _check_r7(self, a: Dict, b: Dict, cutoff: datetime) -> Tuple[bool, str, List[str]]: + for aa in a.get('affiliations_5y', []): + la = self._normalize_name(aa.get('lab_group', '')) + for ab in b.get('affiliations_5y', []): + if la and la == self._normalize_name(ab.get('lab_group', '')): + return True, 'high', [f"Same lab: {aa.get('lab_group')}"] + return False, 'low', [] + + def _analyze_coi_pairwise(self, data: List[Dict], config: Dict[str, bool], cutoff: datetime) -> List[Dict]: + pairs = [] + checks = {'R1': self._check_r1, 'R2': self._check_r2, 'R3': self._check_r3, 'R4': self._check_r4, 'R5': self._check_r5, 'R6': self._check_r6, 
'R7': self._check_r7} + rule_descriptions = { + 'R1': 'Co-authorship (shared publication)', + 'R2': 'Advisor-advisee relationship', + 'R3': 'Institutional overlap', + 'R4': 'Project overlap', + 'R5': 'Committee/event overlap', + 'R6': 'Frequent co-authorship (3+ publications)', + 'R7': 'Same lab/research group' + } + + for i in range(len(data)): + for j in range(i + 1, len(data)): + a, b = data[i], data[j] + rules_detail = [] + all_evidence = [] + levels = [] + + for rule, fn in checks.items(): + if config.get(rule, True): + triggered, conf, ev = fn(a, b, cutoff) + if triggered: + rules_detail.append({ + 'rule': rule, + 'description': rule_descriptions[rule], + 'confidence': conf, + 'evidence': ev + }) + all_evidence.extend(ev) + levels.append(conf) + + if rules_detail: + pairs.append({ + 'a_lattes_id': a.get('person', {}).get('lattes_id'), + 'b_lattes_id': b.get('person', {}).get('lattes_id'), + 'a_name': a.get('person', {}).get('name'), + 'b_name': b.get('person', {}).get('name'), + 'a_profile_url': a.get('person', {}).get('profile_url'), + 'b_profile_url': b.get('person', {}).get('profile_url'), + 'rules_triggered': [r['rule'] for r in rules_detail], + 'rules_detail': rules_detail, + 'confidence': 'high' if 'high' in levels else ('medium' if 'medium' in levels else 'low'), + 'evidence_summary': all_evidence + }) + return pairs + + def _generate_summary(self, results: Dict) -> str: + n = results['execution_metadata']['num_researchers'] + w = results['execution_metadata']['time_window_years'] + p = len(results['coi_matrix']['pairs']) + + if p == 0: + return f"Analyzed {n} researchers over {w} years. No COI detected." + + h = sum(1 for x in results['coi_matrix']['pairs'] if x['confidence'] == 'high') + m = sum(1 for x in results['coi_matrix']['pairs'] if x['confidence'] == 'medium') + l = p - h - m + return f"Analyzed {n} researchers over {w} years. {p} COI found ({h} high, {m} medium, {l} low)." + + def _error_response(self, error_type: str, message: str) -> str: + return json.dumps({'status': 'error', 'error_type': error_type, 'message': message, 'timestamp': datetime.now().isoformat()}, ensure_ascii=False, indent=2) + + def _collect_all_profiles( + self, + student: Dict[str, str], + advisor: Dict[str, str], + committee_members: List[Dict[str, Any]], + cutoff_date: datetime + ) -> Dict[str, Any]: + """ + Browser Tool: Collect all profiles from Lattes platform. + Returns dict with student_data, advisor_data, and members_data. + + Note: The checkbox "Demais pesquisadores" is only needed for student search. + If student extraction fails, the entire collection is aborted. + """ + collection_log = [] + total = 2 + len([m for m in committee_members if m.get('lattes_id') != advisor.get('lattes_id') and m.get('role') != 'advisor']) + current = 0 + + # Extract student FIRST (requires checkbox "Demais pesquisadores") + current += 1 + collection_log.append(f"Extracting {current}/{total}: {student.get('name', 'Unknown')} (student - requires checkbox)") + student_data = self._extract_researcher_profile( + student.get('name', ''), + student.get('lattes_id', ''), + cutoff_date, + is_student=True # This enables checkbox verification in the prompt + ) + + # Check if student extraction failed - if so, abort the entire collection + student_warnings = student_data.get('warnings', []) + student_failed = any(w in student_warnings for w in ['profile_not_found', 'captcha_blocked', 'page_error', 'Extraction failed']) + + if student_failed: + collection_log.append(f"ABORTED: Student extraction failed. 
Warnings: {student_warnings}") + return { + 'student_data': student_data, + 'advisor_data': None, + 'members_data': [], + 'collection_log': collection_log, + 'aborted': True, + 'abort_reason': f"Student extraction failed: {student_warnings}" + } + + # Extract advisor (no checkbox needed - established researchers appear in default search) + current += 1 + collection_log.append(f"Extracting {current}/{total}: {advisor.get('name', 'Unknown')} (advisor)") + advisor_data = self._extract_researcher_profile( + advisor.get('name', ''), + advisor.get('lattes_id', ''), + cutoff_date, + is_student=False + ) + + # Extract committee members (excluding advisor) - no checkbox needed + members_data = [] + for member in committee_members: + member_role = member.get('role', 'unknown') + if member_role == 'advisor' or member.get('lattes_id') == advisor.get('lattes_id'): + continue + + current += 1 + collection_log.append(f"Extracting {current}/{total}: {member.get('name', 'Unknown')} ({member_role})") + member_data = self._extract_researcher_profile( + member.get('name', ''), + member.get('lattes_id', ''), + cutoff_date, + is_student=False + ) + members_data.append({ + 'member_info': member, + 'profile_data': member_data + }) + + return { + 'student_data': student_data, + 'advisor_data': advisor_data, + 'members_data': members_data, + 'collection_log': collection_log, + 'aborted': False + } + + def _judge_committee( + self, + student_data: Dict[str, Any], + members_data: List[Dict[str, Any]], + coi_config: Dict[str, bool], + cutoff_date: datetime + ) -> Dict[str, Any]: + """ + Judge Tool: Analyze COI between student and each committee member. + No browser operations - pure data analysis. + """ + members_analysis = [] + conflicts = [] + + for member_entry in members_data: + member_info = member_entry['member_info'] + member_profile = member_entry['profile_data'] + member_role = member_info.get('role', 'unknown') + + # Analyze COI between student and this member + coi_result = self._analyze_coi_pair(student_data, member_profile, coi_config, cutoff_date) + + member_analysis = { + 'member': { + 'name': member_info.get('name'), + 'lattes_id': member_info.get('lattes_id'), + 'role': member_role, + 'institution': member_info.get('institution'), + 'profile_url': member_profile.get('person', {}).get('profile_url') + }, + 'extraction_warnings': member_profile.get('warnings', []), + 'coi_detected': coi_result['has_coi'], + 'coi_details': coi_result['details'] + } + + members_analysis.append(member_analysis) + + if coi_result['has_coi']: + conflicts.append({ + 'student_name': student_data.get('person', {}).get('name'), + 'member_name': member_info.get('name'), + 'member_role': member_role, + 'rules_triggered': coi_result['rules_triggered'], + 'confidence': coi_result['confidence'], + 'evidence': coi_result['evidence'] + }) + + return { + 'members_analysis': members_analysis, + 'conflicts': conflicts, + 'has_conflicts': len(conflicts) > 0 + } + + def validate_committee( + self, + student: Dict[str, str], + advisor: Dict[str, str], + committee_members: List[Dict[str, Any]], + time_window: int = 5, + coi_rules_config: Dict[str, bool] = None + ) -> str: + """ + Validate academic committee for conflicts of interest. + + Architecture: + 1. _collect_all_profiles() - Browser Tool: extracts all Lattes profiles + 2. _judge_committee() - Judge Tool: analyzes COI (no browser) + + Analyzes COI only between student and non-advisor committee members. + Advisor-student COI is expected and excluded from analysis. 
+ Member-member COI is not relevant for committee validation. + """ + try: + coi_config = coi_rules_config or {"R1": True, "R2": True, "R3": True, "R4": True, "R5": True, "R6": True, "R7": True} + cutoff_date = datetime.now() - timedelta(days=time_window * 365) + + results = { + 'status': 'valid', + 'execution_metadata': { + 'execution_date': datetime.now().isoformat(), + 'time_window_years': time_window, + 'cutoff_date': cutoff_date.isoformat(), + 'coi_rules_active': coi_config, + 'browser_use_available': self.browser_available + }, + 'student': None, + 'advisor': None, + 'members_analysis': [], + 'conflicts': [], + 'collection_log': [], + 'summary': '' + } + + # STEP 1: Browser Tool - Collect all profiles + collected = self._collect_all_profiles(student, advisor, committee_members, cutoff_date) + + results['student'] = collected['student_data'] + results['advisor'] = collected.get('advisor_data') + results['collection_log'] = collected['collection_log'] + + # Check if collection was aborted (student extraction failed) + if collected.get('aborted'): + results['status'] = 'error' + results['summary'] = f"Collection aborted: {collected.get('abort_reason', 'Student extraction failed')}" + return json.dumps(results, ensure_ascii=False, indent=2) + + # STEP 2: Judge Tool - Analyze COI (no browser operations) + judgment = self._judge_committee( + collected['student_data'], + collected['members_data'], + coi_config, + cutoff_date + ) + + results['members_analysis'] = judgment['members_analysis'] + results['conflicts'] = judgment['conflicts'] + + if judgment['has_conflicts']: + results['status'] = 'invalid' + + # Generate summary + num_members = len(results['members_analysis']) + num_conflicts = len(results['conflicts']) + + if num_conflicts == 0: + results['summary'] = f"Committee valid. Analyzed {num_members} members against student. No conflicts detected." + else: + conflict_names = [c['member_name'] for c in results['conflicts']] + results['summary'] = f"Committee INVALID. {num_conflicts} conflict(s) detected with: {', '.join(conflict_names)}." 
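+            # At this point `status` is either 'valid' (no student-member conflict)
+            # or 'invalid' (at least one COI rule fired); the aborted-collection case
+            # already returned with status 'error' above.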
+ + return json.dumps(results, ensure_ascii=False, indent=2) + + except Exception as e: + return self._error_response('unexpected_error', str(e)) + + def _analyze_coi_pair(self, a: Dict, b: Dict, config: Dict[str, bool], cutoff: datetime) -> Dict[str, Any]: + """Analyze COI between two researchers (student vs member).""" + checks = { + 'R1': self._check_r1, + 'R2': self._check_r2, + 'R3': self._check_r3, + 'R4': self._check_r4, + 'R5': self._check_r5, + 'R6': self._check_r6, + 'R7': self._check_r7 + } + rule_descriptions = { + 'R1': 'Co-authorship (shared publication)', + 'R2': 'Advisor-advisee relationship', + 'R3': 'Institutional overlap', + 'R4': 'Project overlap', + 'R5': 'Committee/event overlap', + 'R6': 'Frequent co-authorship (3+ publications)', + 'R7': 'Same lab/research group' + } + + details = [] + all_evidence = [] + rules_triggered = [] + levels = [] + + for rule, fn in checks.items(): + if config.get(rule, True): + triggered, conf, ev = fn(a, b, cutoff) + if triggered: + rules_triggered.append(rule) + details.append({ + 'rule': rule, + 'description': rule_descriptions[rule], + 'confidence': conf, + 'evidence': ev + }) + all_evidence.extend(ev) + levels.append(conf) + + has_coi = len(rules_triggered) > 0 + confidence = 'high' if 'high' in levels else ('medium' if 'medium' in levels else 'low') + + return { + 'has_coi': has_coi, + 'rules_triggered': rules_triggered, + 'confidence': confidence if has_coi else None, + 'evidence': all_evidence, + 'details': details + } diff --git a/tools/cnpq_lattes_navigator/tool/requirements.txt b/tools/cnpq_lattes_navigator/tool/requirements.txt new file mode 100644 index 0000000..d3554ea --- /dev/null +++ b/tools/cnpq_lattes_navigator/tool/requirements.txt @@ -0,0 +1,5 @@ +pydantic>=2.0.0 +python-dateutil>=2.8.0 +browser-use +playwright + diff --git a/tools/open_alex_doi.py b/tools/open_alex_doi.py new file mode 100644 index 0000000..eec9bb6 --- /dev/null +++ b/tools/open_alex_doi.py @@ -0,0 +1,195 @@ +import os +import requests +import json +from pydantic import Field + +class Tools: + def __init__(self): + pass + + def _clean_doi(self, doi: str) -> str: + """ + Clean and normalize a DOI string by removing common prefixes. + + Args: + doi: The DOI string to clean + + Returns: + Cleaned DOI string without prefixes like 'doi:', 'https://doi.org/', etc. + """ + doi_clean = doi.strip() + + # Remove common DOI prefixes + if doi_clean.lower().startswith('doi:'): + doi_clean = doi_clean[4:].strip() + if doi_clean.startswith('https://doi.org/'): + doi_clean = doi_clean.replace('https://doi.org/', '') + if doi_clean.startswith('http://doi.org/'): + doi_clean = doi_clean.replace('http://doi.org/', '') + + return doi_clean + + def get_openalex_metadata_by_doi( + self, + doi: str = Field( + ..., + description="The DOI (Digital Object Identifier) of the publication, e.g., '10.1371/journal.pone.0000000'" + ) + ) -> str: + """ + Retrieve essential metadata and impact indicators for a scientific publication from OpenAlex API. 
+ + Returns a JSON string containing: + - Basic metadata (title, authors, venue, publication year) + - Impact indicators (citations, percentiles, FWCI) + + Args: + doi: The DOI of the publication to query + + Returns: + JSON string with structured publication data and impact metrics + """ + + # Clean the DOI using the helper function + doi_clean = self._clean_doi(doi) + + # Build OpenAlex API endpoint URL + base_url = f"https://api.openalex.org/works/doi:{doi_clean}" + + # Optional: Add email for polite pool access (faster and more reliable) + # Set OPENALEX_EMAIL environment variable to use this feature + email = os.getenv("OPENALEX_EMAIL", None) + params = {} + if email: + params['mailto'] = email + + try: + # Make request to OpenAlex API + response = requests.get(base_url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + # ======================================== + # BASIC METADATA EXTRACTION + # ======================================== + + # Extract core publication information + title = data.get('title', None) + publication_year = data.get('publication_year', None) + publication_date = data.get('publication_date', None) + type_crossref = data.get('type_crossref', None) + + # Extract and format authors list + # Only include author name for simplicity + authors_list = data.get('authorships', []) + authors = [ + author_info.get('author', {}).get('display_name') + for author_info in authors_list + ] + + # Extract venue/journal information + primary_location = data.get('primary_location', {}) + source = primary_location.get('source', {}) or {} + venue_name = source.get('display_name') + + # ======================================== + # IMPACT INDICATORS EXTRACTION + # ======================================== + + # Total number of citations + cited_by_count = data.get('cited_by_count', 0) + + # Citation normalized percentile + # Compares citation count to similar publications (by year, type, field) + citation_normalized_percentile = data.get('citation_normalized_percentile', {}) or {} + percentile_value = citation_normalized_percentile.get('value') + is_top_1_percent = citation_normalized_percentile.get('is_in_top_1_percent', False) + + # Cited by percentile year + # Percentile ranking among publications from the same year + cited_by_percentile_year = data.get('cited_by_percentile_year', {}) or {} + percentile_min = cited_by_percentile_year.get('min') + percentile_max = cited_by_percentile_year.get('max') + + # Field-Weighted Citation Impact (FWCI) + # Value of 1.0 means average for the field + # >1.0 means above average, <1.0 means below average + fwci = data.get('fwci') + + # ======================================== + # BUILD STRUCTURED RESPONSE + # ======================================== + + result = { + 'status': 'success', + 'doi': doi_clean, + 'openalex_id': data.get('id'), + + # Basic publication metadata + 'metadata': { + 'title': title, + 'authors': authors, + 'venue': venue_name, + 'publication_year': publication_year, + 'publication_date': publication_date, + 'type': type_crossref + }, + + # Citation and impact metrics + 'impact_indicators': { + 'cited_by_count': cited_by_count, + 'citation_normalized_percentile': { + 'value': percentile_value, + 'is_in_top_1_percent': is_top_1_percent + }, + 'cited_by_percentile_year': { + 'min': percentile_min, + 'max': percentile_max + }, + 'fwci': fwci + }, + + # Useful links + 'links': { + 'doi_url': f'https://doi.org/{doi_clean}', + 'openalex_url': data.get('id') + } + } + + # Return as formatted JSON 
string + return json.dumps(result, ensure_ascii=False, indent=2) + + # ======================================== + # ERROR HANDLING + # ======================================== + + except requests.exceptions.HTTPError as e: + # Handle HTTP errors (e.g., 404 Not Found) + error_result = { + 'status': 'error', + 'error_type': 'http_error', + 'error_code': e.response.status_code, + 'message': f'Publication not found for DOI: {doi_clean}' if e.response.status_code == 404 else str(e), + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except requests.exceptions.RequestException as e: + # Handle connection errors + error_result = { + 'status': 'error', + 'error_type': 'connection_error', + 'message': f'Error connecting to OpenAlex API: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) + + except Exception as e: + # Handle any other unexpected errors + error_result = { + 'status': 'error', + 'error_type': 'unexpected_error', + 'message': f'Unexpected error: {str(e)}', + 'doi': doi_clean + } + return json.dumps(error_result, ensure_ascii=False, indent=2) \ No newline at end of file