-
-
Notifications
You must be signed in to change notification settings - Fork 67
Add WikiCommons fetch #234
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Joyakis
wants to merge
15
commits into
creativecommons:main
Choose a base branch
from
Joyakis:wikicommons
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
c63e8d5
Added wikicommons
Joyakis 8d5366a
Renamed csv
Joyakis 0ee217a
Made it executable
Joyakis 7521245
Made necessary changes
Joyakis 756243e
Followed header naming conventions
Joyakis 01c93f3
Followed header naming conventions(in caps)
Joyakis c055c47
Changed file name
Joyakis 5c48e52
raise QuantifyingException for failed category fetch instead of retur…
Joyakis 1a0a8b5
raise QuantifyingException for failed category fetch instead of retur…
Joyakis 3a27803
Merge branch 'main' into wikicommons
Joyakis 906beea
Added sources
Joyakis 5c4ea28
Added sources
Joyakis 35498c0
Made necessary changes
Joyakis d928671
Included only legal tools
Joyakis 641855e
Rearranged order of sources
Joyakis File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,298 @@ | ||
| #!/usr/bin/env python | ||
| """ | ||
| Fetch high-level WikiCommons statistics for Quantifying the Commons. | ||
| Generates one dataset: | ||
| 1) Recursive category data (aggregated by LICENSE TYPE, File Count, Page Count) | ||
| Uses Wikimedia Commons API to retrieve metadata | ||
| for Creative Commons license categories. | ||
| """ | ||
|
|
||
| # Standard library | ||
| import argparse | ||
| import csv | ||
| import os | ||
| import sys | ||
| import textwrap | ||
| import time | ||
| import traceback | ||
|
|
||
| # Third-party | ||
| from pygments import highlight | ||
| from pygments.formatters import TerminalFormatter | ||
| from pygments.lexers import PythonTracebackLexer | ||
|
|
||
| # Add parent directory for shared imports | ||
| sys.path.append(os.path.join(os.path.dirname(__file__), "..")) | ||
| # First-party/Local | ||
| import shared # noqa: E402 | ||
|
|
||
# Setup
# shared.setup() receives this file's path and returns the logger and the
# paths mapping used by the rest of this script.
LOGGER, PATHS = shared.setup(__file__)

# Constants
# Wikimedia Commons API endpoint queried by every request in this script
BASE_URL = "https://commons.wikimedia.org/w/api.php"
# Output CSV, located under PATHS["data_phase"]
FILE_WIKICOMMONS = shared.path_join(
    PATHS["data_phase"], "wikicommons_legal_tool_counts.csv"
)
# Column names (and order) of the output CSV
HEADER_WIKICOMMONS = ["LICENSE_TYPE", "FILE_COUNT", "PAGE_COUNT"]
# Category title (without the "Category:" prefix) where traversal starts
ROOT_CATEGORY = "Free_Creative_Commons_licenses"
# Passed as timeout= to every session.get() call (presumably seconds)
TIMEOUT = 25
|
|
||
|
|
||
def parse_arguments():
    """
    Parse command-line options.

    Options:
        --enable-save: flag, enables saving results to CSV.
        --enable-git: flag, enables git actions; only valid together with
            --enable-save.
        --limit: optional integer depth limit for the category traversal
            (defaults to None).

    Returns:
        argparse.Namespace with enable_save, enable_git and limit.

    Exits through parser.error() when --enable-git is given without
    --enable-save.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results to CSV.",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, push).",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit recursive depth for testing",
    )

    args = parser.parse_args()
    # Git actions operate on saved output, so they require saving
    if not args.enable_save and args.enable_git:
        parser.error("--enable-git requires --enable-save")
    return args
|
|
||
|
|
||
def get_subcategories(category, session):
    """
    Fetch all subcategory names of a category, following API pagination.

    Args:
        category: Category title without the "Category:" prefix.
        session: HTTP session offering get(url, params=..., timeout=...)
            (presumably the requests session from shared.get_session()).

    Returns:
        List of subcategory titles with the leading "Category:" prefix
        removed. This is best effort: if a request or response parsing
        fails, a warning is logged and the titles collected so far are
        returned.
    """
    prefix = "Category:"
    all_subcats = []
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": f"Category:{category}",
        "cmtype": "subcat",
        "format": "json",
        "cmlimit": "max",
    }

    while True:
        try:
            resp = session.get(BASE_URL, params=params, timeout=TIMEOUT)
            resp.raise_for_status()
            data = resp.json()

            for member in data.get("query", {}).get("categorymembers", []):
                title = member["title"]
                # Strip only the leading namespace prefix; a later
                # "Category:" inside the title is part of the name.
                if title.startswith(prefix):
                    title = title[len(prefix):]
                # Titles are kept raw (no manual "%26" for "&"): query
                # parameters are URL-encoded when the request is built, so
                # pre-encoding here would be encoded a second time and
                # would also leak into the reported category names.
                all_subcats.append(title)

            cmcontinue = data.get("continue", {}).get("cmcontinue")
        except Exception as e:
            LOGGER.warning(
                f"Failed to fetch subcategories for {category}: {e}"
            )
            break

        # Handle pagination: stop when the API returns no continuation token
        if not cmcontinue:
            break
        params["cmcontinue"] = cmcontinue
        time.sleep(0.2)

    return all_subcats
|
|
||
|
|
||
def fetch_category_totals(category, session):
    """
    Fetch total file and page counts for a category.

    Args:
        category: Category title without the "Category:" prefix.
        session: HTTP session offering get(url, params=..., timeout=...).

    Returns:
        Dict with keys FILE_COUNT and PAGE_COUNT, summed over every page
        entry in the response (entries without categoryinfo count as 0).

    Raises:
        shared.QuantifyingException: if the request or the response
            processing fails; the original exception is chained as cause.
    """
    params = {
        "action": "query",
        "prop": "categoryinfo",
        "titles": f"Category:{category}",
        "format": "json",
    }
    try:
        resp = session.get(BASE_URL, params=params, timeout=TIMEOUT)
        resp.raise_for_status()
        pages = resp.json().get("query", {}).get("pages", {})
        file_cnt = 0
        page_cnt = 0
        for info in pages.values():
            catinfo = info.get("categoryinfo", {})
            file_cnt += catinfo.get("files", 0)
            page_cnt += catinfo.get("pages", 0)
    except Exception as e:
        message = f"Failed to fetch contents for {category}: {e}"
        LOGGER.error(message)
        # Chain the cause so the underlying failure stays in the traceback
        raise shared.QuantifyingException(message) from e
    return {"FILE_COUNT": file_cnt, "PAGE_COUNT": page_cnt}
|
|
||
|
|
||
# Helper function to check if a category
# name represents a valid CC license tool
def is_valid_license_tool(category_name):
    """
    Return True if a category name looks like an accepted CC legal tool.

    Accepted:
    - Names starting with 'CC-' that contain 'BY' or 'SA'
      (e.g., CC-BY-4.0, CC-BY-SA-3.0).
    - CC0 Public Domain Dedication: exactly 'CC0' or 'CC-Zero', or names
      starting with 'CC0-'.

    Rejected:
    - 'CC-' names containing 'migrated' or 'Retired' (they look like
      licenses but are markers/subcategories).
    - EXCLUDED CC Licenses (marked 'Not OK' in policy):
      Attribution-NonCommercial (CC BY-NC), Attribution-NoDerivs
      (CC BY-ND), and any combination containing NC or ND restrictions.
    - Everything else, which includes the root category.
    """
    if category_name.startswith("CC-") and any(
        x in category_name for x in ("BY", "SA")
    ):
        # Specific exceptions that look like
        # licenses but are markers/subcategories
        if "migrated" in category_name or "Retired" in category_name:
            return False
        # Compare whole name elements (split on '-', '_' and spaces) so
        # that NC/ND only match as license elements, not inside words
        elements = (
            category_name.replace("_", "-").replace(" ", "-").split("-")
        )
        if "NC" in elements or "ND" in elements:
            return False
        return True

    # Check for CC0 Public Domain Dedication (often just "CC0")
    return (
        category_name == "CC0"
        or category_name.startswith("CC0-")
        or category_name == "CC-Zero"
    )
|
|
||
|
|
||
def recursive_collect_data(session, limit=None):
    """
    Recursively traverse WikiCommons categories and collect data.

    Traversal starts at ROOT_CATEGORY and descends depth-first through
    subcategories; each category is processed at most once.

    Args:
        session: HTTP session handed to fetch_category_totals() and
            get_subcategories().
        limit: Optional depth limit. Categories at depth >= limit are
            skipped (the root is depth 0, so 0 collects nothing and 1
            processes only the root). None means no limit.

    Returns:
        List of dicts with LICENSE_TYPE, FILE_COUNT and PAGE_COUNT, one per
        visited category accepted by is_valid_license_tool() whose totals
        could be fetched.
    """
    results = []
    visited = set()

    def traverse(category, depth=0):
        # "is not None" so that an explicit limit of 0 is honoured instead
        # of being mistaken for "no limit"
        if limit is not None and depth >= limit:
            return
        if category in visited:
            return
        visited.add(category)

        # Only fetch and collect data for valid license tools
        if is_valid_license_tool(category):
            try:
                # Get counts for the current category
                contents = fetch_category_totals(category, session)
                results.append(
                    {
                        # Use the specific license category name
                        # as the LICENSE_TYPE
                        "LICENSE_TYPE": category,
                        "FILE_COUNT": contents["FILE_COUNT"],
                        "PAGE_COUNT": contents["PAGE_COUNT"],
                    }
                )
            except shared.QuantifyingException as e:
                # Best effort: log the failed license category and go on
                LOGGER.error(
                    f"Failed to process valid license category {category}: {e}"
                )

        # Get subcategories (check subcategories,
        # as a valid license might be nested under a non-license category)
        subcats = get_subcategories(category, session)

        # Logging label
        label = "categories" if depth == 0 else "subcategories"
        LOGGER.info(f"Fetched {len(subcats)} {label} for {category}.")

        # Recursively traverse subcategories
        for sub in subcats:
            traverse(sub, depth + 1)
            time.sleep(0.05)  # brief pause between subcategory traversals

    # Start traversal from root
    traverse(ROOT_CATEGORY)
    return results
|
|
||
|
|
||
def write_data(args, wikicommons_data):
    """
    Write WikiCommons data to CSV.

    Args:
        args: Parsed arguments; nothing is written unless args.enable_save
            is true.
        wikicommons_data: List of row dicts keyed by HEADER_WIKICOMMONS.

    Returns:
        The args object that was passed in.
    """
    if not args.enable_save:
        return args

    # Make sure the output directory exists before opening the file
    os.makedirs(PATHS["data_phase"], exist_ok=True)
    with open(
        FILE_WIKICOMMONS, "w", encoding="utf-8", newline="\n"
    ) as file_obj:
        writer = csv.DictWriter(
            file_obj, fieldnames=HEADER_WIKICOMMONS, dialect="unix"
        )
        writer.writeheader()
        writer.writerows(wikicommons_data)

    LOGGER.info(f"Saved {len(wikicommons_data)} rows to {FILE_WIKICOMMONS}.")
    return args
|
|
||
|
|
||
def main():
    """
    Run the WikiCommons fetch.

    Steps, in order: parse arguments, log paths, git fetch and merge,
    create the HTTP session, collect category data, write the CSV, then
    git add/commit and push. The save and git steps receive args, which
    carries the --enable-save / --enable-git flags.
    """
    args = parse_arguments()
    LOGGER.info("Starting WikiCommons data fetch.")
    shared.paths_log(LOGGER, PATHS)
    shared.git_fetch_and_merge(args, PATHS["repo"])
    session = shared.get_session(accept_header="application/json")
    wikicommons_data = recursive_collect_data(session, limit=args.limit)
    args = write_data(args, wikicommons_data)

    args = shared.git_add_and_commit(
        args,
        PATHS["repo"],
        PATHS["data_quarter"],
        "Add WikiCommons dataset for Quantifying the Commons.",
    )
    shared.git_push_changes(args, PATHS["repo"])

    LOGGER.info("WikiCommons fetch completed successfully.")
|
|
||
|
|
||
if __name__ == "__main__":
    # Script entry point: run main() and map each failure mode to a log
    # message and a process exit code.
    try:
        main()
    except shared.QuantifyingException as e:
        # Project exception: exit with the code carried by the exception
        LOGGER.error(e.message)
        sys.exit(e.exit_code)
    except SystemExit as e:
        # Log non-zero exits, then exit again with the same code
        if e.code != 0:
            LOGGER.error(f"System exit with code: {e.code}")
        sys.exit(e.code)
    except KeyboardInterrupt:
        LOGGER.info("(130) Halted via KeyboardInterrupt.")
        sys.exit(130)
    except Exception:
        # Anything else: log a syntax-highlighted, indented traceback
        traceback_formatted = textwrap.indent(
            highlight(
                traceback.format_exc(),
                PythonTracebackLexer(),
                TerminalFormatter(),
            ),
            "    ",
        )
        LOGGER.critical(f"(1) Unhandled exception:\n{traceback_formatted}")
        sys.exit(1)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.