From 34d1af128733a88dfcd7e801cb0b218b28944425 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Sat, 27 Sep 2025 14:21:03 -0400 Subject: [PATCH 1/4] Add wilsonl.in-search dataset --- datasets/wilsonl.in-search.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 datasets/wilsonl.in-search.yaml diff --git a/datasets/wilsonl.in-search.yaml b/datasets/wilsonl.in-search.yaml new file mode 100644 index 000000000..734d5d4d9 --- /dev/null +++ b/datasets/wilsonl.in-search.yaml @@ -0,0 +1,29 @@ +Name: "search.wilsonl.in Web Search Index Crawl + Text Embeddings" +Description: 'search.wilsonl.in is a web search engine built from scratch using neural embeddings, RocksDB, HNSW. This dataset contains the index, source documents, and text embeddings for 280M pages.' +Documentation: https://blog.wilsonl.in/search-engine-open-data +Contact: wl@wilsonl.in +ManagedBy: Wilson Lin +UpdateFrequency: The dataset has been finalized and will not be updated. +Tags: + - aws-pds + - natural language processing + - internet + - web archive + - semantic search + - text embeddings +License: "[CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)" +Resources: + - Description: Dataset files + ARN: arn:aws:s3:::aws-opendata.wilsonl.in/search-engine + Region: us-east-1 + Type: S3 Bucket +DataAtWork: + Publications: + - Title: "Building a web search engine from scratch in two months with 3 billion neural embeddings" + URL: https://blog.wilsonl.in/search-engine/ + AuthorName: Wilson Lin + Tutorials: + - Title: Notebook Tutorials + URL: https://github.com/wilsonzlin/datasets/search-engine-open-data/notebooks/ + AuthorName: Wilson Lin + AuthorURL: https://github.com/wilsonzlin \ No newline at end of file From c3abd925a59d00292826c6f49391baa24b347698 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 20 Oct 2025 00:19:14 -0400 Subject: [PATCH 2/4] Update description page URL --- datasets/wilsonl.in-search.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/datasets/wilsonl.in-search.yaml b/datasets/wilsonl.in-search.yaml index 734d5d4d9..e4b30eda3 100644 --- a/datasets/wilsonl.in-search.yaml +++ b/datasets/wilsonl.in-search.yaml @@ -1,6 +1,6 @@ Name: "search.wilsonl.in Web Search Index Crawl + Text Embeddings" Description: 'search.wilsonl.in is a web search engine built from scratch using neural embeddings, RocksDB, HNSW. This dataset contains the index, source documents, and text embeddings for 280M pages.' -Documentation: https://blog.wilsonl.in/search-engine-open-data +Documentation: https://github.com/wilsonzlin/datasets/tree/master/search-engine-open-data/ Contact: wl@wilsonl.in ManagedBy: Wilson Lin UpdateFrequency: The dataset has been finalized and will not be updated. From 523cfb88a9e5979d7d8ca810913e2026cee0eb2e Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 20 Oct 2025 00:21:03 -0400 Subject: [PATCH 3/4] Update tutorial --- datasets/wilsonl.in-search.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/wilsonl.in-search.yaml b/datasets/wilsonl.in-search.yaml index e4b30eda3..7f38d3f85 100644 --- a/datasets/wilsonl.in-search.yaml +++ b/datasets/wilsonl.in-search.yaml @@ -23,7 +23,7 @@ DataAtWork: URL: https://blog.wilsonl.in/search-engine/ AuthorName: Wilson Lin Tutorials: - - Title: Notebook Tutorials - URL: https://github.com/wilsonzlin/datasets/search-engine-open-data/notebooks/ + - Title: Notebook Tutorial + URL: https://github.com/wilsonzlin/datasets/blob/master/search-engine-open-data/notebooks/get-to-know-a-dataset.ipynb AuthorName: Wilson Lin AuthorURL: https://github.com/wilsonzlin \ No newline at end of file From 3ba94867541209e9190b0d87e47336467b799d71 Mon Sep 17 00:00:00 2001 From: Wilson Lin Date: Mon, 20 Oct 2025 00:22:27 -0400 Subject: [PATCH 4/4] Update tutorial --- datasets/wilsonl.in-search.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/wilsonl.in-search.yaml b/datasets/wilsonl.in-search.yaml index 7f38d3f85..32b38f281 
100644 --- a/datasets/wilsonl.in-search.yaml +++ b/datasets/wilsonl.in-search.yaml @@ -23,7 +23,7 @@ DataAtWork: URL: https://blog.wilsonl.in/search-engine/ AuthorName: Wilson Lin Tutorials: - - Title: Notebook Tutorial + - Title: "Get To Know A Dataset: search.wilsonl.in Web Search Index Crawl + Text Embeddings" URL: https://github.com/wilsonzlin/datasets/blob/master/search-engine-open-data/notebooks/get-to-know-a-dataset.ipynb AuthorName: Wilson Lin AuthorURL: https://github.com/wilsonzlin \ No newline at end of file