Skip to content

Commit aec3888

Browse files
committed
Python: Port the term_index C++ example to Python.
1 parent 1818780 commit aec3888

File tree

1 file changed

+101
-0
lines changed

1 file changed

+101
-0
lines changed

doc/examples/term_index.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
"""
2+
This example shows how to add spatial data to an information retrieval
3+
system. Such systems work by converting documents into a collection of
4+
"index terms" (e.g., representing words or phrases), and then building an
5+
"inverted index" that maps each term to a list of documents (and document
6+
positions) where that term occurs.
7+
8+
This example shows how to convert spatial data into index terms, which can
9+
then be indexed along with the other document information.
10+
11+
This is a port of the C++ term_index.cc example for the Python API.
12+
"""
13+
import argparse
14+
from collections import defaultdict
15+
16+
import pywraps2 as s2
17+
18+
19+
def main():
    """Index random points under S2 terms and run random cap queries.

    Reads ``--num_documents``, ``--num_queries`` and ``--query_radius_km``
    from the command line, builds an in-memory inverted index mapping each
    index term to the ids of the documents (points) that produced it, runs
    the requested number of cap queries against that index, and prints the
    total number of points found across all queries.
    """
    parser = argparse.ArgumentParser(
        description=(
            "This example shows how to convert spatial data into index terms, "
            "which can then be indexed along with the other document "
            "information."
        )
    )
    parser.add_argument(
        '--num_documents', type=int, default=10000, help="Number of documents"
    )
    parser.add_argument(
        '--num_queries', type=int, default=10000, help="Number of queries"
    )
    parser.add_argument(
        '--query_radius_km', type=float, default=100,
        help="Query radius in kilometers"
    )

    args = parser.parse_args()

    # A prefix added to spatial terms to distinguish them from other index
    # terms (e.g. representing words or phrases).
    PREFIX = "s2:"

    # Create a set of "documents" to be indexed. Each document consists of a
    # single point. (You can easily substitute any S2Region type here, or
    # even index a mixture of region types. Other region types include
    # polygons, polylines, rectangles, discs, buffered geometry, etc.)
    documents = [
        s2.S2Testing.RandomPoint() for _ in range(args.num_documents)
    ]

    # We use a dict as our inverted index. The key is an index term, and
    # the value is the set of "document ids" where this index term is present.
    index = defaultdict(set)

    # Create an indexer suitable for an index that contains points only.
    # (You may also want to adjust min_level() or max_level() if you plan
    # on querying very large or very small regions.)
    indexer = s2.S2RegionTermIndexer()
    indexer.set_index_contains_points_only(True)

    # Add the documents to the index.
    for docid, index_region in enumerate(documents):
        for term in indexer.GetIndexTerms(index_region, PREFIX):
            index[term].add(docid)

    # Convert the query radius to an angle representation.
    radius = s2.S1Angle.Radians(s2.S2Earth.KmToRadians(args.query_radius_km))

    # Count the number of documents (points) found in all queries.
    num_found = 0
    for _ in range(args.num_queries):
        # Choose a random center for query.
        query_region = s2.S2Cap(s2.S2Testing.RandomPoint(), radius)

        # Convert the query region to a set of terms, and compute the union
        # of the document ids associated with those terms. (An actual
        # information retrieval system would do something more sophisticated.)
        candidates = set()
        for term in indexer.GetQueryTerms(query_region, PREFIX):
            # Look up with .get() instead of index[term]: subscripting a
            # defaultdict inserts an empty set for every query term that has
            # no documents, which would grow the index on every query.
            candidates.update(index.get(term, ()))

        # "candidates" now contains all documents that intersect the query
        # region, along with some documents that nearly intersect it. We can
        # prune the results by retrieving the original "document" and
        # checking the distance more precisely.
        result = [
            docid for docid in candidates
            if query_region.Contains(documents[docid])
        ]

        # Now do something with the results (in this example we just count
        # them).
        num_found += len(result)

    print("Found %d points in %d queries" % (num_found, args.num_queries))
98+
99+
100+
# Run the example only when this file is executed as a script, so that
# importing the module does not parse arguments or build the index.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)