|
| 1 | +""" |
| 2 | +This example shows how to add spatial data to an information retrieval |
| 3 | +system. Such systems work by converting documents into a collection of |
| 4 | +"index terms" (e.g., representing words or phrases), and then building an |
| 5 | +"inverted index" that maps each term to a list of documents (and document |
| 6 | +positions) where that term occurs. |
| 7 | +
|
| 8 | +This example shows how to convert spatial data into index terms, which can |
| 9 | +then be indexed along with the other document information. |
| 10 | +
|
| 11 | +This is a port of the C++ term_index.cc example for the Python API. |
| 12 | +""" |
| 13 | +import argparse |
| 14 | +from collections import defaultdict |
| 15 | + |
| 16 | +import pywraps2 as s2 |
| 17 | + |
| 18 | + |
def main():
    """Build a toy spatial inverted index and run random cap queries on it.

    Command-line flags:
      --num_documents    number of random points ("documents") to index.
      --num_queries      number of random disc-shaped queries to run.
      --query_radius_km  radius of each query disc, in kilometers.

    Prints the total number of (query, document) matches found.
    """
    parser = argparse.ArgumentParser(
        description=(
            "This example shows how to convert spatial data into index terms, "
            "which can then be indexed along with the other document "
            "information."
        )
    )
    parser.add_argument(
        '--num_documents', type=int, default=10000, help="Number of documents"
    )
    parser.add_argument(
        '--num_queries', type=int, default=10000, help="Number of queries"
    )
    parser.add_argument(
        '--query_radius_km', type=float, default=100,
        help="Query radius in kilometers"
    )

    args = parser.parse_args()

    # A prefix added to spatial terms to distinguish them from other index
    # terms (e.g. representing words or phrases).
    PREFIX = "s2:"

    # Create a set of "documents" to be indexed.  Each document consists of a
    # single point.  (You can easily substitute any S2Region type here, or
    # even index a mixture of region types.  Other region types include
    # polygons, polylines, rectangles, discs, buffered geometry, etc.)
    documents = [
        s2.S2Testing.RandomPoint() for _ in range(args.num_documents)
    ]

    # We use a dict as our inverted index.  The key is an index term, and
    # the value is the set of "document ids" where this index term is present.
    index = defaultdict(set)

    # Create an indexer suitable for an index that contains points only.
    # (You may also want to adjust min_level() or max_level() if you plan
    # on querying very large or very small regions.)
    indexer = s2.S2RegionTermIndexer()
    indexer.set_index_contains_points_only(True)

    # Add the documents to the index.
    for docid, index_region in enumerate(documents):
        for term in indexer.GetIndexTerms(index_region, PREFIX):
            index[term].add(docid)

    # Convert the query radius to an angle representation.
    radius = s2.S1Angle.Radians(s2.S2Earth.KmToRadians(args.query_radius_km))

    # Count the number of documents (points) found in all queries.
    num_found = 0
    for _ in range(args.num_queries):
        # Choose a random center for query.
        query_region = s2.S2Cap(s2.S2Testing.RandomPoint(), radius)

        # Convert the query region to a set of terms, and compute the union of
        # the document ids associated with those terms.  (An actual
        # information retrieval system would do something more sophisticated.)
        #
        # Look terms up with .get() rather than index[term]: subscripting a
        # defaultdict inserts an empty set for every missing term, which would
        # make the index grow with each query even though queries are
        # read-only.
        candidates = set()
        for term in indexer.GetQueryTerms(query_region, PREFIX):
            candidates.update(index.get(term, ()))

        # "candidates" now contains all documents that intersect the query
        # region, along with some documents that nearly intersect it.  We
        # prune the results by retrieving the original "document" and checking
        # containment precisely.  In this example we only count the matches,
        # so no per-query result list is materialized.
        num_found += sum(
            1 for docid in candidates
            if query_region.Contains(documents[docid])
        )

    print("Found %d points in %d queries" % (num_found, args.num_queries))
| 98 | + |
| 99 | + |
# Run the example only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
0 commit comments