-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathpg_iterator.py
More file actions
176 lines (160 loc) · 6.75 KB
/
pg_iterator.py
File metadata and controls
176 lines (160 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import psycopg2 as pg
import pdb
import itertools
from utils.utils import *
from collections import defaultdict
from job_parse import JOBQuery
import random
MAX_WORD_LEN = 14
class PGIterator:
    """Iterate over "sentences" extracted from the rows returned by a set of
    SQL queries run against a Postgres database.

    Each sentence is a list of (optionally preprocessed) words built from one
    result row.  A named (server-side) cursor is used so large result sets
    stream instead of being materialized in memory, and sentences are yielded
    one at a time.
    """

    def __init__(self, sql_queries, args):
        """
        @sql_queries: list of SQL query strings to execute.
        @args: parsed command-line arguments controlling connection and
               preprocessing behavior (db_host, db_name, no_id, split_words,
               relevant_selects, synthetic_db_debug, train_pairs, ...).
        """
        self.sql_queries = sql_queries
        self.args = args
        # NOTE(review): not read anywhere in this class; kept for
        # backward compatibility with any external users.
        self.cur_id = 0

    def __iter__(self):
        # Re-create the generator each time so the iterator can be consumed
        # more than once (e.g. across training epochs).
        self.sentences = self.sentence_generator(self.sql_queries, self.args)
        return self

    def __next__(self):
        return next(self.sentences)

    def sentence_generator(self, sql_queries, args):
        '''
        Takes a bunch of sql queries, and generates one sentence at a time,
        using whatever rules have been specified in the args.
        '''
        def find_relevant_attributes():
            # key: table name, value: set of attribute names that appear in
            # a predicate or join of any JOB benchmark query on disk.
            tables = defaultdict(set)
            queries = [JOBQuery(fn) for fn in glob.glob("job/*.sql")]
            for q in queries:
                for a in q.attrs_with_predicate():
                    assert len(a) == 2
                    if "gender=" in a[1]:
                        # known artifact of the query parser; skip it
                        continue
                    if self.args.no_id and "id" in a[1]:
                        continue
                    tables[a[0]].add(a[1])
                for j in q.joins():
                    assert len(j) == 4
                    if "gender=" in j[1] or "gender=" in j[3]:
                        print("gender= parsing error")
                        continue
                    if self.args.no_id and ("id" in j[1] or "id" in j[3]):
                        continue
                    tables[j[0]].add(j[1])
                    tables[j[2]].add(j[3])
            return tables

        def handle_sentence(row, attrs):
            '''
            Convert one result row into a list of sentences, each a list of
            words.  FIXME: handle special cases better.
            '''
            sentences = []
            sentence = []
            if args.synthetic_db_debug:
                # Synthetic-data debugging mode: pair each of the two row
                # values with a word derived from their sum.
                rand_word = str(row[1] + row[2])
                sentences.append([rand_word, row[1]])
                sentences.append([rand_word, row[2]])
                return sentences
            for i, word in enumerate(row):
                if "id" in attrs[i] and args.no_id:
                    continue
                if word is None:
                    continue
                if args.split_words:
                    # Split multi-word values into individual words.  This
                    # doesn't seem right for multi-word things as in
                    # keywords, especially if we need to match LIKE
                    # statements on them...
                    all_words = str(word).split()
                    if len(all_words) > 6:
                        continue
                    for w in all_words:
                        if len(w) > MAX_WORD_LEN:
                            continue
                        if not args.no_preprocess_word:
                            w = preprocess_word(w, exclude_the=args.exclude_the,
                                                exclude_nums=args.exclude_nums)
                        sentence.append(w)
                else:
                    # Stringify BEFORE the length check: row values may be
                    # non-strings (ints, dates), for which len() would raise
                    # a TypeError.  (bugfix)
                    word = str(word)
                    if len(word) > MAX_WORD_LEN:
                        continue
                    if not args.no_preprocess_word:
                        word = preprocess_word(word, exclude_the=args.exclude_the,
                                               exclude_nums=args.exclude_nums)
                    sentence.append(word)
            return [sentence]

        def get_relevant_select(tables, sql):
            # Build the SELECT list: every relevant attribute of every table
            # mentioned in the query, qualified as "table.attr" and joined
            # with commas.  (Replaces the old str(set)-and-replace() hack.)
            sel_attrs = []
            for table in tables:
                if table in sql:
                    sel_attrs.extend(table + "." + attr
                                     for attr in tables[table])
            return ",".join(sel_attrs)

        def get_cursor(query):
            # Named (server-side) cursor so results stream instead of being
            # fetched into client memory all at once.
            conn = pg.connect(host=args.db_host, database=args.db_name)
            try:
                cur = conn.cursor(name="named_cursor_more_efficient")
                print("going to execute: ", query)
                cur.execute(query)
                print("successfully executed query!")
            except Exception:
                # Don't leak the connection when cursor creation or
                # execution fails.
                conn.close()
                raise
            return cur, conn

        if args.relevant_selects:
            tables = find_relevant_attributes()
        print("going to go over sql queries")
        for sql in sql_queries:
            select = "*"
            if args.relevant_selects:
                select = get_relevant_select(tables, sql)
            query = sql.replace("*", select)
            cursor, conn = get_cursor(query)
            try:
                attrs = None
                for row_num, row in enumerate(cursor):
                    if row_num % 1000000 == 0:
                        print("row_num: ", row_num)
                    if attrs is None:
                        # Column names come from the cursor metadata, which
                        # is only populated after execution.
                        attrs = [d[0] for d in cursor.description]
                    if args.train_pairs:
                        assert False, "use same function"
                    else:
                        for cur_sentence in handle_sentence(row, attrs):
                            yield cur_sentence
            finally:
                # Runs even if the consumer abandons the generator early
                # (generator close/GC triggers this finally block), so the
                # cursor and connection are never leaked.
                cursor.close()
                conn.close()