-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathpg_iterator.py
More file actions
176 lines (160 loc) · 6.75 KB
/
pg_iterator.py
File metadata and controls
176 lines (160 loc) · 6.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
import psycopg2 as pg
import pdb
import itertools
from utils.utils import *
from collections import defaultdict
from job_parse import JOBQuery
import random
MAX_WORD_LEN = 14
class PGIterator:
    """Iterate over "sentences" extracted from the rows returned by a set of
    SQL queries run against a Postgres database.

    Each sentence is a list of (optionally preprocessed) words built from one
    result row.  A named (server-side) cursor is used so large result sets
    stream instead of being materialized in memory, and sentences are yielded
    one at a time.
    """

    def __init__(self, sql_queries, args):
        """
        @sql_queries: list of SQL query strings to execute.
        @args: parsed command-line arguments controlling connection and
               preprocessing behavior (db_host, db_name, no_id, split_words,
               relevant_selects, synthetic_db_debug, train_pairs, ...).
        """
        self.sql_queries = sql_queries
        self.args = args
        # NOTE(review): not read anywhere in this class; kept for
        # backward compatibility with any external users.
        self.cur_id = 0

    def __iter__(self):
        # Re-create the generator each time so the iterator can be consumed
        # more than once (e.g. across training epochs).
        self.sentences = self.sentence_generator(self.sql_queries, self.args)
        return self

    def __next__(self):
        return next(self.sentences)

    def sentence_generator(self, sql_queries, args):
        '''
        Takes a bunch of sql queries, and generates one sentence at a time,
        using whatever rules have been specified in the args.
        '''
        def find_relevant_attributes():
            # key: table name, value: set of attribute names that appear in
            # a predicate or join of any JOB benchmark query on disk.
            tables = defaultdict(set)
            queries = [JOBQuery(fn) for fn in glob.glob("job/*.sql")]
            for q in queries:
                for a in q.attrs_with_predicate():
                    assert len(a) == 2
                    if "gender=" in a[1]:
                        # known artifact of the query parser; skip it
                        continue
                    if self.args.no_id and "id" in a[1]:
                        continue
                    tables[a[0]].add(a[1])
                for j in q.joins():
                    assert len(j) == 4
                    if "gender=" in j[1] or "gender=" in j[3]:
                        print("gender= parsing error")
                        continue
                    if self.args.no_id and ("id" in j[1] or "id" in j[3]):
                        continue
                    tables[j[0]].add(j[1])
                    tables[j[2]].add(j[3])
            return tables

        def handle_sentence(row, attrs):
            '''
            Convert one result row into a list of sentences, each a list of
            words.  FIXME: handle special cases better.
            '''
            sentences = []
            sentence = []
            if args.synthetic_db_debug:
                # Synthetic-data debugging mode: pair each of the two row
                # values with a word derived from their sum.
                rand_word = str(row[1] + row[2])
                sentences.append([rand_word, row[1]])
                sentences.append([rand_word, row[2]])
                return sentences
            for i, word in enumerate(row):
                if "id" in attrs[i] and args.no_id:
                    continue
                if word is None:
                    continue
                if args.split_words:
                    # Split multi-word values into individual words.  This
                    # doesn't seem right for multi-word things as in
                    # keywords, especially if we need to match LIKE
                    # statements on them...
                    all_words = str(word).split()
                    if len(all_words) > 6:
                        continue
                    for w in all_words:
                        if len(w) > MAX_WORD_LEN:
                            continue
                        if not args.no_preprocess_word:
                            w = preprocess_word(w, exclude_the=args.exclude_the,
                                                exclude_nums=args.exclude_nums)
                        sentence.append(w)
                else:
                    # Stringify BEFORE the length check: row values may be
                    # non-strings (ints, dates), for which len() would raise
                    # a TypeError.  (bugfix)
                    word = str(word)
                    if len(word) > MAX_WORD_LEN:
                        continue
                    if not args.no_preprocess_word:
                        word = preprocess_word(word, exclude_the=args.exclude_the,
                                               exclude_nums=args.exclude_nums)
                    sentence.append(word)
            return [sentence]

        def get_relevant_select(tables, sql):
            # Build the SELECT list: every relevant attribute of every table
            # mentioned in the query, qualified as "table.attr" and joined
            # with commas.  (Replaces the old str(set)-and-replace() hack.)
            sel_attrs = []
            for table in tables:
                if table in sql:
                    sel_attrs.extend(table + "." + attr
                                     for attr in tables[table])
            return ",".join(sel_attrs)

        def get_cursor(query):
            # Named (server-side) cursor so results stream instead of being
            # fetched into client memory all at once.
            conn = pg.connect(host=args.db_host, database=args.db_name)
            try:
                cur = conn.cursor(name="named_cursor_more_efficient")
                print("going to execute: ", query)
                cur.execute(query)
                print("successfully executed query!")
            except Exception:
                # Don't leak the connection when cursor creation or
                # execution fails.
                conn.close()
                raise
            return cur, conn

        if args.relevant_selects:
            tables = find_relevant_attributes()
        print("going to go over sql queries")
        for sql in sql_queries:
            select = "*"
            if args.relevant_selects:
                select = get_relevant_select(tables, sql)
            query = sql.replace("*", select)
            cursor, conn = get_cursor(query)
            try:
                attrs = None
                for row_num, row in enumerate(cursor):
                    if row_num % 1000000 == 0:
                        print("row_num: ", row_num)
                    if attrs is None:
                        # Column names come from the cursor metadata, which
                        # is only populated after execution.
                        attrs = [d[0] for d in cursor.description]
                    if args.train_pairs:
                        assert False, "use same function"
                    else:
                        for cur_sentence in handle_sentence(row, attrs):
                            yield cur_sentence
            finally:
                # Runs even if the consumer abandons the generator early
                # (generator close/GC triggers this finally block), so the
                # cursor and connection are never leaked.
                cursor.close()
                conn.close()