# lm-pcg/evaluate.py at main · gdrtodd/lm-pcg · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
from functools import partial
import json
import os
import shutil

from griddly import GymWrapperFactory
import gym
import hydra
from Levenshtein import distance
import matplotlib.pyplot as plt
from multiprocessing import Pool, get_context
import numpy as np
from PIL import Image
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

from conf.config import Config, EvalConfig
from datasets import GameDataset, AnnotatedSokobanDataset
from utils import BOXOBAN_TO_GRIDDLY_CHARS, GRIDDLY_ACTION_MAPPING, get_run_name, load_train_state, save_gif


def _solution_to_action_string(solution):
    '''
    Encode a solver result as a string of Griddly action ids.

    `solution` is either False (the level is treated as unplayable) or an iterable of steps
    indexable by 'x' and 'y'. Unplayable levels map to the empty string.
    '''
    if solution is False:
        return ""
    return "".join(str(GRIDDLY_ACTION_MAPPING[(step['x'], step['y'])]) for step in solution)


def _generate_samples(model, device, tokenizer, dataset, cfg):
    '''
    Sample `cfg.num_eval_samples` token sequences from the model without tracking gradients.

    Three modes are supported:
      - cfg.sample_contexts: a fresh context is drawn per sample and all of them are generated in one batch
      - sequential sampling: one shared context, one `generate` call per sample
      - otherwise: one shared context, a single `generate` call returning all samples
    '''
    # HACK: to avoid OOM errors on GPU, fall back to sequential sampling for large beam counts.
    # This is kept in a local so that the caller's config object is not modified as a side effect.
    sample_sequential = cfg.sample_sequential or cfg.gen_beams >= 10

    # Keyword arguments shared by every call to `model.generate`
    generate_kwargs = dict(
        max_length=cfg.gen_len,
        temperature=cfg.gen_temp,
        do_sample=True,
        top_k=cfg.gen_top_k,
        top_p=cfg.gen_top_p,
        typical_p=cfg.gen_typical_p,
        num_beams=cfg.gen_beams,
        pad_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        if cfg.sample_contexts:
            # NOTE: torch.stack requires every encoded context to have the same number of tokens
            contexts = torch.stack([tokenizer.encode(tokenizer.bos_token + dataset.gen_context(), return_tensors="pt").to(device) for
                                    _ in range(cfg.num_eval_samples)], axis=0).squeeze(1)
            return model.generate(input_ids=contexts, num_return_sequences=1, **generate_kwargs)

        contexts = tokenizer.encode(tokenizer.bos_token + dataset.gen_context(), return_tensors="pt").to(device)

        if sample_sequential:
            return [model.generate(input_ids=contexts, num_return_sequences=1, **generate_kwargs)[0]
                    for _ in range(cfg.num_eval_samples)]

        return model.generate(input_ids=contexts, num_return_sequences=cfg.num_eval_samples, **generate_kwargs)


def evaluate(model: AutoModelForCausalLM, device, tokenizer: AutoTokenizer, dataset: GameDataset, cfg: EvalConfig,
             num_steps_trained: int, verbose=False, render_dir=None, num_proc=1):
    '''
    Generate levels with the model and score them for playability, novelty, accuracy and diversity.

    The statistics (together with the decoded samples and their solutions) are written as JSON to
    `logs/<run name>/<generation parameters>.json`. If `render_dir` is given, the directory is wiped
    and every sample and its nearest training level is rendered there with `save_gif`.

    Args:
        model: the language model to sample from. It is moved to `device`, put in eval mode for the
            duration of the call and put back in train mode before returning
        device: the torch device on which generation runs
        tokenizer: tokenizer used to encode the generation context
        dataset: provides the generation context, decoding, the solver and the novelty / accuracy /
            diversity measures
        cfg: generation and evaluation settings
        num_steps_trained: recorded verbatim in the saved stats
        verbose: if True, print progress, every sample next to its nearest level, and the summary
        render_dir: directory in which to render the levels (no rendering if falsy)
        num_proc: currently unused -- the number of worker processes is read from `cfg.num_eval_proc`

    Returns:
        The tuple (prop_accurate, prop_playable, prop_novel, diversity)
    '''

    # Map the model to the available device
    model.to(device)

    # Set up for evaluation
    model.eval()

    # Generate samples
    if verbose: print("Generating samples...")
    samples = _generate_samples(model, device, tokenizer, dataset, cfg)

    # Decode samples
    samples = [dataset.decode(sample) for sample in samples]

    if cfg.num_eval_proc == 1:
        if verbose: print("Computing solutions...")
        solutions = [dataset.get_solution(sample, verbose=False, n_search_iters=cfg.n_search_iters) for sample in samples]
        novelties, nearest_lvls, nearest_lvl_sols = zip(*[dataset.is_novel(sample) for sample in samples])
        accuracies, infos = zip(*[dataset.is_accurate(sample, solution, cfg.eval_tolerance) for sample, solution in zip(samples, solutions)])

    else:
        # FIXME: This makes things much slower (at least with num_eval_proc=10 or so -- just multiproc overhead?)
        with get_context("spawn").Pool(cfg.num_eval_proc) as pool:
            get_solution = partial(dataset.get_solution, verbose=False, n_search_iters=cfg.n_search_iters)
            solutions = list(tqdm(pool.imap(get_solution, samples), total=len(samples), desc="Computing solutions"))
            samples_sols = list(zip(samples, solutions, [cfg.eval_tolerance] * len(samples)))
            accuracies, infos = zip(*list(tqdm(pool.imap(dataset.is_accurate_multi, samples_sols), total=len(samples_sols), desc="Computing accuracies")))
            novelties, nearest_lvls, nearest_lvl_sols = zip(*list(tqdm(pool.imap(dataset.is_novel, samples), total=len(samples), desc="Computing novelties")))

    # Convert solutions to strings using the griddly action mapping
    solutions = [_solution_to_action_string(sol) for sol in solutions]
    nearest_lvl_sols = [_solution_to_action_string(sol) for sol in nearest_lvl_sols]

    num_accurate = sum(accuracies)
    num_playable = sum([len(sol) > 0 for sol in solutions])
    num_novel = sum(novelties)

    prop_accurate = num_accurate / len(samples)
    prop_playable = num_playable / len(samples)
    prop_novel = num_novel / len(samples)

    if verbose: print("Computing diversity...")
    num_diverse = dataset.get_diversity(samples)
    diversity = num_diverse / len(samples)

    # Compute the number of levels that are novel, playable, and accurate
    novel_playable_accurate_levels = [level for idx, level in enumerate(samples) if novelties[idx] and len(solutions[idx]) > 0 and accuracies[idx]]
    prop_novel_playable_accurate = len(novel_playable_accurate_levels) / len(samples)
    restricted_diversity = dataset.get_diversity(novel_playable_accurate_levels) / len(samples)

    # Slightly less restricted diversity: only require that the level is playable and novel (not accurate)
    # For non-controlled experiments, this will be the same as above (avoid recomputing diversity).
    if all(accuracies):
        prop_novel_playable = prop_novel_playable_accurate
        less_restricted_diversity = restricted_diversity
    else:
        novel_playable_levels = [level for idx, level in enumerate(samples) if novelties[idx] and len(solutions[idx]) > 0]
        prop_novel_playable = len(novel_playable_levels) / len(samples)
        less_restricted_diversity = dataset.get_diversity(novel_playable_levels) / len(samples)

    if verbose:
        print("GENERATION PARAMETERS:")
        print(f"\tLength: {cfg.gen_len}")
        print(f"\tTemperature: {cfg.gen_temp}")
        print(f"\tTop-k: {cfg.gen_top_k}")
        print(f"\tTop-p: {cfg.gen_top_p}")
        print(f"\tTypical-p: {cfg.gen_typical_p}")
        print(f"\tBeams: {cfg.gen_beams}")

        for idx, sample in enumerate(samples):
            print("_" * 80)

            line_offset = len(sample.split("\n")[0]) - len("SAMPLE") # just for lining things up
            print(f"SAMPLE{' ' * line_offset}\t\t \t\tNEAREST LEVEL")
            for l1, l2 in zip(sample.split("\n"), nearest_lvls[idx].split("\n")):
                print(f"{l1.replace('-', ' ')}\t\t|\t\t{l2.replace('-', ' ')}")


            print(f"\nSample {idx + 1} of {cfg.num_eval_samples}")
            print(f"Playable: {solutions[idx] != ''}" + (f" ({len(solutions[idx])} steps)" if solutions[idx] != "" else ""))
            print(f"Novel: {novelties[idx]}")

            print(f"-Level edit distance: {distance(sample, nearest_lvls[idx])}")
            if solutions[idx]:
                print(f"-Solution edit distance: {distance(solutions[idx], nearest_lvl_sols[idx])}")

            print(f"Accurate: {accuracies[idx]}")
            if cfg.annotation_keys is not None:
                for key in cfg.annotation_keys:
                    print(f"\t{key}: {infos[idx][key]}")

        print("_" * 80)
        print(f"Proportion accurate: {prop_accurate}")
        print(f"Proportion playable: {prop_playable}")
        print(f"Proportion novel: {prop_novel}")
        print(f"Diversity (lower bound): {diversity}")
        if cfg.annotation_keys is not None:
            print(f"\nPropotion novel, playable, and accurate: {prop_novel_playable_accurate}")
            print(f"Diversity (restricted): {restricted_diversity}")
        print(f"\nProprtion novel and playable: {prop_novel_playable}")
        print(f"Diversity (less restricted): {less_restricted_diversity}")

    # Save stats to json
    stats = {
        "num_steps_trained": num_steps_trained,
        "novelty_threshold": dataset.novelty_threshold,
        "prop_accurate": prop_accurate,
        "prop_playable": prop_playable,
        "prop_novel": prop_novel,
        "prop_novel_playable_accurate": prop_novel_playable_accurate,
        # Previously only printed; saved as well so that it can be recovered from the log
        "prop_novel_playable": prop_novel_playable,
        "diversity": diversity,
        "restricted_diversity": restricted_diversity,
        "less_restricted_diversity": less_restricted_diversity,
        "samples": samples,
        "solutions": solutions,
        "accuracies": accuracies,
        "novelties": novelties,
        "infos": infos,
        "n_search_iters": cfg.n_search_iters,
    }

    # Save json to disc
    run_name = get_run_name(cfg)
    eval_filename = f"temp-{cfg.gen_temp}_topk-{cfg.gen_top_k}_topp-{cfg.gen_top_p}_typicalp-{cfg.gen_typical_p}_beams-{cfg.gen_beams}_threshold-{dataset.novelty_threshold}.json"
    stats_path = os.path.join('logs', run_name, eval_filename)
    # Make sure the run's log directory exists so that a fresh working directory does not crash the save
    os.makedirs(os.path.dirname(stats_path), exist_ok=True)
    with open(stats_path, "w") as f:
        json.dump(stats, f, indent=4)

    # Render generated levels and animate solutions if applicable.
    if render_dir:
        if os.path.isdir(render_dir):
            # Delete old renderings
            shutil.rmtree(render_dir)
        os.makedirs(render_dir)

        # Strip the annotation prompt (one line per annotation key) from the start of each sample
        if cfg.annotation_keys is not None:
            n_annotation_keys = len(cfg.annotation_keys)
            samples = ['\n'.join(sample.split("\n")[n_annotation_keys:]) for sample in samples]

        trans_table = {ord(k): v for k, v in BOXOBAN_TO_GRIDDLY_CHARS.items()}
        wrapper = GymWrapperFactory()
        wrapper.build_gym_from_yaml('sokoban', os.path.join('gdy_games', 'sokoban.yaml'))
        env = gym.make('GDY-sokoban-v0')
        for i, (sample, sol) in enumerate(zip(samples, solutions)):

            lvl = sample.translate(trans_table)
            lvl_render_dir = os.path.join(render_dir, f"lvl_{i}_{len(sol)}-sol")
            save_gif(env, lvl, sol, lvl_render_dir)

            nearest_lvl = nearest_lvls[i].translate(trans_table)
            nearest_lvl_sol = nearest_lvl_sols[i]
            lvl_render_dir = os.path.join(render_dir, f"lvl_{i}_nearest_train")
            save_gif(env, nearest_lvl, sol=nearest_lvl_sol, lvl_render_dir=lvl_render_dir)

        env.close()

    # Hand the model back in training mode
    model.train()

    return prop_accurate, prop_playable, prop_novel, diversity


def eval_controllability(model: AutoModelForCausalLM, device, tokenizer: AutoTokenizer, dataset: GameDataset, args: Config):
    '''
    Evaluate the controllability of the specified model by generating a number of levels for a variety of sample conditions,
    and generating a confusion matrix based on the results.

    For each of 10 target annotation values, `args.num_eval_samples` levels are generated from a prompt built
    from that target. Each level is then binned by the value it actually exhibits (or as "unplayable" if no
    solution is found). The resulting 11x10 matrix is saved to `./results/` as a heatmap (.png) and as a raw
    array (.npy).

    Only `args.annotation_keys == ["solution_len"]` and `["prop_empty"]` are supported; anything else raises
    NotImplementedError.
    '''

    model.to(device)

    assert args.annotation_keys is not None, "Must specify annotation keys to evaluate controllability"

    # Initialize confusion matrix. Each row represents a generated annotation bin, and each column represents a target
    # annotation bin. We have an extra row for the "unplayable" bin
    confusion_matrix = np.zeros((11, 10))
    bottom_row_idx = confusion_matrix.shape[0] - 1

    if args.annotation_keys == ["solution_len"]:
        by_solution_len = True
        name = "Solution Length"
        targets, width = list(range(5, 100, 10)), 10                                # middle of each of 10 bins from 0 to 100

    elif args.annotation_keys == ["prop_empty"]:
        by_solution_len = False
        name = "Emptiness"
        # Middle of each of 10 bins from 0 to 1. (range(0, 1, 10) only ever yields [0], so build the
        # fractional targets explicitly; rounding keeps them free of float noise like 0.15000000000000002)
        targets, width = [round(0.05 + 0.1 * i, 2) for i in range(10)], 0.1

    else:
        raise NotImplementedError

    contexts = [dataset._format_annotation([target]) for target in targets]

    # Sample in eval mode, then restore whatever mode the caller had the model in
    was_training = model.training
    model.eval()

    with torch.no_grad():
        for context_idx, context in tqdm(enumerate(contexts), total=len(contexts), desc="Determining controllability"):
            # Condition the generation on this column's target annotation rather than on a randomly drawn context.
            # NOTE(review): assumes `_format_annotation` returns a prompt string in the same form as
            # `dataset.gen_context()` -- confirm against the dataset implementation.
            samples = model.generate(
                tokenizer.encode(tokenizer.bos_token + context, return_tensors="pt").to(device),
                max_length=args.gen_len,
                temperature=args.gen_temp,
                do_sample=True,
                top_k=args.gen_top_k,
                top_p=args.gen_top_p,
                typical_p=args.gen_typical_p,
                num_beams=args.gen_beams,
                num_return_sequences=args.num_eval_samples,
                pad_token_id=tokenizer.eos_token_id,
            )

            samples = [dataset.decode(sample) for sample in samples]

            if args.num_eval_proc == 1:
                solutions = [dataset.get_solution(sample, verbose=False) for sample in tqdm(samples, total=len(samples), desc="Computing solutions",
                                                                                            leave=False)]

            else:
                # FIXME: This makes things much slower (at least with num_eval_proc=10 or so -- just multiproc overhead?)
                with get_context("spawn").Pool(args.num_eval_proc) as pool:
                    get_solution = partial(dataset.get_solution, verbose=False)
                    solutions = list(tqdm(pool.imap(get_solution, samples), total=len(samples), desc="Computing solutions", leave=False))

            solutions = ["" if sol is False else "".join([str(GRIDDLY_ACTION_MAPPING[(step['x'], step['y'])]) for step in sol]) for sol in solutions]

            for sample, solution in zip(samples, solutions):
                if len(solution) == 0:
                    observed_idx = bottom_row_idx # bottom row is for unplayable levels

                else:
                    observed = len(solution) if by_solution_len else dataset.get_emptiness(sample)

                    # Rows run from the highest bin (row 0) down to the lowest (row just above "unplayable").
                    # The small epsilon keeps float bin edges (e.g. 0.3 / 0.1 == 2.9999...) in the right bin,
                    # and values past the last bin are clamped into row 0.
                    observed_idx = max(bottom_row_idx - int(observed / width + 1e-9) - 1, 0)

                confusion_matrix[observed_idx, context_idx] += 1

    model.train(was_training)

    # Generate the heatmap
    fig, ax = plt.subplots()
    im = ax.imshow(confusion_matrix)

    if by_solution_len:
        limits = [(int(target-(width/2)+1), int(target+(width/2))) for target in targets]

    else:
        # Fractional bins: truncating to int would collapse every label, so round instead
        limits = [(round(target - (width / 2), 2), round(target + (width / 2), 2)) for target in targets]

    x_labels = [f"{lower}-{upper}" for lower, upper in limits]
    y_labels = [f"{lower}-{upper}" for lower, upper in reversed(limits)] + ["Unplayable"]

    # Show ticks
    ax.set_xticks(np.arange(len(x_labels)), labels=x_labels)
    ax.set_yticks(np.arange(len(y_labels)), labels=y_labels)

    # Rotate the x-tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    ax.set_title(f"Controllability Confusion Matrix for {name}")
    ax.set_xlabel(f"Target {name}")
    ax.set_ylabel(f"Actual {name}")

    fig.tight_layout()

    # Make sure the output directory exists before writing into it
    os.makedirs("./results", exist_ok=True)

    run_name = f"{args.model}_temp-{args.gen_temp}_topp-{args.gen_top_p}_beams-{args.gen_beams}_seed-{args.seed}_{'+'.join(args.annotation_keys)}"
    plt.savefig(f"./results/{run_name}_controllability_heatmap.png")
    np.save(f"./results/{run_name}_controllability_confusion.npy", confusion_matrix)


@hydra.main(version_base="1.2.0", config_path="conf", config_name="eval")
def main(args: Config):
    '''
    Script entry point: restore a trained model from its run directory, build the matching tokenizer and
    dataset, and run either the controllability evaluation or the standard evaluation.
    '''
    run_name = get_run_name(args)
    output_dir = f"./logs/{run_name}"

    model, _, global_step = load_train_state(output_dir, lora=args.lora)

    # Short model names accepted in the config -> identifiers used to load the tokenizer
    model_mapping = {
        "gpt2": "gpt2",
        "gpt2-untrained": "gpt2-untrained",
        "codeparrot": "lvwerra/codeparrot",
        "java-gpt2": "microsoft/CodeGPT-small-java-adaptedGPT2",
        "incoder-1B": "facebook/incoder-1B",
        "incoder-6B": "facebook/incoder-6B",
    }
    model_name = model_mapping[args.model]

    if args.model != "gpt2-untrained":
        # Pretrained models: use their own tokenizer, extended with padding / start tokens
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        tokenizer.add_special_tokens({"pad_token": "PAD",
                                      "bos_token": "START"})

    else:
        # The untrained model relies on a custom tokenizer that must already be cached on disk
        tokenizer_dir = os.path.join("./caches", "gpt2-custom-tokenizer", args.game)
        required_files = ("vocab.json", "merges.txt")

        if not all(os.path.exists(os.path.join(tokenizer_dir, filename)) for filename in required_files):
            exit("No custom tokenizer found for gpt2-untrained. Please run train_lm.py first.")

        print(f"Loading tokenizer from cache at {tokenizer_dir}...")
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

        tokenizer.add_special_tokens({"pad_token": "<pad>",
                                      "bos_token": "<s>",
                                      "eos_token": "</s>"})

    # Only Sokoban is supported at the moment
    if args.game != "sokoban":
        raise NotImplementedError

    dataset = AnnotatedSokobanDataset(
        args.source,
        tokenizer,
        args.model,
        level_key=args.level_key,
        annotation_keys=args.annotation_keys,
        num_annotation_buckets=args.num_annotation_buckets,
        holdout_solution_lens=args.holdout_solution_lens,
        split="train",
        novelty_threshold=args.novelty_threshold,
        sample_prop=args.sample_prop,
        chunk_size=args.chunk_size,
        seed=args.seed,
        cfg=args,
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Renders (if requested) go next to the run's logs
    render_dir = os.path.join(output_dir, 'renders') if args.render else None

    if args.eval_controllability:
        eval_controllability(model, device, tokenizer, dataset, args)
    else:
        evaluate(model, device, tokenizer, dataset, args, verbose=True, num_steps_trained=global_step,
                 render_dir=render_dir, num_proc=args.num_eval_proc)

    # SIGSEGV ?? ... griddly?

# Run the evaluation only when this file is executed as a script (not when it is imported).
if __name__ == "__main__":
    main()