
Commit 828491f ("squad init")
1 parent: cefdb80

14 files changed: 524 additions & 11 deletions

examples/t5/BoolQ.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 1.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/CB.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 1.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/COPA.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 1.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/RTE.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 10.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/SQuAD.sh

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+#! /bin/bash
+
+MASTER_ADDR=localhost
+MASTER_PORT=12345
+NNODES=1
+NODE_RANK=0
+GPUS_PER_NODE=2
+
+DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \
+                  --nnodes $NNODES \
+                  --node_rank $NODE_RANK \
+                  --master_addr $MASTER_ADDR \
+                  --master_port $MASTER_PORT"
+
+BASE_PATH="/data/ModelCenter"
+VERSION="3b"
+DATASET="SQuAD"
+
+OPTS=""
+OPTS+=" --dataset ${DATASET}"
+OPTS+=" --base-path ${BASE_PATH}"
+OPTS+=" --model-config ${BASE_PATH}/results/t5-${VERSION}"
+OPTS+=" --batch-size 16"
+OPTS+=" --train-iters 1400"
+OPTS+=" --save-iters 1000"
+OPTS+=" --max-encoder-length 512"
+OPTS+=" --max-decoder-length 32"
+OPTS+=" --save ${BASE_PATH}/results"
+OPTS+=" --save-name finetune-t5-ckpt"
+OPTS+=" --lr 0.00001"
+OPTS+=" --inspect-iters 100"
+OPTS+=" --warmup-iters 140"
+OPTS+=" --lr-decay-style constant"
+OPTS+=" --weight-decay 1e-2"
+OPTS+=" --clip-grad 1.0"
+OPTS+=" --loss-scale 128"
+
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_squad.py ${OPTS}"
+echo ${CMD}
+
+${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_squad/finetune-t5-${VERSION}-${DATASET}.log
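The new launch script follows the same pattern as the SuperGLUE scripts above: a single-node, 2-GPU torch.distributed.launch run, but pointing at the new finetune_t5_squad.py, with 512-token encoder inputs, 32-token generative decoder targets, and a constant learning-rate schedule after 140 warmup iterations. One practical note: the log is piped through tee into ${BASE_PATH}/logs/t5_squad/, and tee does not create missing directories, so that folder has to exist before the script is run.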

examples/t5/WSC.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 1.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/WiC.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ OPTS+=" --clip-grad 1.0"
 OPTS+=" --loss-scale 128"
 # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt"
 
-CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5.py ${OPTS}"
+CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}"
 echo ${CMD}
 
 ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log

examples/t5/finetune_t5_squad.py

Lines changed: 202 additions & 0 deletions
@@ -0,0 +1,202 @@
+import time
+import random
+import os
+import csv
+
+import torch
+import numpy as np
+from squad_metric import squad_metric
+
+import bmtrain as bmt
+
+from model_center import get_args
+from model_center.model import T5
+from model_center.generation.t5 import T5BeamSearch
+from model_center.tokenizer import T5Tokenizer
+from model_center.dataset.t5dataset import DATASET
+from model_center.utils import print_inspect
+from model_center.dataset import DistributedDataLoader
+from torch.utils.data import DataLoader
+
+
+def get_tokenizer(args):
+    tokenizer = T5Tokenizer.from_pretrained(args.model_config)
+    return tokenizer
+
+def get_model(args):
+    model = T5.from_pretrained(args.model_config)
+    return model
+
+def get_optimizer(args, model):
+    optimizer = bmt.optim.AdamOffloadOptimizer(model.parameters(), weight_decay=args.weight_decay)
+    return optimizer
+
+def get_learning_rate_scheduler(args, optimizer):
+    if args.lr_decay_iters is None:
+        args.lr_decay_iters = args.train_iters * args.epochs
+    if args.lr_decay_style == "noam":
+        lr_scheduler = bmt.lr_scheduler.Noam(optimizer,
+                                             start_lr = args.lr,
+                                             warmup_iter = args.warmup_iters,
+                                             end_iter = args.lr_decay_iters,
+                                             num_iter = args.start_step)
+    elif args.lr_decay_style == "constant":
+        lr_scheduler = bmt.lr_scheduler.NoDecay(optimizer,
+                                                start_lr = args.lr,
+                                                warmup_iter = args.warmup_iters,
+                                                end_iter = -1,
+                                                num_iter = args.start_step)
+    elif args.lr_decay_style == "linear":
+        lr_scheduler = bmt.lr_scheduler.Linear(optimizer,
+                                               start_lr = args.lr,
+                                               warmup_iter = args.warmup_iters,
+                                               end_iter = args.lr_decay_iters,
+                                               num_iter = args.start_step)
+    elif args.lr_decay_style == "exponential":
+        lr_scheduler = bmt.lr_scheduler.Exponential(optimizer,
+                                                    start_lr = args.lr,
+                                                    warmup_iter = args.warmup_iters,
+                                                    end_iter = args.lr_decay_iters,
+                                                    num_iter = args.start_step)
+    elif args.lr_decay_style == "cosine":
+        lr_scheduler = bmt.lr_scheduler.Cosine(optimizer,
+                                               start_lr = args.lr,
+                                               warmup_iter = args.warmup_iters,
+                                               end_iter = args.lr_decay_iters,
+                                               num_iter = args.start_step)
+    else:
+        raise ValueError(f"lr_scheduler of type {args.lr_decay_style} is not supported yet.")
+
+    return lr_scheduler
+
+def setup_model_and_optimizer(args):
+    # get the tokenizer
+    tokenizer = get_tokenizer(args)
+    # get the model
+    model = get_model(args)
+    bmt.synchronize()
+    # get the optimizer and lr_scheduler
+    optimizer = get_optimizer(args, model)
+    lr_scheduler = get_learning_rate_scheduler(args, optimizer)
+    bmt.synchronize()
+    # get the memory usage
+    bmt.print_rank("Model mem\n", torch.cuda.memory_summary())
+    bmt.synchronize()
+    return tokenizer, model, optimizer, lr_scheduler
+
+def initialize():
+    # get arguments
+    args = get_args()
+    # init bmt
+    bmt.init_distributed(seed = args.seed)
+    # init save folder
+    if args.save != None:
+        os.makedirs(args.save, exist_ok=True)
+    return args
+
+
+def prepare_dataset(args, tokenizer, base_path, dataset_name):
+    splits = ['train', 'dev', 'test']
+    dataset = {}
+    for split in splits:
+        dataset[split] = DATASET[dataset_name](base_path, split, tokenizer, args.max_encoder_length, args.max_decoder_length)
+    return dataset
+
+def collate_fn(data):
+    # data: a list of tuples with (input, target)
+    return {
+        "inputs" : [d['inputs'] for d in data],
+        "targets": [d['targets'] for d in data],
+    }
+
+def finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset):
+    loss_func = bmt.loss.FusedCrossEntropy(ignore_index=-100)
+
+    optim_manager = bmt.optim.OptimManager(loss_scale=args.loss_scale, loss_scale_steps=100)
+    optim_manager.add_optimizer(optimizer, lr_scheduler)
+
+    # print_inspect(model, '*')
+
+    for epoch in range(20):
+        dataloader = {
+            "train": DistributedDataLoader(dataset['train'], batch_size=args.batch_size, shuffle=True),
+            "dev": DataLoader(dataset['dev'], batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn),
+        }
+
+        model.train()
+        for it, data in enumerate(dataloader['train']):
+            logits = model(
+                input_ids = data['input_ids'],
+                attention_mask = data['attention_mask'],
+                decoder_input_ids = data['decoder_input_ids'],
+                decoder_attention_mask = data['decoder_attention_mask'],
+            ).logits
+            targets = data["targets"]
+
+            loss = loss_func(logits.view(-1, logits.shape[-1]), targets.view(-1))
+            global_loss = bmt.sum_loss(loss).item()
+
+            optim_manager.zero_grad()
+
+            optim_manager.backward(loss)
+            grad_norm = optim_manager.clip_grad_norm(optimizer.param_groups, args.clip_grad, norm_type = 2)
+
+            optim_manager.step()
+
+            bmt.print_rank(
+                "train | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f} | lr: {:.4e}, scale: {:10.4f} | grad_norm: {:.4f} |".format(
+                    epoch,
+                    it,
+                    len(dataloader["train"]),
+                    global_loss,
+                    lr_scheduler.current_lr,
+                    int(optim_manager.loss_scale),
+                    grad_norm,
+                )
+            )
+            # if it % args.inspect_iters == 0: print_inspect(model, "*")
+            # if args.save != None and it % args.save_iters == 0:
+            #     bmt.save(model, os.path.join(args.save, args.save_name+("-%d.pt" % it)))
+
+        model.eval()
+        beam_search = T5BeamSearch(
+            model=model,
+            tokenizer=tokenizer,
+        )
+        with torch.no_grad():
+            for split in ['dev']:
+                pd = []
+                gt = []
+                for it, data in enumerate(dataloader[split]):
+                    preds = beam_search.generate(data['inputs'], max_length=args.max_decoder_length)
+                    targets = data["targets"]
+
+                    pd.extend(preds)
+                    gt.extend(targets)
+
+                    bmt.print_rank(
+                        "{} | epoch {:3d} | Iter: {:6d}/{:6d} |".format(
+                            split,
+                            epoch,
+                            it,
+                            len(dataloader[split]),
+                        )
+                    )
+
+                metrics = squad_metric(pd, gt, None)
+                bmt.print_rank(f"metrics: {metrics}")
+
+
+def main():
+    args = initialize()
+    tokenizer, model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
+    dataset = prepare_dataset(
+        args,
+        tokenizer,
+        f"{args.base_path}/down_data/squad/",
+        args.dataset_name,
+    )
+    finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset)
+
+if __name__ == "__main__":
+    main()
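finetune_t5_squad.py imports squad_metric from a local squad_metric.py that is not shown in the diff above. As a rough guide to what that call is expected to return, here is a minimal sketch of a SQuAD-style metric (exact match and token-level F1 over normalized answer strings). The helper names normalize_answer and f1_score, the single-reference assumption, and the ignored third argument are illustrative assumptions, not necessarily what the committed file does.

# Minimal sketch of a SQuAD-style metric, assuming squad_metric(preds, refs, _)
# returns corpus-level exact match and F1. This is NOT the committed
# squad_metric.py; it only illustrates the expected interface.
import re
import string
from collections import Counter

def normalize_answer(s):
    # Lower-case, drop punctuation and articles, collapse whitespace.
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())

def f1_score(prediction, ground_truth):
    # Token-level F1 between one prediction and one reference.
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def squad_metric(predictions, references, _unused=None):
    # One gold answer per example is assumed here; the official SQuAD script
    # takes the max over multiple gold answers.
    em, f1 = 0.0, 0.0
    for pred, ref in zip(predictions, references):
        em += float(normalize_answer(pred) == normalize_answer(ref))
        f1 += f1_score(pred, ref)
    n = max(len(predictions), 1)
    return {"exact_match": 100.0 * em / n, "f1": 100.0 * f1 / n}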
