@@ -16,17 +16,16 @@ import (
1616 "github.com/prometheus/client_golang/prometheus"
1717 "github.com/prometheus/client_golang/prometheus/promauto"
1818 "github.com/prometheus/common/model"
19- "github.com/prometheus/prometheus/pkg/gate"
2019 "github.com/prometheus/prometheus/pkg/labels"
2120 "github.com/prometheus/prometheus/storage"
2221 "github.com/prometheus/prometheus/tsdb"
23- tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
2422 "github.com/thanos-io/thanos/pkg/block/metadata"
2523 "github.com/thanos-io/thanos/pkg/objstore"
2624 "github.com/thanos-io/thanos/pkg/shipper"
2725 "github.com/weaveworks/common/httpgrpc"
2826 "github.com/weaveworks/common/user"
2927 "go.uber.org/atomic"
28+ "golang.org/x/sync/errgroup"
3029
3130 "github.com/cortexproject/cortex/pkg/ingester/client"
3231 "github.com/cortexproject/cortex/pkg/ring"
@@ -1143,98 +1142,100 @@ func (i *Ingester) closeAllTSDB() {
11431142// concurrently opening TSDB.
11441143func (i * Ingester ) openExistingTSDB (ctx context.Context ) error {
11451144 level .Info (util .Logger ).Log ("msg" , "opening existing TSDBs" )
1146- wg := & sync.WaitGroup {}
1147- openGate := gate .New (i .cfg .BlocksStorageConfig .TSDB .MaxTSDBOpeningConcurrencyOnStartup )
11481145
1149- // Keep track of all errors that could occur.
1150- errs := tsdb_errors.MultiError {}
1151- errsMx := sync.Mutex {}
1146+ queue := make (chan string )
1147+ group , groupCtx := errgroup .WithContext (ctx )
11521148
1153- walkErr := filepath .Walk (i .cfg .BlocksStorageConfig .TSDB .Dir , func (path string , info os.FileInfo , err error ) error {
1154- if err != nil {
1155- // If the root directory doesn't exist, we're OK (not needed to be created upfront).
1156- if os .IsNotExist (err ) && path == i .cfg .BlocksStorageConfig .TSDB .Dir {
1157- return filepath .SkipDir
1158- }
1149+ // Create a pool of workers which will open existing TSDBs.
1150+ for n := 0 ; n < i .cfg .BlocksStorageConfig .TSDB .MaxTSDBOpeningConcurrencyOnStartup ; n ++ {
1151+ group .Go (func () error {
1152+ for userID := range queue {
1153+ startTime := time .Now ()
11591154
1160- level .Error (util .Logger ).Log ("msg" , "an error occurred while iterating the filesystem storing TSDBs" , "path" , path , "err" , err )
1161- return errors .Wrapf (err , "an error occurred while iterating the filesystem storing TSDBs at %s" , path )
1162- }
1155+ db , err := i .createTSDB (userID )
1156+ if err != nil {
1157+ level .Error (util .Logger ).Log ("msg" , "unable to open TSDB" , "err" , err , "user" , userID )
1158+ return errors .Wrapf (err , "unable to open TSDB for user %s" , userID )
1159+ }
1160+
1161+ // Add the database to the map of user databases
1162+ i .userStatesMtx .Lock ()
1163+ i .TSDBState .dbs [userID ] = db
1164+ i .userStatesMtx .Unlock ()
1165+ i .metrics .memUsers .Inc ()
1166+
1167+ i .TSDBState .walReplayTime .Observe (time .Since (startTime ).Seconds ())
1168+ }
11631169
1164- // Skip root dir and all other files
1165- if path == i .cfg .BlocksStorageConfig .TSDB .Dir || ! info .IsDir () {
11661170 return nil
1167- }
1171+ })
1172+ }
11681173
1169- // Top level directories are assumed to be user TSDBs
1170- userID := info .Name ()
1171- f , err := os .Open (path )
1172- if err != nil {
1173- level .Error (util .Logger ).Log ("msg" , "unable to open TSDB dir" , "err" , err , "user" , userID , "path" , path )
1174- return errors .Wrapf (err , "unable to open TSDB dir %s for user %s" , path , userID )
1175- }
1176- defer f .Close ()
1174+ // Spawn a goroutine to find all users with a TSDB on the filesystem.
1175+ group .Go (func () error {
1176+ // Close the queue once filesystem walking is done.
1177+ defer close (queue )
1178+
1179+ walkErr := filepath .Walk (i .cfg .BlocksStorageConfig .TSDB .Dir , func (path string , info os.FileInfo , err error ) error {
1180+ if err != nil {
1181+ // If the root directory doesn't exist, we're OK (not needed to be created upfront).
1182+ if os .IsNotExist (err ) && path == i .cfg .BlocksStorageConfig .TSDB .Dir {
1183+ return filepath .SkipDir
1184+ }
11771185
1178- // If the dir is empty skip it
1179- if _ , err := f .Readdirnames (1 ); err != nil {
1180- if err == io .EOF {
1181- return filepath .SkipDir
1186+ level .Error (util .Logger ).Log ("msg" , "an error occurred while iterating the filesystem storing TSDBs" , "path" , path , "err" , err )
1187+ return errors .Wrapf (err , "an error occurred while iterating the filesystem storing TSDBs at %s" , path )
11821188 }
11831189
1184- level .Error (util .Logger ).Log ("msg" , "unable to read TSDB dir" , "err" , err , "user" , userID , "path" , path )
1185- return errors .Wrapf (err , "unable to read TSDB dir %s for user %s" , path , userID )
1186- }
1190+ // Skip root dir and all other files
1191+ if path == i .cfg .BlocksStorageConfig .TSDB .Dir || ! info .IsDir () {
1192+ return nil
1193+ }
11871194
1188- // Limit the number of TSDB's opening concurrently. Start blocks until there's a free spot available or the context is cancelled.
1189- if err := openGate .Start (ctx ); err != nil {
1190- return err
1191- }
1195+ // Top level directories are assumed to be user TSDBs
1196+ userID := info .Name ()
1197+ f , err := os .Open (path )
1198+ if err != nil {
1199+ level .Error (util .Logger ).Log ("msg" , "unable to open TSDB dir" , "err" , err , "user" , userID , "path" , path )
1200+ return errors .Wrapf (err , "unable to open TSDB dir %s for user %s" , path , userID )
1201+ }
1202+ defer f .Close ()
11921203
1193- wg .Add (1 )
1194- go func (userID string ) {
1195- defer wg .Done ()
1196- defer openGate .Done ()
1197- defer func (ts time.Time ) {
1198- i .TSDBState .walReplayTime .Observe (time .Since (ts ).Seconds ())
1199- }(time .Now ())
1204+ // If the dir is empty skip it
1205+ if _ , err := f .Readdirnames (1 ); err != nil {
1206+ if err == io .EOF {
1207+ return filepath .SkipDir
1208+ }
12001209
1201- db , err := i .createTSDB (userID )
1202- if err != nil {
1203- errsMx .Lock ()
1204- errs .Add (errors .Wrapf (err , "unable to open TSDB for user %s" , userID ))
1205- errsMx .Unlock ()
1210+ level .Error (util .Logger ).Log ("msg" , "unable to read TSDB dir" , "err" , err , "user" , userID , "path" , path )
1211+ return errors .Wrapf (err , "unable to read TSDB dir %s for user %s" , path , userID )
1212+ }
12061213
1207- level .Error (util .Logger ).Log ("msg" , "unable to open TSDB" , "err" , err , "user" , userID )
1208- return
1214+ // Enqueue the user to be processed.
1215+ select {
1216+ case queue <- userID :
1217+ // Nothing to do.
1218+ case <- groupCtx .Done ():
1219+ // Interrupt in case a failure occurred in another goroutine.
1220+ return nil
12091221 }
12101222
1211- // Add the database to the map of user databases
1212- i .userStatesMtx .Lock ()
1213- i .TSDBState .dbs [userID ] = db
1214- i .userStatesMtx .Unlock ()
1215- i .metrics .memUsers .Inc ()
1216- }(userID )
1223+ // Don't descend into subdirectories.
1224+ return filepath .SkipDir
1225+ })
12171226
1218- return filepath . SkipDir // Don't descend into directories
1227+ return errors . Wrapf ( walkErr , "unable to walk directory %s containing existing TSDBs" , i . cfg . BlocksStorageConfig . TSDB . Dir )
12191228 })
12201229
1221- if walkErr != nil {
1222- errsMx .Lock ()
1223- errs .Add (errors .Wrapf (walkErr , "unable to walk directory %s containing existing TSDBs" , i .cfg .BlocksStorageConfig .TSDB .Dir ))
1224- errsMx .Unlock ()
1225- }
1226-
1227- // Wait for all opening routines to finish
1228- wg .Wait ()
1229-
1230- // Ensure no error occurred.
1231- if errs .Err () == nil {
1232- level .Info (util .Logger ).Log ("msg" , "successfully opened existing TSDBs" )
1233- return nil
1230+ // Wait for all workers to complete.
1231+ err := group .Wait ()
1232+ if err != nil {
1233+ level .Error (util .Logger ).Log ("msg" , "error while opening existing TSDBs" , "err" , err )
1234+ return err
12341235 }
12351236
1236- level .Error (util .Logger ).Log ("msg" , "error while opening existing TSDBs" , "err" , errs . Error () )
1237- return errs . Err ()
1237+ level .Info (util .Logger ).Log ("msg" , "successfully opened existing TSDBs" )
1238+ return nil
12381239}
12391240
12401241// numSeriesInTSDB returns the total number of in-memory series across all open TSDBs.
0 commit comments