3636import java .time .Instant ;
3737import java .util .ArrayList ;
3838import java .util .Collections ;
39+ import java .util .Comparator ;
3940import java .util .HashSet ;
4041import java .util .List ;
4142import java .util .Set ;
@@ -72,6 +73,8 @@ public class DwhFiles {
7273
7374 private static final String INCREMENTAL_DIR = "incremental_run" ;
7475
76+ static final String TIMESTAMP_PREFIX = "_TIMESTAMP_" ;
77+
7578 // TODO: It is probably better if we build all DWH files related operations using Beam's
7679 // filesystem API such that when a new filesystem is registered, it automatically works
7780 // everywhere in our code. Note that currently we have hardcoded the valid schema in some places,
@@ -121,6 +124,10 @@ static DwhFiles forRoot(String dwhRoot, FhirContext fhirContext) {
121124 return new DwhFiles (dwhRoot , fhirContext );
122125 }
123126
127+ public static String safeTimestampSuffix () {
128+ return Instant .now ().toString ().replace (":" , "-" ).replace ("-" , "_" ).replace ("." , "_" );
129+ }
130+
124131 public String getRoot () {
125132 return dwhRoot ;
126133 }
@@ -145,50 +152,76 @@ public String getFilePattern(String resourceType) {
145152 "%s*%s" , getResourcePath (resourceType ).toString (), ParquetUtil .PARQUET_EXTENSION );
146153 }
147154
155+ // TODO: Move this to a util class and make it non-static.
148156 /**
149- * This returns the default incremental run path; each incremental run is relative to a full path,
150- * hence we put this directory under the full-run root.
157+ * Returns all the child directories under the given base directory which are 1-level deep. Note
158+ * in many cloud/distributed file-systems, we do not have "directories"; there are only buckets
159+ * and files in those buckets. We use file-separators (e.g., `/`) to simulate the concept of
160+ * directories. So for example, this method returns an empty set if `baseDir` is `bucket/test` and
161+ * the only file in that bucket is `bucket/test/dir1/dir2/file.txt`. If `baseDir` is
162+ * `bucket/test/dir1`, in the above example, `dir2` is returned.
151163 *
152- * @return the default incremental run path
164+ * @param baseDir the path under which "directories" are looked for.
165+ * @return the set of all child directories under the base directory
166+ * @throws IOException if matching files under the base directory results in an error
153167 */
154- public ResourceId getIncrementalRunPath () {
155- return FileSystems .matchNewResource (getRoot (), true )
156- .resolve (INCREMENTAL_DIR , StandardResolveOptions .RESOLVE_DIRECTORY );
157- }
158-
159- /** This is used when we want to keep a backup of the old incremental run output. */
160- public ResourceId getIncrementalRunPathWithTimestamp () {
161- return FileSystems .matchNewResource (getRoot (), true )
162- .resolve (
163- String .format ("%s_old_%d" , INCREMENTAL_DIR , System .currentTimeMillis ()),
164- StandardResolveOptions .RESOLVE_DIRECTORY );
168+ static Set <ResourceId > getAllChildDirectories (String baseDir ) throws IOException {
169+ String fileSeparator = getFileSeparatorForDwhFiles (baseDir );
170+ // Avoid using ResourceId.resolve(..) method to resolve the files when the path contains glob
171+ // expressions with multiple special characters like **, */* etc as this api only supports
172+ // single special characters like `*` or `..`. Rather use the FileSystems.match(..) if the path
173+ // contains glob expressions.
174+ List <MatchResult > matchResultList =
175+ FileSystems .match (
176+ List .of (
177+ getPathEndingWithFileSeparator (baseDir , fileSeparator )
178+ + "*"
179+ + fileSeparator
180+ + "*" ));
181+ Set <ResourceId > childDirectories = new HashSet <>();
182+ for (MatchResult matchResult : matchResultList ) {
183+ if (matchResult .status () == Status .OK && !matchResult .metadata ().isEmpty ()) {
184+ for (Metadata metadata : matchResult .metadata ()) {
185+ childDirectories .add (metadata .resourceId ().getCurrentDirectory ());
186+ }
187+ } else if (matchResult .status () == Status .ERROR ) {
188+ String errorMessage = String .format ("Error matching files under directory %s" , baseDir );
189+ log .error (errorMessage );
190+ throw new IOException (errorMessage );
191+ }
192+ }
193+ log .info ("Child directories of {} are {}" , baseDir , childDirectories );
194+ return childDirectories ;
165195 }
166196
167197 /**
168- * Similar to {@link #getIncrementalRunPath} but also checks if that directory exists and if so,
169- * moves it to {@link #getIncrementalRunPathWithTimestamp()}.
198+ * Also see {@link #newIncrementalRunPath()}
170199 *
171- * @return same as {@link #getIncrementalRunPath()}
172- * @throws IOException if the directory move fails
200+ * @return the current incremental run path if one found; null otherwise.
173201 */
174- public ResourceId newIncrementalRunPath () throws IOException {
175- ResourceId incPath = getIncrementalRunPath ();
176- if (hasIncrementalDir ()) {
177- ResourceId movePath = getIncrementalRunPathWithTimestamp ();
178- log .info ("Moving the old {} directory to {}" , INCREMENTAL_DIR , movePath );
179- FileSystems .rename (Collections .singletonList (incPath ), Collections .singletonList (movePath ));
180- }
181- return incPath ;
202+ @ Nullable
203+ public ResourceId getLatestIncrementalRunPath () throws IOException {
204+ List <ResourceId > dirs =
205+ getAllChildDirectories (getRoot ()).stream ()
206+ .filter (dir -> dir .getFilename ().contains (INCREMENTAL_DIR + TIMESTAMP_PREFIX ))
207+ .collect (Collectors .toList ());
208+ if (dirs .isEmpty ()) return null ;
209+
210+ Collections .sort (dirs , Comparator .comparing (ResourceId ::toString ));
211+ return dirs .get (dirs .size () - 1 );
182212 }
183213
184214 /**
185- * @return true iff there is already an incremental run subdirectory in this DWH.
215+ * This returns a new incremental-run path based on the current timestamp. Note that each
216+ * incremental-run is relative to a full-run, hence we put this directory under the full-run root.
217+ *
218+ * @return a new incremental run path based on the current timestamp.
186219 */
187- public boolean hasIncrementalDir () throws IOException {
188- List < MatchResult > matches =
189- FileSystems . matchResources ( Collections . singletonList ( getIncrementalRunPath ()));
190- MatchResult matchResult = Iterables . getOnlyElement ( matches );
191- return matchResult . status () == Status . OK ;
220+ public ResourceId newIncrementalRunPath () {
221+ return FileSystems . matchNewResource ( getRoot (), true )
222+ . resolve (
223+ String . format ( "%s%s%s" , INCREMENTAL_DIR , TIMESTAMP_PREFIX , safeTimestampSuffix ()),
224+ StandardResolveOptions . RESOLVE_DIRECTORY ) ;
192225 }
193226
194227 public Set <String > findNonEmptyResourceDirs () throws IOException {
0 commit comments