Adding additional checks during upload and download of chunk #39
Base branch: `airbnb-main`
**New file: `ChunkValidationUtils.java`**

```java
package com.slack.astra.chunk;

import java.nio.file.Path;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.NoLockFactory;

public class ChunkValidationUtils {

  /** Runs Lucene's CheckIndex over the index at the given path and reports whether it is clean. */
  public static boolean isChunkClean(Path path) throws Exception {
    // Open the directory without locking; close both the checker and the directory when done.
    try (FSDirectory existingDir = FSDirectory.open(path, NoLockFactory.INSTANCE);
        CheckIndex checker = new CheckIndex(existingDir)) {
      CheckIndex.Status status = checker.checkIndex();
      return status.clean;
    }
  }
}
```
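For orientation, a minimal usage sketch of this helper, mirroring how the PR calls it from the download path further below (the class and method here are hypothetical illustrations, assumed to live in the same package):

```java
import java.io.IOException;
import java.nio.file.Path;

class ChunkValidationExample {
  // Hypothetical caller: validate a downloaded chunk before serving it.
  static void validateDownloadedChunk(Path dataDirectory) throws Exception {
    // isChunkClean runs Lucene CheckIndex over the directory and returns status.clean
    if (!ChunkValidationUtils.isChunkClean(dataDirectory)) {
      throw new IOException("Lucene index failed CheckIndex for " + dataDirectory);
    }
  }
}
```

Note that `CheckIndex` walks every segment of the index, so this validation adds IO cost roughly proportional to the chunk's size.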
**Modified file** (likely `ReadOnlyChunkImpl.java` in `com.slack.astra.chunk`; the filename is an inference from the surrounding methods):

```diff
@@ -1,5 +1,6 @@
 package com.slack.astra.chunk;

+import static com.slack.astra.chunk.ChunkValidationUtils.isChunkClean;
 import static com.slack.astra.chunkManager.CachingChunkManager.ASTRA_NG_DYNAMIC_CHUNK_SIZES_FLAG;
 import static com.slack.astra.server.AstraConfig.DEFAULT_ZK_TIMEOUT_SECS;
```
```diff
@@ -28,9 +29,11 @@
 import com.slack.astra.proto.metadata.Metadata;
 import io.micrometer.core.instrument.MeterRegistry;
 import io.micrometer.core.instrument.Timer;
+import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
 import java.time.Instant;
 import java.util.EnumSet;
 import java.util.List;
```
```diff
@@ -228,6 +231,62 @@ public CacheNodeAssignment getCacheNodeAssignment() {
     return assignment;
   }

+  private boolean validateS3vsLocalDownLoad() {
+    // check if the number of files in S3 matches the local directory
+    Map<String, Long> filesWithSizeInS3 = blobStore.listFilesWithSize(snapshotMetadata.snapshotId);
+
+    Map<String, Long> localFilesSizeMap;
+    List<String> mismatchFiles = new java.util.ArrayList<>();
+    try (Stream<Path> fileList = Files.list(dataDirectory)) {
+      localFilesSizeMap =
+          fileList
+              .filter(Files::isRegularFile)
+              .collect(
+                  Collectors.toMap(
+                      path ->
+                          dataDirectory.relativize(path).toString().replace(File.separator, "/"),
+                      path -> path.toFile().length()));
+    } catch (IOException e) {
+      throw new RuntimeException(
+          String.format("Error reading local files in directory %s", dataDirectory), e);
+    }
+    if (localFilesSizeMap.size() != filesWithSizeInS3.size()) {
+      LOG.error(
+          "Mismatch in number of files in S3 ({}) and local directory ({}) for snapshot {}",
+          filesWithSizeInS3.size(),
+          localFilesSizeMap.size(),
+          snapshotMetadata.toString());
+      return false;
+    }
+
+    for (Map.Entry<String, Long> entry : filesWithSizeInS3.entrySet()) {
+      String s3Path = entry.getKey();
+      long s3Size = entry.getValue();
+      String fileName = Paths.get(s3Path).getFileName().toString();
+
+      if (!localFilesSizeMap.containsKey(fileName)
+          || !localFilesSizeMap.get(fileName).equals(s3Size)) {
+        mismatchFiles.add(fileName);
+      }
+    }
+    if (!mismatchFiles.isEmpty()) {
+      String mismatchFilesAndSize =
+          mismatchFiles.stream()
+              .map(
+                  e ->
+                      String.format(
+                          "%s (S3Size: %s, LocalSize: %s)",
+                          e, filesWithSizeInS3.get(e), localFilesSizeMap.get(e)))
+              .collect(Collectors.joining(", "));
+      LOG.error(
+          "Mismatch in file sizes between S3 and local directory for snapshot {}. Mismatch files: {}",
+          snapshotMetadata.toString(),
+          mismatchFilesAndSize);
+      return false;
+    }
+    return true;
+  }
+
   public void downloadChunkData() {
     Timer.Sample assignmentTimer = Timer.start(meterRegistry);
     // lock
```

> **Review comment** on the `.replace(File.separator, "/")` line: Hm. I think the replace isn't necessary since …
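Presumably the reviewer's point is that `Files.list` is non-recursive, so relativizing a direct child of `dataDirectory` always yields a bare file name that contains no separator; on POSIX systems the replace is additionally a no-op because `File.separator` is already `"/"`. A minimal illustration (the class and paths are hypothetical):

```java
import java.io.File;
import java.nio.file.Path;

class SeparatorDemo {
  public static void main(String[] args) {
    // Files.list returns direct children only, so the relativized path is a
    // bare file name with no separator on any OS.
    Path dataDirectory = Path.of("/var/astra/chunk"); // hypothetical path
    Path child = dataDirectory.resolve("_0.cfs");
    String name = dataDirectory.relativize(child).toString();
    // On POSIX File.separator is "/" anyway, so the replace changes nothing.
    System.out.println(name.replace(File.separator, "/")); // prints "_0.cfs"
  }
}
```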
```diff
@@ -265,7 +324,24 @@ public void downloadChunkData() {
               "No files found on blob storage, released slot for re-assignment");
         }
       }
+      // validate if the number of files in S3 matches the local directory
+      if (!validateS3vsLocalDownLoad()) {
+        String errorString =
+            String.format(
+                "Mismatch in number or size of files in S3 and local directory for snapshot %s",
+                snapshotMetadata);
+        throw new RuntimeException(errorString);
+      }
+
+      // check if lucene index is valid and not corrupted
+      boolean luceneStatus = isChunkClean(dataDirectory);
+      if (!luceneStatus) {
+        throw new IOException(
+            String.format(
+                "Lucene index is not clean. Found issues for snapshot: %s.", snapshotMetadata));
+      }
+
       // check if schema file exists
       Path schemaPath = Path.of(dataDirectory.toString(), ReadWriteChunk.SCHEMA_FILE_NAME);
       if (!Files.exists(schemaPath)) {
        throw new RuntimeException("We expect a schema.json file to exist within the index");
```
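Both new checks throw out of `downloadChunkData`, so a short or corrupted download now fails the assignment instead of silently serving a bad chunk; the existing catch block (next hunk) then moves the assignment to `EVICT` and records the failure timer.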
```diff
@@ -305,7 +381,7 @@ public void downloadChunkData() {
       // disregarding any errors
       setAssignmentState(
           getCacheNodeAssignment(), Metadata.CacheNodeAssignment.CacheNodeAssignmentState.EVICT);
-      LOG.error("Error handling chunk assignment", e);
+      LOG.error("Error handling chunk assignment for snapshot: {}", snapshotMetadata, e);
       assignmentTimer.stop(chunkAssignmentTimerFailure);
     } finally {
       chunkAssignmentLock.unlock();
```
```diff
@@ -538,6 +614,7 @@ private void cleanDirectory() {
     if (dataDirectory != null) {
       try {
         FileUtils.cleanDirectory(dataDirectory.toFile());
+        FileUtils.deleteDirectory(dataDirectory.toFile());
       } catch (Exception e) {
         LOG.error("Error removing files {}", dataDirectory.toString(), e);
       }
```
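A side note on this hunk: Commons IO's `FileUtils.deleteDirectory` deletes the directory recursively, contents included, so once it succeeds the preceding `cleanDirectory` call is redundant, though harmless.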
> **Review comment:** you could extract a helper method that would be used by both listFiles... methods that takes a prefix and a Consumer. Then this could look like the following. Also, the block passed to subscribe could be called in multiple threads, so this should use a storage class that is safe wrt concurrent modifications.
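The code the reviewer attached ("the following") was not captured above. A minimal sketch of the shape being described, assuming the blob store lists objects through the AWS SDK v2 `S3AsyncClient` paginator; the class name `BlobStoreListingSketch` and all member names here are assumptions for illustration, not code from this PR:

```java
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.function.Consumer;
import software.amazon.awssdk.services.s3.S3AsyncClient;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
import software.amazon.awssdk.services.s3.model.S3Object;

public class BlobStoreListingSketch {
  private final S3AsyncClient s3AsyncClient;
  private final String bucketName;

  public BlobStoreListingSketch(S3AsyncClient s3AsyncClient, String bucketName) {
    this.s3AsyncClient = s3AsyncClient;
    this.bucketName = bucketName;
  }

  // Shared helper: invokes the consumer once per S3 object under the prefix.
  private void forEachObject(String prefix, Consumer<S3Object> consumer) {
    ListObjectsV2Request request =
        ListObjectsV2Request.builder().bucket(bucketName).prefix(prefix).build();
    // subscribe returns a CompletableFuture that completes once all pages are consumed
    s3AsyncClient
        .listObjectsV2Paginator(request)
        .subscribe(response -> response.contents().forEach(consumer))
        .join();
  }

  public List<String> listFiles(String prefix) {
    // The subscribe callback may run on SDK worker threads, so collect into a
    // concurrency-safe structure instead of a plain ArrayList.
    Queue<String> keys = new ConcurrentLinkedQueue<>();
    forEachObject(prefix, obj -> keys.add(obj.key()));
    return new ArrayList<>(keys);
  }

  public Map<String, Long> listFilesWithSize(String prefix) {
    Map<String, Long> sizes = new ConcurrentHashMap<>();
    forEachObject(prefix, obj -> sizes.put(obj.key(), obj.size()));
    return sizes;
  }
}
```

`SdkPublisher.subscribe` may invoke its callback on SDK worker threads, which is the reviewer's second point and why the collectors above are `ConcurrentLinkedQueue` and `ConcurrentHashMap` rather than `ArrayList`/`HashMap`.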