diff options
author | Koushik Dutta <koushd@gmail.com> | 2012-07-19 16:36:20 -0700 |
---|---|---|
committer | Koushik Dutta <koushd@gmail.com> | 2012-07-20 01:01:04 -0700 |
commit | b803ac960136bdfd08f1906baef47206e0cb9caf (patch) | |
tree | 58fbde8e1a7fde667e3f2c6d479bf06eb359273b /dedupe | |
parent | d4aaeeb4aac405a3d91c850904b70d206be0bd25 (diff) | |
download | bootable_recovery-b803ac960136bdfd08f1906baef47206e0cb9caf.zip bootable_recovery-b803ac960136bdfd08f1906baef47206e0cb9caf.tar.gz bootable_recovery-b803ac960136bdfd08f1906baef47206e0cb9caf.tar.bz2 |
dedupe: fat32 can only contain 64k of filesystem data (roughly 20k directories) per directory. Need to break the hashes into subdirectories. May need to do this further? TODO: forgot to record/restore ctime,mtime,atime
Change-Id: I7edd50d022c60326fda8cfc00517c646fba86385
Diffstat (limited to 'dedupe')
-rw-r--r-- | dedupe/dedupe.c | 127 |
1 files changed, 85 insertions, 42 deletions
diff --git a/dedupe/dedupe.c b/dedupe/dedupe.c index 8878bbd..9935a57 100644 --- a/dedupe/dedupe.c +++ b/dedupe/dedupe.c @@ -109,20 +109,50 @@ static int store_file(struct DEDUPE_STORE_CONTEXT *context, struct stat st, cons sprintf(&psum[(j*2)], "%02x", (int)sumdata[j]); psum[(SHA256_DIGEST_LENGTH * 2)] = '\0'; + // if a hash is abcdefg, + // the output blob name is abc/defg + // this is to get around vfat having a 64k directory size limit (usually around 20k files) char out_blob[PATH_MAX]; char tmp_out_blob[PATH_MAX]; - sprintf(out_blob, "%s/%s", context->blob_dir, psum); + char key[SHA256_DIGEST_LENGTH + SHA256_DIGEST_LENGTH / 3 + 3]; + // int i = 0; + // int keyIndex = 0; + // while (psum[i]) { + // key[keyIndex] = psum[i]; + // i++; + // keyIndex++; + // if (i % 2 == 0 && psum[i]) { + // key[keyIndex] = '/'; + // keyIndex++; + // } + // } + strcpy(key, psum); + key[3] = '/'; + key[4] = NULL; + strcat(key, psum + 3); + sprintf(out_blob, "%s/%s", context->blob_dir, key); sprintf(tmp_out_blob, "%s.tmp", out_blob); + mkdir(dirname(out_blob), S_IRWXU | S_IRWXG | S_IRWXO); // don't copy the file if it exists? not quite sure how I feel about this. + int size = (int)st.st_size; struct stat file_info; - if (stat(out_blob, &file_info) && ((ret = copy_file(f, tmp_out_blob)) || (ret = rename(tmp_out_blob, out_blob)))) { - fprintf(stderr, "Error copying blob %s\n", f); - return ret; + // verify the file exists and is of the same size + int file_ok = stat(out_blob, &file_info) == 0; + if (file_ok) { + int existing_size = file_info.st_size; + if (existing_size != size) + file_ok = 0; + } + if (!file_ok) { + // copy to the tmp file + if ((ret = copy_file(f, tmp_out_blob)) || (ret = rename(tmp_out_blob, out_blob))) { + fprintf(stderr, "Error copying blob %s\n", f); + return ret; + } } - int size = (int)st.st_size; - fprintf(context->output_manifest, "%s\t%d\t\n", psum, size); + fprintf(context->output_manifest, "%s\t%d\t\n", key, size); return 0; } @@ -233,6 +263,40 @@ static int dec_to_oct(int dec) { return ret; } +void recursive_delete_skip_gc(char* dirname) { + DIR *dp = opendir(dirname); + if (dp == NULL) { + fprintf(stderr, "Error opening directory: %s\n", dirname); + return; + } + struct dirent *ep; + while (ep = readdir(dp)) { + if (strcmp(ep->d_name, ".") == 0) + continue; + if (strcmp(ep->d_name, "..") == 0) + continue; + if (strcmp(ep->d_name, ".gc") == 0) + continue; + struct stat cst; + int ret; + char blob[PATH_MAX]; + sprintf(blob, "%s/%s", dirname, ep->d_name); + if ((ret = lstat(blob, &cst))) { + fprintf(stderr, "Error opening: %s\n", ep->d_name); + continue; + } + + if (S_ISDIR(cst.st_mode)) { + recursive_delete_skip_gc(blob); + } + + if (remove(blob)) { + fprintf(stderr, "Error removing: %s\n", ep->d_name); + } + } + closedir(dp); +} + int dedupe_main(int argc, char** argv) { if (argc < 3) { usage(argv); @@ -408,56 +472,37 @@ int dedupe_main(int argc, char** argv) { int ret; // printf("%s\n", filename); if (strcmp(type, "f") == 0) { - char sha256[128]; - token = tokenize(sha256, token, '\t'); + char key[128]; + token = tokenize(key, token, '\t'); char sizeStr[32]; token = tokenize(sizeStr, token, '\t'); int size = atoi(sizeStr); - sprintf(blob, "%s/%s", blob_dir, sha256); + sprintf(blob, "%s/%s", blob_dir, key); char dst[PATH_MAX]; - sprintf(dst, "%s/%s", gc_dir, sha256); + sprintf(dst, "%s/%s", gc_dir, key); struct stat file_info; - if (stat(blob, &file_info) == 0) + if (stat(blob, &file_info) == 0) { + // keys can have a single parent directory. make sure it exists + mkdir(dirname(dst), S_IRWXU | S_IRWXG | S_IRWXO); rename(blob, dst); + } } } fclose(input_manifest); } - DIR *dp = opendir(blob_dir); - if (dp == NULL) { - fprintf(stderr, "Error opening directory: %s\n", blob_dir); - return 1; - } - struct dirent *ep; - while (ep = readdir(dp)) { - if (strcmp(ep->d_name, ".") == 0) - continue; - if (strcmp(ep->d_name, "..") == 0) - continue; - struct stat cst; - int ret; - sprintf(blob, "%s/%s", blob_dir, ep->d_name); - if ((ret = lstat(blob, &cst))) { - fprintf(stderr, "Error opening: %s\n", ep->d_name); - continue; - } - - if (S_ISREG(cst.st_mode)) { - if (remove(blob)) { - fprintf(stderr, "Error removing: %s\n", ep->d_name); - } - } - } - closedir(dp); + // rm -rf + recursive_delete_skip_gc(blob_dir); + // move .gc over char dst[PATH_MAX]; - dp = opendir(gc_dir); + DIR *dp = opendir(gc_dir); if (dp == NULL) { fprintf(stderr, "Error opening directory: %s\n", gc_dir); return 1; } + struct dirent *ep; while (ep = readdir(dp)) { if (strcmp(ep->d_name, ".") == 0) continue; @@ -472,10 +517,8 @@ int dedupe_main(int argc, char** argv) { continue; } - if (S_ISREG(cst.st_mode)) { - if (rename(blob, dst)) { - fprintf(stderr, "Error moving: %s\n", ep->d_name); - } + if (rename(blob, dst)) { + fprintf(stderr, "Error moving: %s\n", ep->d_name); } } closedir(dp); |