aboutsummaryrefslogtreecommitdiffstats
path: root/dedupe
diff options
context:
space:
mode:
authorPawit Pornkitprasan <p.pawit@gmail.com>2012-07-24 11:54:41 +0700
committerChirayu Desai <chirayudesai1@gmail.com>2012-08-28 10:53:00 +0530
commit9a3dd4ac084d84149142036c86ce05260890c6a4 (patch)
tree1b12040e8d502a6d23217f513e5a1f6626a5adc2 /dedupe
parenta2967bf0177bab6e3948bf37bcd0a5a7bc7aac30 (diff)
downloadbootable_recovery-9a3dd4ac084d84149142036c86ce05260890c6a4.zip
bootable_recovery-9a3dd4ac084d84149142036c86ce05260890c6a4.tar.gz
bootable_recovery-9a3dd4ac084d84149142036c86ce05260890c6a4.tar.bz2
recovery: make dedupe gc interruptable
The current gc approach is moving valid files into another directory, which will corrupt the backup if the gc is ever interrupted. Rewritten so that the list is compared in memory and only unused files are removed, leaving the used files in place. Change-Id: Ib10c40405eaaaa8141d1a6307bc2506789725439
Diffstat (limited to 'dedupe')
-rw-r--r--dedupe/dedupe.c130
1 files changed, 79 insertions, 51 deletions
diff --git a/dedupe/dedupe.c b/dedupe/dedupe.c
index 347e61b..dea544c 100644
--- a/dedupe/dedupe.c
+++ b/dedupe/dedupe.c
@@ -3,10 +3,12 @@
#include <openssl/md5.h>
#include <openssl/sha.h>
#include <openssl/ripemd.h>
+#include <assert.h>
#include <errno.h>
#include <dirent.h>
#include <limits.h>
#include <stdlib.h>
+#include <string.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/wait.h>
@@ -20,6 +22,7 @@
#include <sys/wait.h>
#define DEDUPE_VERSION 2
+#define ARRAY_CAPACITY 1000
static int copy_file(const char *src, const char *dst) {
char buf[4096];
@@ -260,20 +263,61 @@ static int dec_to_oct(int dec) {
return ret;
}
-void recursive_delete_skip_gc(char* d) {
+struct array {
+ void** data;
+ int size;
+ int capacity;
+};
+
+static void array_init(struct array* arr, int capacity) {
+ arr->data = malloc(sizeof(void*) * capacity);
+ assert(arr->data != NULL);
+ arr->size = 0;
+ arr->capacity = capacity;
+}
+
+static void array_free(struct array* arr, int free_members) {
+ if (free_members) {
+ int i;
+ for (i = 0; i < arr->size; i++) {
+ free(arr->data[i]);
+ }
+ }
+
+ if (arr->data != NULL) {
+ free(arr->data);
+ arr->data = NULL;
+ }
+ arr->size = 0;
+ arr->capacity = 0;
+}
+
+static void array_add(struct array* arr, void* val) {
+ if (arr->size == arr->capacity) {
+ // Expand array
+ arr->capacity *= 2;
+ arr->data = realloc(arr->data, sizeof(void*) * arr->capacity);
+ assert(arr->data != NULL);
+ }
+ arr->data[arr->size++] = val;
+}
+
+static int string_compare(const void* a, const void* b) {
+ return strcmp(*(char**) a, *(char **) b);
+}
+
+static void recursive_list_dir(char* d, struct array *arr) {
DIR *dp = opendir(d);
if (dp == NULL) {
fprintf(stderr, "Error opening directory: %s\n", d);
return;
}
struct dirent *ep;
- while (ep = readdir(dp)) {
+ while ((ep = readdir(dp))) {
if (strcmp(ep->d_name, ".") == 0)
continue;
if (strcmp(ep->d_name, "..") == 0)
continue;
- if (strcmp(ep->d_name, ".gc") == 0)
- continue;
struct stat cst;
int ret;
char blob[PATH_MAX];
@@ -284,12 +328,11 @@ void recursive_delete_skip_gc(char* d) {
}
if (S_ISDIR(cst.st_mode)) {
- recursive_delete_skip_gc(blob);
+ recursive_list_dir(blob, arr);
+ continue;
}
- if (remove(blob)) {
- fprintf(stderr, "Error removing: %s\n", ep->d_name);
- }
+ array_add(arr, strdup(blob));
}
closedir(dp);
}
@@ -468,13 +511,10 @@ int dedupe_main(int argc, char** argv) {
return 1;
}
- char gc_dir[PATH_MAX];
- sprintf(gc_dir, "%s/%s", blob_dir, ".gc");
- mkdir(gc_dir, S_IRWXU | S_IRWXG | S_IRWXO);
- if (check_file(gc_dir)) {
- fprintf(stderr, "Unable to open gc dir: %s\n", gc_dir);
- return 1;
- }
+ struct array used_files;
+ struct array all_files;
+ array_init(&used_files, ARRAY_CAPACITY);
+ array_init(&all_files, ARRAY_CAPACITY);
char blob[PATH_MAX];
int i;
@@ -483,7 +523,8 @@ int dedupe_main(int argc, char** argv) {
FILE *input_manifest = fopen(argv[i], "rb");
if (input_manifest == NULL) {
fprintf(stderr, "Unable to open input manifest %s\n", argv[i]);
- return 1;
+ failure = 1;
+ goto out;
}
char line[PATH_MAX];
@@ -531,52 +572,39 @@ int dedupe_main(int argc, char** argv) {
char sizeStr[32];
token = tokenize(sizeStr, token, '\t');
int size = atoi(sizeStr);
-
+
sprintf(blob, "%s/%s", blob_dir, key);
- char dst[PATH_MAX];
- sprintf(dst, "%s/%s", gc_dir, key);
- struct stat file_info;
- if (stat(blob, &file_info) == 0) {
- // keys can have a single parent directory. make sure it exists
- mkdir(dirname(dst), S_IRWXU | S_IRWXG | S_IRWXO);
- rename(blob, dst);
- }
+ array_add(&used_files, strdup(blob));
}
}
fclose(input_manifest);
}
- // rm -rf
- if (!failure)
- recursive_delete_skip_gc(blob_dir);
+ recursive_list_dir(blob_dir, &all_files);
- // move .gc over
- char dst[PATH_MAX];
- DIR *dp = opendir(gc_dir);
- if (dp == NULL) {
- fprintf(stderr, "Error opening directory: %s\n", gc_dir);
- return 1;
- }
- struct dirent *ep;
- while (ep = readdir(dp)) {
- if (strcmp(ep->d_name, ".") == 0)
- continue;
- if (strcmp(ep->d_name, "..") == 0)
- continue;
- struct stat cst;
- int ret;
- sprintf(blob, "%s/%s", gc_dir, ep->d_name);
- sprintf(dst, "%s/%s", blob_dir, ep->d_name);
- if ((ret = lstat(blob, &cst))) {
- fprintf(stderr, "Error opening: %s\n", ep->d_name);
- continue;
+ qsort(used_files.data, used_files.size, sizeof(void*), string_compare);
+ qsort(all_files.data, all_files.size, sizeof(void*), string_compare);
+
+ // Search for unused files
+ int j = 0;
+ for (i = 0; i < all_files.size; i++) {
+ int cmp;
+ while (j < used_files.size &&
+ (cmp = strcmp(used_files.data[j], all_files.data[i])) < 0) {
+ j++;
}
- if (rename(blob, dst)) {
- fprintf(stderr, "Error moving: %s\n", ep->d_name);
+ if (cmp > 0 || j >= used_files.size) {
+ if (remove(all_files.data[i])) {
+ fprintf(stderr, "Error removing: %s\n", all_files.data[i]);
+ }
+ printf("Delete: %s\n", all_files.data[i]);
}
}
- closedir(dp);
+
+ out:
+ array_free(&used_files, 1);
+ array_free(&all_files, 1);
return failure;
}