diff options
author | Leah Neukirchen <leah@vuxu.org> | 2023-10-15 22:18:06 +0200 |
---|---|---|
committer | Leah Neukirchen <leah@vuxu.org> | 2023-10-15 22:18:06 +0200 |
commit | b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b (patch) | |
tree | 5ea1676965f63d521cb0c2900244f186e0895720 | |
parent | 07915c710f4b74b9635a7573e7cc6c936809ba48 (diff) | |
download | mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.tar.gz mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.tar.xz mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.zip |
experiment: deduplicate ts series (branch: dedup-ts)
This doesn't actually save a lot, as ts files compress very well already:

Before:

```
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
21313623  Defl:X  9266751  57% 12-29-2021 23:48 14f0795c  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
20389230  Defl:X 10262765  50% 12-29-2021 23:48 8bfdabdb  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 4282896  Defl:X  4064432   5% 12-29-2021 23:48 71c8abbb  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
16772109  Defl:X  2245726  87% 12-29-2021 23:48 5cf4f904  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  619883  Defl:X    29615  95% 12-29-2021 23:48 8c427887  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2856925  Defl:X  1660673  42% 12-29-2021 23:48 4ce2fd9b  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 638ba15a  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2062648  Defl:X  1046122  49% 12-29-2021 23:48 62744838  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
--------          -------  ---                            -------
71588759         28638825  60%                            14 files
```

After:

```
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
  470214  Defl:X     8983  98% 10-15-2023 22:12 524e0d1b  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
21312658  Defl:X  9259690  57% 10-15-2023 22:12 d59af841  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     102  Defl:X       88  14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
20389237  Defl:X 10287536  50% 10-15-2023 22:13 39a6bd34  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     102  Defl:X       88  14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 4276047  Defl:X  4057156   5% 10-15-2023 22:13 7eea7072  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470214  Defl:X     8983  98% 10-15-2023 22:13 9ebd9470  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
16772117  Defl:X  2245559  87% 10-15-2023 22:13 761cabb6  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     101  Defl:X       87  14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  619891  Defl:X    29628  95% 10-15-2023 22:14 e9aff1a4  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     101  Defl:X       87  14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2856932  Defl:X  1652080  42% 10-15-2023 22:14 661347a6  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470214  Defl:X     8984  98% 10-15-2023 22:14 6c3c1e96  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2035471  Defl:X  1070947  47% 10-15-2023 22:14 83849e43  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
--------          -------  ---                            -------
69673401         28629896  59%                            14 files
```
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | mico-dump.c | 30 | ||||
-rw-r--r-- | mico-store.c | 54 |
3 files changed, 76 insertions, 10 deletions
diff --git a/Makefile b/Makefile index 6abbfbf..47b6680 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CFLAGS=-g -O2 -Wall -Wno-switch -Wextra -Wwrite-strings -LDLIBS=-lzip +LDLIBS=-lzip -lz ALL=mico-dump mico-sort mico-store metra/metra diff --git a/mico-dump.c b/mico-dump.c index 75219d5..b4635af 100644 --- a/mico-dump.c +++ b/mico-dump.c @@ -1,3 +1,7 @@ +#define _XOPEN_SOURCE 700 + +#include <sys/stat.h> + #include <arpa/inet.h> #include <assert.h> #include <fnmatch.h> @@ -157,11 +161,33 @@ main(int argc, char *argv[]) struct bitreader ts = { 0 }; struct bitreader vs = { 0 }; + int ts_index = i; + + zip_uint8_t opsys; + zip_uint32_t attr; + zip_file_get_external_attributes(zip, i, 0, &opsys, &attr); + if (opsys == ZIP_OPSYS_UNIX && + S_ISLNK(attr >> 16)) { + printf("symlink!\n"); + struct zip_file *zf = zip_fopen_index(zip, i, 0); + char buf[1024]; + ssize_t len = zip_fread(zf, buf, sizeof buf); + buf[len] = 0; + if (strncmp(buf, "../../", 6) == 0) { + ts_index = zip_name_locate(zip, buf + 6, 0); + fprintf(stderr, "%s -> %d\n", buf + 6, ts_index); + if (ts_index < 0) { + fprintf(stderr, "invalid symlink, skipping"); + continue; + } + } + } + /* XXX verify assumptions on zip file order */ - ts.input = zip_fopen_index(zip, i, 0); + ts.input = zip_fopen_index(zip, ts_index, 0); vs.input = zip_fopen_index(zip, i+1, 0); - char *name = strdup(zip_get_name(zip, i, ZIP_FL_ENC_RAW)); + char *name = strdup(zip_get_name(zip, i+1, ZIP_FL_ENC_RAW)); char *s = strchr(name, '/'); *s = '{'; s = strrchr(name, '/'); diff --git a/mico-store.c b/mico-store.c index f92d592..0b21301 100644 --- a/mico-store.c +++ b/mico-store.c @@ -1,3 +1,5 @@ +#include <sys/stat.h> + #include <arpa/inet.h> #include <assert.h> #include <stdint.h> @@ -6,6 +8,7 @@ #include <stdlib.h> #include <zip.h> +#include <zlib.h> // for crc32 #define MICO_HEADER "\211MiC\r\n\032\n" @@ -150,17 +153,50 @@ write_stream(char *name) memcpy(ts.mem + 8, &nevents, sizeof nevents); memcpy(vs.mem + 8, &nevents, sizeof 
nevents); + uint32_t crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (unsigned char *)ts.mem, (long)ts.memlen); + printf("precomputed crc=%d\n", crc); + // time_t mtime = metrics[m].value[0].t / 1000; snprintf(path, sizeof path, "%s/time.dd", prefix); - zip_source_t *buft = zip_source_buffer(zip, - ts.mem, ts.memlen, 0); - int jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8); - printf("index of %s = %d\n", path, jj); - if (jj < 0) { - printf("error adding file: %s\n", zip_strerror(zip)); + int found_ts = 0; + int jj; + for (int64_t kk = 0; kk < zip_get_num_entries(zip, 0); kk += 2) { + struct zip_stat stat; + zip_stat_index(zip, kk, 0, &stat); + if (stat.crc != crc) + continue; + + /* XXX better validation the ts are identical than just crc */ + + zip_uint32_t attr = ((S_IFLNK | 0777) << 16L); + + char target[1024]; + snprintf(target, sizeof target, "../../%s", zip_get_name(zip, kk, ZIP_FL_ENC_RAW)); + fprintf(stderr, "duplicate ts crc=%d, target=%s\n", stat.crc, target); + + struct zip_source *link = zip_source_buffer(zip, strdup(target), strlen(target), 1); + jj = zip_file_add(zip, path, link, ZIP_FL_ENC_UTF_8); + if (jj < 0) { + printf("error adding file: %s\n", zip_strerror(zip)); + } + zip_file_set_external_attributes(zip, jj, 0, ZIP_OPSYS_UNIX, attr); + printf("index of symlink %s -> %s = %d\n", path, target, jj); + found_ts = 1; + break; + } + + if (!found_ts) { + zip_source_t *buft = zip_source_buffer(zip, + ts.mem, ts.memlen, 0); + jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8); + printf("index of %s = %d\n", path, jj); + if (jj < 0) { + printf("error adding file: %s\n", zip_strerror(zip)); + } + // zip_file_set_mtime(zip, jj, mtime, 0); } - // zip_file_set_mtime(zip, jj, mtime, 0); snprintf(path, sizeof path, "%s/data.d", prefix); @@ -176,12 +212,16 @@ write_stream(char *name) // note that data must be kept in memory until zip_close // actually writes the archive. 
+ // close and reopen the zip to force write and free all buffers zip_close(zip); zip = zip_open(filename, 0, 0); if (!zip) exit(-1); + struct zip_stat mystat; + zip_stat_index(zip, jj-1, 0, &mystat); // XXX + free(ts.mem); free(vs.mem); } |