diff options
author | Leah Neukirchen <leah@vuxu.org> | 2023-10-15 22:18:06 +0200 |
---|---|---|
committer | Leah Neukirchen <leah@vuxu.org> | 2023-10-15 22:18:06 +0200 |
commit | b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b (patch) | |
tree | 5ea1676965f63d521cb0c2900244f186e0895720 /mico-store.c | |
parent | 07915c710f4b74b9635a7573e7cc6c936809ba48 (diff) | |
download | mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.tar.gz mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.tar.xz mico-b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b.zip |
experiment: deduplicate ts series dedup-ts
This doesn't actually save a lot, as ts files compress very well already:

Before:

  Length   Method      Size  Cmpr    Date     Time   CRC-32   Name
 --------  ------  --------  ---- ---------- ----- --------  ----
   470207  Defl:X      8963   98% 12-29-2021 23:48 081ce894  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 21313623  Defl:X   9266751   57% 12-29-2021 23:48 14f0795c  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470207  Defl:X      8963   98% 12-29-2021 23:48 081ce894  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 20389230  Defl:X  10262765   50% 12-29-2021 23:48 8bfdabdb  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470207  Defl:X      8963   98% 12-29-2021 23:48 081ce894  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  4282896  Defl:X   4064432    5% 12-29-2021 23:48 71c8abbb  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470206  Defl:X      8963   98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 16772109  Defl:X   2245726   87% 12-29-2021 23:48 5cf4f904  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470206  Defl:X      8963   98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
   619883  Defl:X     29615   95% 12-29-2021 23:48 8c427887  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470206  Defl:X      8963   98% 12-29-2021 23:48 80a91662  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  2856925  Defl:X   1660673   42% 12-29-2021 23:48 4ce2fd9b  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470206  Defl:X      8963   98% 12-29-2021 23:48 638ba15a  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  2062648  Defl:X   1046122   49% 12-29-2021 23:48 62744838  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
 --------          --------  ---                              -------
 71588759          28638825   60%                             14 files

After:

  Length   Method      Size  Cmpr    Date     Time   CRC-32   Name
 --------  ------  --------  ---- ---------- ----- --------  ----
   470214  Defl:X      8983   98% 10-15-2023 22:12 524e0d1b  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 21312658  Defl:X   9259690   57% 10-15-2023 22:12 d59af841  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
      102  Defl:X        88   14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 20389237  Defl:X  10287536   50% 10-15-2023 22:13 39a6bd34  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
      102  Defl:X        88   14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  4276047  Defl:X   4057156    5% 10-15-2023 22:13 7eea7072  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470214  Defl:X      8983   98% 10-15-2023 22:13 9ebd9470  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 16772117  Defl:X   2245559   87% 10-15-2023 22:13 761cabb6  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
      101  Defl:X        87   14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
   619891  Defl:X     29628   95% 10-15-2023 22:14 e9aff1a4  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
      101  Defl:X        87   14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  2856932  Defl:X   1652080   42% 10-15-2023 22:14 661347a6  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
   470214  Defl:X      8984   98% 10-15-2023 22:14 6c3c1e96  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  2035471  Defl:X   1070947   47% 10-15-2023 22:14 83849e43  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
 --------          --------  ---                              -------
 69673401          28629896   59%                             14 files
Diffstat (limited to 'mico-store.c')
-rw-r--r-- | mico-store.c | 54 |
1 file changed, 47 insertions, 7 deletions
diff --git a/mico-store.c b/mico-store.c index f92d592..0b21301 100644 --- a/mico-store.c +++ b/mico-store.c @@ -1,3 +1,5 @@ +#include <sys/stat.h> + #include <arpa/inet.h> #include <assert.h> #include <stdint.h> @@ -6,6 +8,7 @@ #include <stdlib.h> #include <zip.h> +#include <zlib.h> // for crc32 #define MICO_HEADER "\211MiC\r\n\032\n" @@ -150,17 +153,50 @@ write_stream(char *name) memcpy(ts.mem + 8, &nevents, sizeof nevents); memcpy(vs.mem + 8, &nevents, sizeof nevents); + uint32_t crc = crc32(0L, Z_NULL, 0); + crc = crc32(crc, (unsigned char *)ts.mem, (long)ts.memlen); + printf("precomputed crc=%d\n", crc); + // time_t mtime = metrics[m].value[0].t / 1000; snprintf(path, sizeof path, "%s/time.dd", prefix); - zip_source_t *buft = zip_source_buffer(zip, - ts.mem, ts.memlen, 0); - int jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8); - printf("index of %s = %d\n", path, jj); - if (jj < 0) { - printf("error adding file: %s\n", zip_strerror(zip)); + int found_ts = 0; + int jj; + for (int64_t kk = 0; kk < zip_get_num_entries(zip, 0); kk += 2) { + struct zip_stat stat; + zip_stat_index(zip, kk, 0, &stat); + if (stat.crc != crc) + continue; + + /* XXX better validation the ts are identical than just crc */ + + zip_uint32_t attr = ((S_IFLNK | 0777) << 16L); + + char target[1024]; + snprintf(target, sizeof target, "../../%s", zip_get_name(zip, kk, ZIP_FL_ENC_RAW)); + fprintf(stderr, "duplicate ts crc=%d, target=%s\n", stat.crc, target); + + struct zip_source *link = zip_source_buffer(zip, strdup(target), strlen(target), 1); + jj = zip_file_add(zip, path, link, ZIP_FL_ENC_UTF_8); + if (jj < 0) { + printf("error adding file: %s\n", zip_strerror(zip)); + } + zip_file_set_external_attributes(zip, jj, 0, ZIP_OPSYS_UNIX, attr); + printf("index of symlink %s -> %s = %d\n", path, target, jj); + found_ts = 1; + break; + } + + if (!found_ts) { + zip_source_t *buft = zip_source_buffer(zip, + ts.mem, ts.memlen, 0); + jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8); + 
printf("index of %s = %d\n", path, jj); + if (jj < 0) { + printf("error adding file: %s\n", zip_strerror(zip)); + } + // zip_file_set_mtime(zip, jj, mtime, 0); } - // zip_file_set_mtime(zip, jj, mtime, 0); snprintf(path, sizeof path, "%s/data.d", prefix); @@ -176,12 +212,16 @@ write_stream(char *name) // note that data must be kept in memory until zip_close // actually writes the archive. + // close and reopen the zip to force write and free all buffers zip_close(zip); zip = zip_open(filename, 0, 0); if (!zip) exit(-1); + struct zip_stat mystat; + zip_stat_index(zip, jj-1, 0, &mystat); // XXX + free(ts.mem); free(vs.mem); } |