summary refs log tree commit diff
diff options
context:
space:
mode:
author Leah Neukirchen <leah@vuxu.org> 2023-10-15 22:18:06 +0200
committer Leah Neukirchen <leah@vuxu.org> 2023-10-15 22:18:06 +0200
commit b37efe0e408ddb7f16f7ac459b79e1b931ae3b4b (patch)
tree 5ea1676965f63d521cb0c2900244f186e0895720
parent 07915c710f4b74b9635a7573e7cc6c936809ba48 (diff)
download mico-dedup-ts.tar.gz
mico-dedup-ts.tar.xz
mico-dedup-ts.zip
experiment: deduplicate ts series (branch: dedup-ts)
This doesn't actually save much, as the ts (time.dd) files already compress very well:

Before:
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
21313623  Defl:X  9266751  57% 12-29-2021 23:48 14f0795c  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
20389230  Defl:X 10262765  50% 12-29-2021 23:48 8bfdabdb  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470207  Defl:X     8963  98% 12-29-2021 23:48 081ce894  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 4282896  Defl:X  4064432   5% 12-29-2021 23:48 71c8abbb  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
16772109  Defl:X  2245726  87% 12-29-2021 23:48 5cf4f904  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  619883  Defl:X    29615  95% 12-29-2021 23:48 8c427887  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 80a91662  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2856925  Defl:X  1660673  42% 12-29-2021 23:48 4ce2fd9b  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470206  Defl:X     8963  98% 12-29-2021 23:48 638ba15a  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2062648  Defl:X  1046122  49% 12-29-2021 23:48 62744838  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
--------          -------  ---                            -------
71588759         28638825  60%                            14 files

After:
 Length   Method    Size  Cmpr    Date    Time   CRC-32   Name
--------  ------  ------- ---- ---------- ----- --------  ----
  470214  Defl:X     8983  98% 10-15-2023 22:12 524e0d1b  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
21312658  Defl:X  9259690  57% 10-15-2023 22:12 d59af841  cardio_bsecdata_eco2/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     102  Defl:X       88  14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
20389237  Defl:X 10287536  50% 10-15-2023 22:13 39a6bd34  cardio_bsecdata_gas_resistance/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     102  Defl:X       88  14% 10-15-2023 22:13 66e45fec  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 4276047  Defl:X  4057156   5% 10-15-2023 22:13 7eea7072  cardio_bsecdata_humidity/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470214  Defl:X     8983  98% 10-15-2023 22:13 9ebd9470  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
16772117  Defl:X  2245559  87% 10-15-2023 22:13 761cabb6  cardio_bsecdata_iaq/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     101  Defl:X       87  14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
  619891  Defl:X    29628  95% 10-15-2023 22:14 e9aff1a4  cardio_bsecdata_iaq_accuracy/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
     101  Defl:X       87  14% 10-15-2023 22:14 1c862bfc  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2856932  Defl:X  1652080  42% 10-15-2023 22:14 661347a6  cardio_bsecdata_pressure/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
  470214  Defl:X     8984  98% 10-15-2023 22:14 6c3c1e96  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/time.dd
 2035471  Defl:X  1070947  47% 10-15-2023 22:14 83849e43  cardio_bsecdata_temperature/hostname="hecate",instance="hecate.home.vuxu.org:9882",job="bme680"/data.d
--------          -------  ---                            -------
69673401         28629896  59%                            14 files
-rw-r--r-- Makefile 2
-rw-r--r-- mico-dump.c 30
-rw-r--r-- mico-store.c 54
3 files changed, 76 insertions, 10 deletions
diff --git a/Makefile b/Makefile
index 6abbfbf..47b6680 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 CFLAGS=-g -O2 -Wall -Wno-switch -Wextra -Wwrite-strings
-LDLIBS=-lzip
+LDLIBS=-lzip -lz
 
 ALL=mico-dump mico-sort mico-store metra/metra
 
diff --git a/mico-dump.c b/mico-dump.c
index 75219d5..b4635af 100644
--- a/mico-dump.c
+++ b/mico-dump.c
@@ -1,3 +1,7 @@
+#define _XOPEN_SOURCE 700
+
+#include <sys/stat.h>
+
 #include <arpa/inet.h>
 #include <assert.h>
 #include <fnmatch.h>
@@ -157,11 +161,33 @@ main(int argc, char *argv[])
 		struct bitreader ts = { 0 };
 		struct bitreader vs = { 0 };
 
+		int ts_index = i;
+
+		zip_uint8_t opsys;
+		zip_uint32_t attr;
+		zip_file_get_external_attributes(zip, i, 0, &opsys, &attr);
+		if (opsys == ZIP_OPSYS_UNIX &&
+		    S_ISLNK(attr >> 16)) {
+			printf("symlink!\n");
+			struct zip_file *zf = zip_fopen_index(zip, i, 0);
+			char buf[1024];
+			ssize_t len = zip_fread(zf, buf, sizeof buf);
+			buf[len] = 0;
+			if (strncmp(buf, "../../", 6) == 0) {
+				ts_index = zip_name_locate(zip, buf + 6, 0);
+				fprintf(stderr, "%s -> %d\n", buf + 6, ts_index);
+				if (ts_index < 0) {
+					fprintf(stderr, "invalid symlink, skipping");
+					continue;
+				}
+			}
+		}
+
 		/* XXX verify assumptions on zip file order */
-		ts.input = zip_fopen_index(zip, i, 0);
+		ts.input = zip_fopen_index(zip, ts_index, 0);
 		vs.input = zip_fopen_index(zip, i+1, 0);
 
-		char *name = strdup(zip_get_name(zip, i, ZIP_FL_ENC_RAW));
+		char *name = strdup(zip_get_name(zip, i+1, ZIP_FL_ENC_RAW));
 		char *s = strchr(name, '/');
 		*s = '{';
 		s = strrchr(name, '/');
diff --git a/mico-store.c b/mico-store.c
index f92d592..0b21301 100644
--- a/mico-store.c
+++ b/mico-store.c
@@ -1,3 +1,5 @@
+#include <sys/stat.h>
+
 #include <arpa/inet.h>
 #include <assert.h>
 #include <stdint.h>
@@ -6,6 +8,7 @@
 #include <stdlib.h>
 
 #include <zip.h>
+#include <zlib.h>    // for crc32
 
 #define MICO_HEADER "\211MiC\r\n\032\n"
 
@@ -150,17 +153,50 @@ write_stream(char *name)
 	memcpy(ts.mem + 8, &nevents, sizeof nevents);
 	memcpy(vs.mem + 8, &nevents, sizeof nevents);
 
+	uint32_t crc = crc32(0L, Z_NULL, 0);
+	crc = crc32(crc, (unsigned char *)ts.mem, (long)ts.memlen);
+	printf("precomputed crc=%d\n", crc);
+
 	// time_t mtime = metrics[m].value[0].t / 1000;
 	snprintf(path, sizeof path, "%s/time.dd", prefix);
 
-	zip_source_t *buft = zip_source_buffer(zip,
-	    ts.mem, ts.memlen, 0);
-	int jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8);
-	printf("index of %s = %d\n", path, jj);
-	if (jj < 0) {
-		printf("error adding file: %s\n", zip_strerror(zip));
+	int found_ts = 0;
+	int jj;
+	for (int64_t kk = 0; kk < zip_get_num_entries(zip, 0); kk += 2) {
+		struct zip_stat stat;
+		zip_stat_index(zip, kk, 0, &stat);
+		if (stat.crc != crc)
+			continue;
+
+		/* XXX better validation the ts are identical than just crc */
+
+		zip_uint32_t attr = ((S_IFLNK | 0777) << 16L);
+
+		char target[1024];
+		snprintf(target, sizeof target, "../../%s", zip_get_name(zip, kk, ZIP_FL_ENC_RAW));
+		fprintf(stderr, "duplicate ts crc=%d, target=%s\n", stat.crc, target);
+
+		struct zip_source *link = zip_source_buffer(zip, strdup(target), strlen(target), 1);
+		jj = zip_file_add(zip, path, link, ZIP_FL_ENC_UTF_8);
+		if (jj < 0) {
+			printf("error adding file: %s\n", zip_strerror(zip));
+		}
+		zip_file_set_external_attributes(zip, jj, 0, ZIP_OPSYS_UNIX, attr);
+		printf("index of symlink %s -> %s = %d\n", path, target, jj);
+		found_ts = 1;
+		break;
+	}
+
+	if (!found_ts) {
+		zip_source_t *buft = zip_source_buffer(zip,
+		    ts.mem, ts.memlen, 0);
+		jj= zip_file_add(zip, path, buft, ZIP_FL_ENC_UTF_8);
+		printf("index of %s = %d\n", path, jj);
+		if (jj < 0) {
+			printf("error adding file: %s\n", zip_strerror(zip));
+		}
+		// zip_file_set_mtime(zip, jj, mtime, 0);
 	}
-	// zip_file_set_mtime(zip, jj, mtime, 0);
 
 	snprintf(path, sizeof path, "%s/data.d", prefix);
 
@@ -176,12 +212,16 @@ write_stream(char *name)
 	// note that data must be kept in memory until zip_close
 	// actually writes the archive.
 
+
 	// close and reopen the zip to force write and free all buffers
 	zip_close(zip);
 	zip = zip_open(filename, 0, 0);
 	if (!zip)
 		exit(-1);
 
+	struct zip_stat mystat;
+	zip_stat_index(zip, jj-1, 0, &mystat);   // XXX
+
 	free(ts.mem);
 	free(vs.mem);
 }