2022-11-24 21:04:54 +01:00
|
|
|
#include <errno.h>
|
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
2022-11-26 13:08:04 +01:00
|
|
|
// A single node, only holding information for a single byte value.
struct Node {
    uint8_t byte;               // the byte value this node describes
    size_t occurences;          // occurrence count for `byte` (spelling kept: part of the interface)
    unsigned frequencyPriority; // NOTE(review): presumably a rank derived from the byte's frequency; not used in the visible code — confirm
};
|
|
|
|
|
|
|
|
// A combination of nodes, linked to a parent and to two children.
// NOTE(review): presumably one element of the Huffman tree; nothing in the
// visible code builds or walks this structure yet — confirm intended use.
struct Heap {
    struct Heap *parent;  // enclosing Heap
    struct Heap *child0;  // first child (NOTE(review): presumably the "0" branch — confirm)
    struct Heap *child1;  // second child (NOTE(review): presumably the "1" branch — confirm)
    struct Node **Nodes;  // pointers to the Nodes combined in this Heap
    bool isRoot;          // flag marking this Heap as the root
};
|
|
|
|
|
2022-11-26 01:03:37 +01:00
|
|
|
// Adapted from Stack Overflow:
// https://stackoverflow.com/questions/8236/how-do-you-determine-the-size-of-a-file-in-c
//
// Returns the size in bytes of the file named `filename`, as reported by
// stat(). If stat() fails, a diagnostic including the errno description is
// written to stderr and -1 is returned.
off_t fsize(const char *filename) {
    struct stat st;

    if (stat(filename, &st) != 0) {
        fprintf(stderr, "Cannot determine size of %s: %s\n", filename,
                strerror(errno));
        return -1;
    }

    return st.st_size;
}
|
|
|
|
|
|
|
|
// Print the program description and command-line syntax to stdout.
// Takes no arguments; declared with (void) so that calls passing arguments
// are rejected by the compiler.
void helper(void) {
    // The text contains no conversion specifiers, so it is written verbatim.
    fputs("huffman compression algorithm implementation for educational "
          "purposes.\n\nSyntax:\nhuffman -f fileToCompress\t\tcompress the "
          "given file\nhuffman -xf fileToDecompress\t\tdecompress the given "
          "file\nhuffman -h\t\t\t\tshow this help\nhuffman -v\t\t\t\tverbose\n",
          stdout);
}
|
|
|
|
|
|
|
|
int main(int argc, char *argv[]) {
|
|
|
|
int opt;
|
|
|
|
bool extract_mode = false;
|
|
|
|
bool verbose = false;
|
|
|
|
bool debug = false;
|
|
|
|
char *filestring = NULL;
|
2022-11-26 01:03:37 +01:00
|
|
|
off_t filelen;
|
2022-11-24 21:04:54 +01:00
|
|
|
|
|
|
|
FILE *fptrR = NULL; // file pointer for reading
|
|
|
|
FILE *fptrW = NULL; // file pointer for writing
|
|
|
|
|
|
|
|
while ((opt = getopt(argc, argv, "dvxhf:")) != -1) {
|
2022-11-25 00:14:31 +01:00
|
|
|
if (debug)
|
|
|
|
printf("optarg is: %s\n", optarg);
|
|
|
|
switch (opt) {
|
|
|
|
case 'v':
|
|
|
|
verbose = true;
|
|
|
|
break;
|
|
|
|
case 'd':
|
|
|
|
debug = true;
|
|
|
|
break;
|
|
|
|
case 'f':
|
|
|
|
filestring = optarg;
|
|
|
|
break;
|
|
|
|
case 'h':
|
|
|
|
helper();
|
|
|
|
exit(0);
|
|
|
|
break;
|
|
|
|
case 'x':
|
|
|
|
extract_mode = true;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
fprintf(stderr, "Usage: %s [-dvhx -f] [file]\n", argv[0]);
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
2022-11-24 21:04:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Now optind (declared extern int by <unistd.h>) is the index of the first
|
|
|
|
// non-option argument. If it is >= argc, there were no non-option arguments.
|
|
|
|
|
|
|
|
if (verbose)
|
2022-11-25 00:14:31 +01:00
|
|
|
printf("selected file: %s\n", filestring);
|
2022-11-24 21:04:54 +01:00
|
|
|
|
|
|
|
if (filestring) {
|
2022-11-25 00:14:31 +01:00
|
|
|
if(debug)
|
|
|
|
printf("[DEBUG]processing given file argument.\n");
|
2022-11-26 01:03:37 +01:00
|
|
|
// open the given file in binary mode
|
2022-11-25 00:14:31 +01:00
|
|
|
fptrR = fopen(filestring, "rb");
|
|
|
|
if (fptrR == NULL) {
|
|
|
|
fprintf(stderr, "The given file does not exist or is unavailable.\n");
|
|
|
|
exit(EXIT_FAILURE);
|
|
|
|
}
|
2022-11-26 01:03:37 +01:00
|
|
|
filelen = fsize(filestring);
|
|
|
|
if(verbose)
|
|
|
|
printf("filesize: %ldB\n", filelen);
|
2022-11-24 21:04:54 +01:00
|
|
|
}
|
|
|
|
else {
|
2022-11-25 00:14:31 +01:00
|
|
|
// empty filestring or filestring is NULL
|
|
|
|
fprintf(stderr, "Usage: %s [-dvhx -f] [file]\n", argv[0]);
|
|
|
|
exit(EXIT_FAILURE);
|
2022-11-24 21:04:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (extract_mode) {
|
2022-11-25 00:14:31 +01:00
|
|
|
printf("extracting is not yet implemented.\n");
|
|
|
|
// decompress the file
|
2022-11-24 21:04:54 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
else {
|
2022-11-25 00:14:31 +01:00
|
|
|
// compress the file
|
|
|
|
if (verbose)
|
|
|
|
printf("compressing file...\n");
|
|
|
|
|
|
|
|
// frequency analysis
|
|
|
|
|
|
|
|
|
|
|
|
// dump start of file if debugging
|
|
|
|
// FIXME add conditions if the part to print is smaller than 512B
|
|
|
|
if(debug){
|
|
|
|
printf("[DEBUG]First 512 bytes are:\n");
|
|
|
|
fread(buf, 1, 512, fptrR);
|
|
|
|
for(int i=0;i<512;i++){
|
|
|
|
if(i%16==0)
|
|
|
|
printf("%08x\t", i);
|
|
|
|
printf("%02x ", buf[i]);
|
|
|
|
if(i%16==7)
|
|
|
|
printf(" ");
|
|
|
|
if(i%16==15){
|
|
|
|
printf("\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-11-26 01:03:37 +01:00
|
|
|
uint64_t occurences[256] = { 0 };
|
|
|
|
|
|
|
|
// TODO calculate occurences
|
2022-11-26 01:15:02 +01:00
|
|
|
// FIXME this loads the file into RAM completely. Loading a too big file would eat all memory of the system.
|
|
|
|
// This is a dirty hack of an algorithm.
|
2022-11-26 13:08:04 +01:00
|
|
|
// uint8_t* buf = malloc(filelen);
|
2022-11-26 01:15:02 +01:00
|
|
|
for(int i = 0; i < filelen; i++) {
|
|
|
|
fread(buf+i, 1, 1, fptrR);
|
|
|
|
}
|
|
|
|
|
|
|
|
// now go through all of the stored bytes in the buffer and count the occurences.
|
|
|
|
for(int i = 0; i < filelen; i++) {
|
|
|
|
occurences[*(buf+i)]++; // FIXME this might get the value of the bytes but +1, not sure about the logic!
|
|
|
|
}
|
|
|
|
|
|
|
|
// holy shit i think the dirty hack is working
|
|
|
|
// well, at least for smaller files.
|
|
|
|
// SEGVAULT for the 10G file, 1G works.
|
|
|
|
|
|
|
|
|
2022-11-25 00:14:31 +01:00
|
|
|
if(debug){
|
|
|
|
printf("Occurences (Hex):\n");
|
|
|
|
for(int i=0;i<256;i++){
|
|
|
|
if(i%4==0)
|
|
|
|
printf("\n");
|
|
|
|
printf("0x%02x: %016lx\t", i, occurences[i]);
|
|
|
|
}
|
|
|
|
printf("\n\nfile length(by pointer):\t\t%luB\n", filelen);
|
|
|
|
long long int addedUpOccurences = 0; // FIXME might not be enough storage for larger files!
|
|
|
|
for(int i=0;i<256;i++){
|
|
|
|
addedUpOccurences += occurences[i];
|
|
|
|
}
|
|
|
|
printf("file length(added up occurences):\t%lldB\n", addedUpOccurences);
|
|
|
|
}
|
|
|
|
|
|
|
|
if(verbose)
|
|
|
|
printf("\n\nDone calculating occurences of bytes.\n");
|
|
|
|
|
|
|
|
// TODO
|
|
|
|
// calculate the frequencies of the bytes.
|
|
|
|
double frequencies[256];
|
|
|
|
for(int i=0;i<256;i++){
|
|
|
|
frequencies[i]=((double)occurences[i]/(double)filelen)*100; // calculate frequencies of bytes in percent (example: 05.23 (%))
|
|
|
|
}
|
|
|
|
if(debug){
|
|
|
|
printf("Frequencies:\n");
|
|
|
|
for(int i=0;i<256;i++){
|
|
|
|
if(i%8==0)
|
|
|
|
printf("\n");
|
|
|
|
printf("0x%02x: %05.02f%%\t", i, frequencies[i]);
|
|
|
|
}
|
|
|
|
double addedUpFrequencies = 0;
|
|
|
|
for(int i=0;i<256;i++){
|
|
|
|
addedUpFrequencies += frequencies[i];
|
|
|
|
}
|
|
|
|
printf("\n\nadded up frequencies: %05.02f%%\n",addedUpFrequencies);
|
|
|
|
}
|
2022-11-24 21:04:54 +01:00
|
|
|
}
|
|
|
|
fclose(fptrR);
|
|
|
|
printf("\n");
|
|
|
|
exit(EXIT_SUCCESS);
|
|
|
|
}
|