/*
 * c-basic/huffman/huffman.c
 * (223 lines, 6.8 KiB, C; last change 2022-11-24 21:04:54 +01:00)
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#define BUFSIZE 512
// A single node: holds the bookkeeping for exactly one byte value.
struct Node {
uint8_t byte; // the byte value this entry stands for; initNodes() sets it to the entry's index
size_t occurences; // how often that byte was seen; incremented per input byte by main()
unsigned frequencyPriority; // set to 0 by initNodes(); not used anywhere else in the visible code
};
// a combination of nodes.
// NOTE(review): presumably one node of the Huffman tree that is to be built
// from the byte frequencies -- in the visible code only isRoot is ever
// written (by initNodes()); confirm the intended use.
struct Heap {
struct Heap *parent, *child0, *child1; // links to other Heap entries; never assigned in the visible code
struct Node **Nodes; // pointer to Node pointers; never assigned in the visible code
bool isRoot; // set to false for every entry by initNodes()
};
// global vars for Heaps and Nodes.
// Both tables are allocated by initNodes() with 256 entries each.
// nodes is indexed by byte value: main() counts with nodes[buf[i]].occurences++.
struct Node* nodes;
// initNodes() only sets isRoot on these entries; no other use is visible here.
struct Heap* heaps;
// Allocate and initialize the global `nodes` and `heaps` tables.
//
// Both tables get 256 entries, one per possible byte value. Every Node starts
// with its own byte value, zero occurences and zero frequencyPriority; every
// Heap starts zeroed (NULL links) with isRoot == false.
//
// Tables from an earlier call are released first, so calling this again
// resets the state instead of leaking. If an allocation fails, a message is
// written to stderr and the process exits with EXIT_FAILURE.
//
// NOTE(review): a Huffman tree over 256 leaves can have up to 511 nodes. If
// `heaps` is meant to hold every tree node, 256 entries will be too few --
// confirm once the tree construction is written.
void initNodes(void)
{
    // free(NULL) is a no-op, so this is safe on the first call as well.
    free(nodes);
    free(heaps);

    // calloc zero-fills, so no Heap pointer is ever left indeterminate.
    nodes = calloc(256, sizeof *nodes);
    heaps = calloc(256, sizeof *heaps);
    if (nodes == NULL || heaps == NULL) {
        fprintf(stderr, "Out of memory while allocating nodes.\n");
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < 256; i++) {
        nodes[i].byte = (uint8_t)i;
        nodes[i].occurences = 0;
        nodes[i].frequencyPriority = 0;
        heaps[i].isRoot = false;
    }
}
// Return the size in bytes of the file at `filename`.
//
// On failure, a diagnostic naming the file and the stat() error is written
// to stderr and -1 is returned.
//
// Approach taken from
// https://stackoverflow.com/questions/8236/how-do-you-determine-the-size-of-a-file-in-c
off_t fsize(const char *filename)
{
    struct stat st;

    if (stat(filename, &st) != 0) {
        fprintf(stderr, "Cannot determine size of %s: %s\n", filename,
                strerror(errno));
        return -1;
    }
    return st.st_size;
}
// Print the usage/help text to stdout.
void helper(void)
{
    // The text contains no conversions, so fputs is enough.
    fputs("huffman compression algorithm implementation for educational "
          "purposes.\n\nSyntax:\nhuffman -f fileToCompress\t\tcompress the "
          "given file\nhuffman -xf fileToDecompress\t\tdecompress the given "
          "file\nhuffman -h\t\t\t\tshow this help\nhuffman -v\t\t\t\tverbose\n",
          stdout);
}
// let the magic happen
2022-11-24 21:04:54 +01:00
int main(int argc, char *argv[]) {
int opt;
bool extract_mode = false;
bool verbose = false;
bool debug = false;
char *filestring = NULL;
off_t filelen;
2022-11-24 21:04:54 +01:00
FILE *fptrR = NULL; // file pointer for reading
FILE *fptrW = NULL; // file pointer for writing
while ((opt = getopt(argc, argv, "dvxhf:")) != -1) {
2022-11-25 00:14:31 +01:00
if (debug)
printf("optarg is: %s\n", optarg);
switch (opt) {
case 'v':
verbose = true;
break;
case 'd':
debug = true;
break;
case 'f':
filestring = optarg;
break;
case 'h':
helper();
exit(0);
break;
case 'x':
extract_mode = true;
break;
default:
fprintf(stderr, "Usage: %s [-dvhx -f] [file]\n", argv[0]);
exit(EXIT_FAILURE);
}
2022-11-24 21:04:54 +01:00
}
// Now optind (declared extern int by <unistd.h>) is the index of the first
// non-option argument. If it is >= argc, there were no non-option arguments.
if (verbose)
2022-11-25 00:14:31 +01:00
printf("selected file: %s\n", filestring);
2022-11-24 21:04:54 +01:00
if (filestring) {
2022-11-25 00:14:31 +01:00
if(debug)
printf("[DEBUG]processing given file argument.\n");
// open the given file in binary mode
2022-11-25 00:14:31 +01:00
fptrR = fopen(filestring, "rb");
if (fptrR == NULL) {
fprintf(stderr, "The given file does not exist or is unavailable.\n");
exit(EXIT_FAILURE);
}
filelen = fsize(filestring);
if(verbose)
printf("filesize: %ldB\n", filelen);
2022-11-24 21:04:54 +01:00
}
else {
2022-11-25 00:14:31 +01:00
// empty filestring or filestring is NULL
fprintf(stderr, "Usage: %s [-dvhx -f] [file]\n", argv[0]);
exit(EXIT_FAILURE);
2022-11-24 21:04:54 +01:00
}
if (extract_mode) {
2022-11-25 00:14:31 +01:00
printf("extracting is not yet implemented.\n");
// decompress the file
2022-11-24 21:04:54 +01:00
}
else {
2022-11-25 00:14:31 +01:00
// compress the file
if (verbose)
printf("compressing file...\n");
// frequency analysis
uint8_t buf [BUFSIZE];
2022-11-25 00:14:31 +01:00
// dump start of file if debugging
// FIXME add conditions if the part to print is smaller than 512B
if(debug){
printf("[DEBUG]First 512 bytes are:\n");
fread(buf, 1, BUFSIZE, fptrR);
for(int i=0;i<BUFSIZE;i++){
2022-11-25 00:14:31 +01:00
if(i%16==0)
printf("%08x\t", i);
printf("%02x ", buf[i]);
if(i%16==7)
printf(" ");
if(i%16==15){
printf("\n");
}
}
}
initNodes();
size_t ret;
while(!feof(fptrR)){
ret = fread(buf, 1, BUFSIZE, fptrR);
if(ret == BUFSIZE){
// fread success, continue as normal
// calculate occurences.
for(int i = 0; i < BUFSIZE; i++){
nodes[buf[i]].occurences++;
}
2022-11-25 00:14:31 +01:00
}
else if((ret < BUFSIZE) && (ret >= 0)){
if(!feof(fptrR)) {
// no EOF, but didn't read full buffer. Assuming an error
printf("Error while reading file.");
exit(EXIT_FAILURE);
}
else{
2022-11-26 16:09:13 +01:00
// reached EOF, but we might still have more Bytes in Buffer, as not every File has n*BUFSIZE Bytes in it.
/*
* I don't really get why i need to use ret+BUFSIZE, sounds to me like it produces
* a buffer overflow, but somehow it gets those last bytes i couldn't get before and brings
* Occurences to 100% in all cases compared against true filelength.
* TODO understand why this works!
*/
for(int i = 0; i < ret+BUFSIZE; i++){
nodes[buf[i]].occurences++;
}
}
2022-11-25 00:14:31 +01:00
}
else{
printf("Undefined behaviour while reading file.\n");
exit(EXIT_FAILURE);
2022-11-25 00:14:31 +01:00
}
2022-11-26 16:09:13 +01:00
// FIXME, the last 512 bit block is always skipped.
// fread seems to advance the File pointer by itself, so the following line would skip half of the file.
// fseek(fptrR, BUFSIZE, SEEK_CUR);
2022-11-25 00:14:31 +01:00
}
if(debug){ // show occurences
printf("Occurences (hex):\n");
size_t addedUpOccurences = 0;
for(int i = 0; i < 256; i++){
if(i%5==0)
printf("\n");
printf("0x%02x: 0x%016llx ", i, nodes[i].occurences);
addedUpOccurences += nodes[i].occurences;
}
2022-11-26 16:09:13 +01:00
printf("\nFilelength: %lldB\nAdded up occurences: %lld\nDifference: %lld\nOccurences in Filelength: %f%%\n",
filelen, addedUpOccurences,
filelen - addedUpOccurences,
(float)(100*(addedUpOccurences/(float)filelen)));
}
2022-11-24 21:04:54 +01:00
}
2022-11-24 21:04:54 +01:00
fclose(fptrR);
printf("\n");
if(debug){ // wait for input to end.
printf("Press Enter to finish.\n");
getchar();
}
2022-11-24 21:04:54 +01:00
exit(EXIT_SUCCESS);
}