/* gcc filename -O3 -march=native -Wall -std=gnu99 -lnetpbm -I/usr/include/netpbm */
/* gcc filename -O3 -march=native -Wall -std=gnu99 -lnetpbm -I/usr/include/netpbm -S -fverbose-asm */

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <pbm.h>
#include <emmintrin.h>

/*
 * A bi-level image stored as packed bits.
 *
 * Every row takes (cols + 7) / 8 bytes of `bitmap`; within a byte the most
 * significant bit holds the leftmost pixel (see image_pixel()).
 */
struct image {
	int cols;               /* width in pixels */
	int rows;               /* height in pixels */
	unsigned char *bitmap;  /* rows * rowbytes bytes of packed pixel data */
};
/* Number of bytes occupied by one packed row of the image. */
static int image_rowbytes(struct image *img);
/* Value (0 or 1) of the pixel at column x, row y. */
static int image_pixel(struct image *img, int x, int y);
/* Store pix at column x, row y. */
static void image_putpixel(struct image *img, int x, int y, unsigned char pix);

/* Read a PBM file into a newly allocated image; release it with image_free(). */
static struct image *image_load(char *filename);
/* Write the image to a PBM file. */
static void image_save(char *filename, struct image *img);
/* Release the bitmap and the image structure itself. */
static void image_free(struct image *img);

/* Timestamp in microseconds, as produced by get_timer(). */
typedef int_fast64_t timestamp_t;
static timestamp_t get_timer(void);


/*
 * Count the bits set in `val` (population count).
 *
 * Works by repeatedly adding neighbouring bit fields in parallel: after the
 * first step every 2-bit field holds the count of its two bits, after the
 * second every 4-bit field holds the count of its four bits, and so on until
 * the whole word holds the total.  The masks assume a 32-bit unsigned int.
 *
 * Declared `static inline`: a plain `inline` definition in C99 mode provides
 * no external definition, so any call the compiler chooses not to inline
 * (e.g. when building without optimisation) would fail to link.
 */
static inline unsigned int popc(unsigned int val)
{
	//put count of each k bits into those k bits
	val = (val & 0x55555555) + ((val >>  1) & 0x55555555);
	val = (val & 0x33333333) + ((val >>  2) & 0x33333333);
	val = (val & 0x0f0f0f0f) + ((val >>  4) & 0x0f0f0f0f);
	val = (val & 0x00ff00ff) + ((val >>  8) & 0x00ff00ff);
	val = (val & 0x0000ffff) + ((val >> 16) & 0x0000ffff);
	return val;
}

/*
 * Compute the Hamming distance between the packed bitmaps of two images,
 * i.e. the number of bit positions in which the bitmaps differ.
 *
 * Both images must have the same dimensions (checked with assert()).
 * The bulk of the data is compared one unsigned int at a time; bytes left
 * over when the bitmap size is not a multiple of sizeof(unsigned int) are
 * compared one by one so that no part of the bitmap is skipped.
 *
 * NOTE(review): every bit of every row byte is compared, including any
 * padding bits past `cols` in the last byte of a row -- confirm the loader
 * leaves those bits in a consistent state.
 *
 * Returns the number of differing bits.
 */
int __attribute__((noinline)) exercise(struct image *restrict img1, struct image *restrict img2)
{
	assert(img1->cols == img2->cols);
	assert(img1->rows == img2->rows);

	/* Use size_t: rows * rowbytes may not fit in an int for large images. */
	size_t size = (size_t) img1->rows * (size_t) image_rowbytes(img1);
	unsigned int __attribute__((aligned(16))) *restrict bitmap1 = (unsigned int*) img1->bitmap;
	unsigned int __attribute__((aligned(16))) *restrict bitmap2 = (unsigned int*) img2->bitmap;
	size_t words = size / sizeof(unsigned int);

	unsigned int dist = 0;

	/* Whole words: XOR leaves a 1 exactly where the two bitmaps differ. */
	for (size_t i = 0; i < words; i++) {
		unsigned int delta = bitmap1[i] ^ bitmap2[i];
		dist += popc(delta);
	}

	/* Trailing bytes that do not fill a whole word. */
	for (size_t i = words * sizeof(unsigned int); i < size; i++) {
		unsigned int delta = (unsigned int) (img1->bitmap[i] ^ img2->bitmap[i]);
		dist += popc(delta);
	}

	return dist;
}


/*
 * Benchmark driver.
 *
 * Usage: PROGRAM ITERS SRC1IMAGE.pbm SRC2IMAGE.pbm
 *
 * Loads the two images, calls exercise() on them ITERS times, then prints
 * the elapsed time in seconds on stderr and the last computed distance on
 * stdout.  Returns EXIT_FAILURE on a usage error or an invalid ITERS value.
 */
int main(int argc, char *argv[])
{
	if (argc != 4) {
		fprintf(stderr, "%s ITERS SRC1IMAGE.pbm SRC2IMAGE.pbm\n", argv[0]);
		return EXIT_FAILURE;
	}

	/*
	 * Parse ITERS strictly: atoi() cannot report malformed or out-of-range
	 * input, so use strtol() and reject anything that is not a complete
	 * non-negative decimal number fitting in an int.
	 */
	char *end = NULL;
	errno = 0;
	long parsed = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || errno == ERANGE
	    || parsed < 0 || parsed > INT_MAX) {
		fprintf(stderr, "%s: invalid ITERS value '%s'\n", argv[0], argv[1]);
		return EXIT_FAILURE;
	}
	int iters = (int) parsed;

	struct image *img1 = image_load(argv[2]);
	struct image *img2 = image_load(argv[3]);
	timestamp_t t0 = get_timer();

	/*
	 * Spin for one second (get_timer() counts microseconds) before the
	 * measurement starts -- presumably a warm-up; the reason is not stated.
	 */
	while (get_timer() - t0 < 1000000) ;
	t0 = get_timer();

	int dist = 0;

	/*** Your code goes here. ***/

	for (int i = 0; i < iters; i++) {
		dist = exercise(img1, img2);
	}

	/*** Your code ends here. */

	t0 = get_timer() - t0;
	fprintf(stderr, "time spent: %.3f\n", (double) t0/1e6);
	printf("%d\n", dist);
	image_free(img1);
	image_free(img2);
	return EXIT_SUCCESS;
}


/*
 * Number of bytes needed to hold one row of `img`: eight pixels are packed
 * into each byte and a partially filled last byte still counts as a byte.
 */
static inline int
image_rowbytes(struct image *img)
{
	const int pixels_per_byte = 8;

	/* Round cols up to the next multiple of eight before dividing. */
	return (img->cols + pixels_per_byte - 1) / pixels_per_byte;
}

/*
 * Return the pixel of `img` at column x, row y as 0 or 1.
 * No bounds checking is performed on x or y.
 */
static inline int
image_pixel(struct image *img, int x, int y)
{
	/* Byte that contains the pixel: rows are image_rowbytes() bytes apart. */
	int byte_index = image_rowbytes(img) * y + x / 8;
	/* Bit 7 of a byte holds the leftmost of its eight pixels. */
	int shift = 7 - x % 8;
	unsigned char packed = img->bitmap[byte_index];

	return (packed >> shift) & 1;
}

/*
 * Store `pix` into the pixel of `img` at column x, row y.
 * No bounds checking is performed on x or y.
 *
 * `pix` is shifted into place without masking, so a value other than 0 or 1
 * also sets bits belonging to pixels further left in the same byte.
 */
static inline void
image_putpixel(struct image *img, int x, int y, unsigned char pix)
{
	unsigned char *cell = &img->bitmap[image_rowbytes(img) * y + x / 8];
	/* Bit 7 of a byte holds the leftmost of its eight pixels. */
	int shift = 7 - x % 8;

	*cell &= ~(1 << shift);   /* clear the target bit ... */
	*cell |= (pix << shift);  /* ... then merge in the new value */
}

/*
 * Load the PBM file `filename` into a newly allocated image.
 *
 * The bitmap is allocated with 16-byte alignment and filled row by row with
 * packed pixel data.  On any failure (file cannot be opened, out of memory)
 * a message is printed to stderr and the process exits with EXIT_FAILURE,
 * so the function never returns NULL.
 *
 * The caller owns the result and releases it with image_free().
 */
static struct image *
image_load(char *filename)
{
	FILE *f = fopen(filename, "rb");
	if (!f) { perror("load"); exit(EXIT_FAILURE); }

	struct image img; int fmt;
	pbm_readpbminit(f, &img.cols, &img.rows, &fmt);

	/* Use size_t: rows * rowbytes may not fit in an int for large images. */
	size_t rowbytes = (size_t) image_rowbytes(&img);
	size_t size = (size_t) img.rows * rowbytes;

	/*
	 * Allocate through a real void * instead of casting &img.bitmap, and
	 * check the result explicitly: an assert() disappears under NDEBUG.
	 * posix_memalign() returns the error number rather than setting errno.
	 */
	void *mem = NULL;
	int ret = posix_memalign(&mem, 16, size);
	if (ret != 0) {
		fprintf(stderr, "load: %s\n", strerror(ret));
		exit(EXIT_FAILURE);
	}
	img.bitmap = mem;

	for (int i = 0; i < img.rows; i++) {
		pbm_readpbmrow_packed(f, img.bitmap + (size_t) i * rowbytes,
				img.cols, fmt);
	}

	fclose(f);

	struct image *imga = malloc(sizeof(*imga));
	if (!imga) { perror("load"); exit(EXIT_FAILURE); }
	*imga = img;
	return imga;
}

/*
 * Write `img` to the file `filename` in PBM format, one packed row at a
 * time.
 *
 * If the file cannot be opened, or closing it reports an error (buffered
 * data could not be written out), a message is printed to stderr and the
 * process exits with EXIT_FAILURE.
 */
static void
image_save(char *filename, struct image *img)
{
	FILE *f = fopen(filename, "wb");
	if (!f) { perror("save"); exit(EXIT_FAILURE); }

	pbm_writepbminit(f, img->cols, img->rows, 0);
	for (int i = 0; i < img->rows; i++) {
		pbm_writepbmrow_packed(f, img->bitmap + i * image_rowbytes(img),
				img->cols, 0);
	}

	/* fclose() flushes pending output; a failure here means lost data. */
	if (fclose(f) != 0) { perror("save"); exit(EXIT_FAILURE); }
}

/*
 * Release an image: frees the bitmap and then the structure itself.
 * Passing NULL is a no-op, mirroring free().
 */
static void
image_free(struct image *img)
{
	if (!img) {
		return;
	}

	free(img->bitmap);
	free(img);
}


static timestamp_t
get_timer(void)
{
	struct timeval t;
	gettimeofday(&t, NULL);
	return 1000000*t.tv_sec + t.tv_usec;
}
