/* gcc filename -Wall -std=gnu99 -lnetpbm -I/usr/include/netpbm */
/* gcc filename -Wall -std=gnu99 -lnetpbm -S -fverbose-asm */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <pbm.h>

/* A 1-bit-per-pixel image held in packed form. */
struct image {
	/* Width in pixels. */
	int cols;
	/* Height in pixels. */
	int rows;
	/* rows consecutive packed rows of (cols + 7) / 8 bytes each;
	 * within a byte the most significant bit is the leftmost pixel. */
	unsigned char *bitmap;
};
/* Number of bytes in one packed row of img (8 pixels per byte, rounded up). */
static int image_rowbytes(struct image *img);
/* Value (0 or 1) of the pixel at column x, row y. */
static int image_pixel(struct image *img, int x, int y);
/* Store pix into the pixel at column x, row y. */
static void image_putpixel(struct image *img, int x, int y, unsigned char pix);

/* Read a PBM file into a newly allocated image; exits if the file cannot
 * be opened. */
static struct image *image_load(char *filename);
/* Write img to filename as a PBM file; exits if the file cannot be opened. */
static void image_save(char *filename, struct image *img);
/* Release the pixel data of img and the image structure itself. */
static void image_free(struct image *img);

/* Point in time (or time difference) in microseconds, see get_timer(). */
typedef int_fast64_t timestamp_t;
/* Current gettimeofday() time expressed in microseconds. */
static timestamp_t get_timer(void);


/* Side of one square tile in pixels. */
#define BLKSIZE 32 // must be a multiple of 8 to fit whole row bytes
/* Unit in which rotated pixels are gathered and stored. */
#define TYPE uint8_t
/* Width of TYPE in bits, i.e. how many source rows one store covers. */
#define TS (sizeof(TYPE)*8)

/* Rotate one BLKSIZE x BLKSIZE tile clockwise.
 *
 * The tile is read from bitmap1, where it starts at byte column x1 and
 * row y1, and written to bitmap2, where it starts at byte column x2 and
 * row y2.  Both bitmaps are packed (most significant bit = leftmost pixel)
 * and share the row stride rowbytes.
 *
 * Source pixel column i of the tile becomes destination row i; source
 * rows are gathered TS at a time, the topmost row of each group ending up
 * in the least significant (rightmost) bit, and the groups are laid out
 * right to left across the destination row.
 *
 * NOTE(review): the result is stored through a TYPE pointer; with TYPE
 * wider than one byte that store would depend on the host's alignment and
 * byte order -- confirm before changing TYPE. */
void rotblock(int rowbytes,
              unsigned char *restrict bitmap1, int x1, int y1,
	      unsigned char *restrict bitmap2, int x2, int y2)
{
	for (unsigned int col = 0; col < BLKSIZE; col++) {
		/* Locate the source pixel column once: the byte within a
		 * row, and the shift that brings its bit down to bit 0. */
		const unsigned int srcbit = x1*8 + col;
		const unsigned int srcbyte = srcbit / 8;
		const unsigned int shift = 7 - srcbit % 8;

		for (unsigned int band = 0; band < BLKSIZE; band += TS) {
			TYPE acc = 0;
			unsigned int w = 0;

			/* Walk down TS source rows, collecting one bit each. */
			while (w < TS) {
				unsigned int srcrow = y1 + band + w;
				TYPE pix = (bitmap1[rowbytes * srcrow + srcbyte] >> shift) & 1;
				acc |= pix << w;
				w++;
			}

			/* Lower source rows map further left in the output row. */
			TYPE *out = (TYPE *)&bitmap2[rowbytes * (y2 + col) + x2 + (BLKSIZE - band - TS) / 8];
			*out = acc;
		}
	}
}

/* Rotate the whole image clockwise by 90 degrees, replacing img->bitmap.
 *
 * The image is processed in BLKSIZE x BLKSIZE tiles: the tile at source
 * byte column x and row y is written at destination row x*8 and byte
 * column (rows - BLKSIZE - y) / 8.
 *
 * Source and destination share one row stride and img->cols / img->rows
 * are not swapped, and tiles are never clipped, so only square images
 * whose side is a multiple of BLKSIZE can be handled; any other geometry
 * would make rotblock() index outside both buffers and is rejected here.
 *
 * On success the old bitmap is freed and img owns the new one.  On an
 * unsupported geometry or allocation failure a message is printed to
 * stderr and the process exits with EXIT_FAILURE. */
void __attribute__((noinline)) exercise(struct image *img)
{
	if (img->rows < 0 || img->rows != img->cols || img->rows % BLKSIZE != 0) {
		fprintf(stderr, "exercise: unsupported image size %dx%d "
				"(must be square, side a multiple of %d)\n",
			img->cols, img->rows, BLKSIZE);
		exit(EXIT_FAILURE);
	}

	int rowbytes = image_rowbytes(img);
	/* Compute the size in size_t so large images do not overflow int. */
	size_t size = (size_t) img->rows * rowbytes;
	unsigned char *bitmap2 = malloc(size);
	/* malloc(0) may legitimately return NULL for an empty image. */
	if (!bitmap2 && size > 0) {
		perror("exercise");
		exit(EXIT_FAILURE);
	}

	for (int y = 0; y < img->rows; y += BLKSIZE)
		for (int x = 0; x < rowbytes; x += BLKSIZE / 8) {
			/* x counts bytes, so x*8 is the tile's pixel column,
			 * which becomes its row after the rotation. */
			int y2 = x * 8;
			/* Tiles from the top end up on the right-hand side. */
			int x2 = (img->rows - BLKSIZE - y) / 8;
			rotblock(rowbytes, img->bitmap, x, y, bitmap2, x2, y2);
		}

	free(img->bitmap); img->bitmap = bitmap2;
}



/* Usage: PROGRAM SRCIMAGE.pbm DSTIMAGE.pbm
 *
 * Loads the source image, runs exercise() on it while measuring the
 * elapsed wall-clock time, reports that time on stderr and writes the
 * result to the destination file. */
int main(int argc, char *argv[])
{
	if (argc != 3) {
		fprintf(stderr, "%s SRCIMAGE.pbm DSTIMAGE.pbm\n", argv[0]);
		return EXIT_FAILURE;
	}

	char *src_path = argv[1];
	char *dst_path = argv[2];
	struct image *img = image_load(src_path);

	/* Busy-wait for one second (get_timer() counts microseconds) before
	 * measuring -- presumably to let the machine settle into a steady
	 * state; the code itself does not say why. */
	timestamp_t spin_start = get_timer();
	for (;;) {
		if (get_timer() - spin_start >= 1000000)
			break;
	}

	timestamp_t started = get_timer();

	/*** Your code goes here. ***/

	exercise(img);

	/*** Your code ends here. */

	timestamp_t elapsed = get_timer() - started;
	fprintf(stderr, "time spent: %.3f\n", (double) elapsed / 1e6);

	image_save(dst_path, img);
	image_free(img);
	return EXIT_SUCCESS;
}


/* Number of bytes in one packed row of img: eight pixels per byte, with
 * a partially filled last byte counted in full. */
static inline int
image_rowbytes(struct image *img)
{
	const int padded = img->cols + 7;

	return padded / 8;
}

/* Return the pixel at column x, row y of img as 0 or 1.
 * No bounds checking is done on x and y. */
static inline int
image_pixel(struct image *img, int x, int y)
{
	const int stride = image_rowbytes(img);
	const unsigned char packed = img->bitmap[stride * y + x / 8];
	/* Bit 7 of each byte holds the leftmost of its eight pixels. */
	const int shift = 7 - x % 8;

	return (packed >> shift) & 1;
}

/* Set the pixel at column x, row y of img to pix.
 * pix is shifted into place without masking, so it is expected to be
 * 0 or 1; higher bits would spill into the pixels to the left.
 * No bounds checking is done on x and y. */
static inline void
image_putpixel(struct image *img, int x, int y, unsigned char pix)
{
	unsigned char *cell = &img->bitmap[image_rowbytes(img) * y + x / 8];
	/* Bit 7 of each byte holds the leftmost of its eight pixels. */
	const int shift = 7 - x % 8;

	/* Clear the target bit, then merge in the new value. */
	*cell = (*cell & ~(1 << shift)) | (pix << shift);
}

/* Load the PBM file filename into a newly allocated image.
 *
 * The pixel data is stored as img.rows packed rows of
 * image_rowbytes() bytes each.  The caller owns the result and releases
 * it with image_free().
 *
 * If the file cannot be opened or memory cannot be allocated, a message
 * is printed with perror("load") and the process exits with EXIT_FAILURE.
 * NOTE(review): malformed input is left to libnetpbm, which presumably
 * reports it and terminates on its own -- confirm against its manual. */
static struct image *
image_load(char *filename)
{
	FILE *f = fopen(filename, "rb");
	if (!f) { perror("load"); exit(EXIT_FAILURE); }

	struct image img; int fmt;
	pbm_readpbminit(f, &img.cols, &img.rows, &fmt);

	int rowbytes = image_rowbytes(&img);
	/* Compute the size in size_t so large images do not overflow int. */
	size_t size = (size_t) img.rows * rowbytes;
	img.bitmap = malloc(size);
	/* malloc(0) may legitimately return NULL for an empty image. */
	if (!img.bitmap && size > 0) { perror("load"); exit(EXIT_FAILURE); }

	for (int i = 0; i < img.rows; i++)
		pbm_readpbmrow_packed(f, img.bitmap + (size_t) i * rowbytes,
				img.cols, fmt);

	fclose(f);

	struct image *imga = malloc(sizeof(*imga));
	if (!imga) { perror("load"); exit(EXIT_FAILURE); }
	*imga = img;
	return imga;
}

/* Write img to the file filename in PBM format, one packed row at a time.
 *
 * If the file cannot be opened, or closing it fails (for instance because
 * buffered data could not be flushed to disk), a message is printed with
 * perror("save") and the process exits with EXIT_FAILURE, so a truncated
 * output file is never reported as success.
 * NOTE(review): the trailing 0 passed to the libnetpbm writers is
 * presumably its "forceplain" flag (0 = raw PBM) -- confirm against the
 * libnetpbm manual. */
static void
image_save(char *filename, struct image *img)
{
	FILE *f = fopen(filename, "wb");
	if (!f) { perror("save"); exit(EXIT_FAILURE); }

	int rowbytes = image_rowbytes(img);

	pbm_writepbminit(f, img->cols, img->rows, 0);
	for (int i = 0; i < img->rows; i++)
		pbm_writepbmrow_packed(f, img->bitmap + i * rowbytes,
				img->cols, 0);

	/* fclose() flushes the stream; a failure here means lost data. */
	if (fclose(f) != 0) { perror("save"); exit(EXIT_FAILURE); }
}

/* Release the pixel data of img and the image structure itself.
 * Like free(), passing NULL is allowed and does nothing. */
static void
image_free(struct image *img)
{
	if (!img)
		return;

	free(img->bitmap);
	free(img);
}


static timestamp_t
get_timer(void)
{
	struct timeval t;
	gettimeofday(&t, NULL);
	return 1000000*t.tv_sec + t.tv_usec;
}
