#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include<opencv2/core/cuda_stream_accessor.hpp>
#include <pthread.h>
#include <iostream>
#include <stdexcept>
#include <iostream>
#include "opencv2/imgproc.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/calib3d.hpp"
#include "opencv2/video.hpp"
#include<cstdio>
//#include "opencv2/cudalegacy.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudawarping.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/cudafilters.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudabgsegm.hpp"
//#include <chrono>
//#include "chrono_io.h"


// Number of pthreads main() creates; Track() only does work for thread id 0.
#define NUM_THREADS    1
// Service/port string handed to getaddrinfo() for the outgoing datagram socket.
#define SERVERPORT "3030"
using namespace cv;
using namespace std;
// tp1 / tp2 are not referenced anywhere in the visible code.
struct timeval tp2;
struct timeval tp1;
#include <atomic>

// go1 and go2 are not used in the visible code (only a commented-out go1.store remains).
std::atomic<bool> go1;
// Frame-ready flag: Track() stores true after retrieving a frame,
// main() stores false right before it uploads and processes that frame.
std::atomic<bool> go0;
std::atomic <bool> go2;
// Incremented once per capture-loop iteration in Track(); printed by main() at exit.
// NOTE(review): plain int shared between two threads without synchronisation.
int nff=0;
// mask / mask3 receive the two inRange_gpuMat2() results; P and mask2 are unused in the visible code.
cv::cuda::GpuMat mask, P,mask2,mask3;
// Device copy of the most recent camera frame (input of cuda_bayer()).
cv::cuda::GpuMat  src;
// Output of cuda_bayer() (CV_8UC3), input of the colour conversion in main().
cv::cuda::GpuMat dst;
// Host frame written by Track() via cap.retrieve() and uploaded by main().
// NOTE(review): accessed from both threads; go0 is the only coordination between them.
Mat imgOriginalff;
// gpu_hsv: result of cv::cuda::cvtColor in main(); gpucount1/2: countNonZero results for mask / mask3.
cv::cuda::GpuMat gpu_hsv,gpucount1,gpucount2;
// Intermediate reduction buffers for the first mask (Vx, Vy, V1, V2, V1Sum, V2Sum are used in main();
// VxSum, VySum, X, Y are not referenced in the visible code).
cv::cuda::GpuMat Vx,Vy,V1,V2,V1Sum,V2Sum,VxSum,VySum,X,Y;
// Same set of buffers for the second mask (mask3).
cv::cuda::GpuMat Vx3,Vy3,V13,V23,V1Sum3,V2Sum3,VxSum3,VySum3,X3,Y3;
// Host copies of V1Sum / V2Sum (first mask).
Mat XM,YM;
// Host copies of V1Sum3 / V2Sum3 (second mask).
Mat XM3,YM3;
// Host copies of gpucount1 / gpucount2.
Mat C1,C2;
// Camera handle, opened during static initialisation with the XIAPI backend id.
VideoCapture cap(CV_CAP_XIAPI);
// Frame height and width as reported by cap.get(4) / cap.get(3) in main().
int H,W;
// Index vectors filled in main(): Ix holds 0..W-1 (1 x W), Iy holds 0..H-1 (H x 1), both CV_32S.
cv::cuda::GpuMat Ix; cv::cuda::GpuMat Iy;
// Identical index vectors used for the second mask's computations.
cv::cuda::GpuMat Ix3;cv::cuda::GpuMat Iy3; 



// Inclusive per-channel bounds that main() passes to inRange_gpuMat2() to build `mask`.
// They are compared, in H/S/V order, against channels 0/1/2 of gpu_hsv.
int iLowH = 0 ;
int iHighH = 30;
int iLowS = 130;
int iHighS = 255;
int iLowV = 42;
int iHighV = 255;

// Inclusive per-channel bounds used the same way to build `mask3`.
int iLowH3 = 56;
int iHighH3 = 205;
int iLowS3 = 0;
int iHighS3 = 255;
int iLowV3 = 0;
int iHighV3 = 255;


/*-------RG ccd  BGR (3-channel) output ----------------------------*/

// Reflect an index that is at most a couple of samples outside [0, n) back
// into range without repeating the edge sample (-1 -> 1, n -> n-2).
// Reflecting by an even/odd-preserving amount keeps the Bayer colour phase,
// so a mirrored neighbour has the same colour as the one that was asked for.
// Requires n >= 2, which the kernel's bounds guard guarantees.
static __device__ __forceinline__ int bayerReflect(int i, int n)
{
    if (i < 0)
        return -i;
    if (i >= n)
        return 2 * n - 2 - i;
    return i;
}

// Read one raw sensor sample as int, mirroring coordinates at the image border.
static __device__ __forceinline__ int bayerAt(const cv::cuda::PtrStepSz<uchar>& in, int y, int x)
{
    return in(bayerReflect(y, in.rows), bayerReflect(x, in.cols));
}

// Keep a divisor strictly positive. Integer division by zero is undefined in
// device code, and a black green/blue sample would otherwise trigger it.
static __device__ __forceinline__ int bayerDenom(int v)
{
    return v > 0 ? v : 1;
}

// Build the output pixel: blue is always 0, red/green saturate to 255 as soon
// as the computed value exceeds 200. uchar3 members are x = B, y = G, z = R.
static __device__ __forceinline__ uchar3 bayerPack(int rr, int gg)
{
    const uchar r = (rr > 200) ? 255 : static_cast<uchar>(rr);
    const uchar g = (gg > 200) ? 255 : static_cast<uchar>(gg);
    return make_uchar3(0, g, r);
}

/*
 * Demosaic-and-classify kernel for an RGGB mosaic.
 *
 * Launch layout: 2D grid, one thread per 2x2 Bayer cell
 *     R G
 *     G B
 * so the grid must cover (cols/2) x (rows/2) threads; surplus threads and a
 * trailing odd row/column are skipped by the bounds guard. No shared memory.
 *
 * in  : single-channel 8-bit raw mosaic.
 * out : 3-channel 8-bit image of the same size; every pixel of the cell is
 *       written with B = 0 and R/G equal to the interpolated intensity scaled
 *       by integer colour-ratio gains computed once per cell. The gains are
 *       products of truncated integer ratios, so they are 0 unless every
 *       ratio test in the product is satisfied.
 *
 * Each of the four pixels is packed from its own rr/gg values.
 */
__global__ void bayerRG(cv::cuda::PtrStepSz<uchar> in, cv::cuda::PtrStepSz<uchar3>  out)
{
    // Top-left (R) coordinate of this thread's 2x2 cell.
    const int x = 2 * (blockIdx.x * blockDim.x + threadIdx.x);
    const int y = 2 * (blockIdx.y * blockDim.y + threadIdx.y);

    // The grid is rounded up to whole blocks; drop threads without a full cell.
    if (x + 1 >= in.cols || y + 1 >= in.rows || x + 1 >= out.cols || y + 1 >= out.rows)
        return;

    // Per-cell reference intensities. Dark red samples are floored to 1 so RR
    // can safely be used as a divisor below.
    int RR = in(y, x);
    if (RR < 10)
        RR = 1;
    const int BB = in(y + 1, x + 1);
    const int GG = (in(y, x + 1) + in(y + 1, x)) / 2;

    // Red gain: non-zero only when R is at least 4x G and 5x B.
    const int redGain = (RR / bayerDenom(4 * GG)) * (RR / bayerDenom(5 * BB)) * 5;

    // Green gain: only evaluated for cells with GG > 15 (so 3*GG is non-zero).
    // (2*RR)/(3*GG) is the exact integer form of int(RR / (1.5*GG)) and keeps
    // double-precision arithmetic out of the kernel.
    int greenGain = 0;
    if (GG > 15)
        greenGain = (GG / bayerDenom(2 * BB)) * ((3 * GG) / RR)
                  * (RR / bayerDenom(3 * BB)) * ((2 * RR) / (3 * GG));

    // Raw samples of the cell and of its border-mirrored neighbourhood.
    const int r00 = in(y, x);
    const int g01 = in(y, x + 1);
    const int g10 = in(y + 1, x);
    const int rRight = bayerAt(in, y, x + 2);
    const int rDown  = bayerAt(in, y + 2, x);
    const int rDiag  = bayerAt(in, y + 2, x + 2);

    // Pixel on the R site: own red, green from the four direct neighbours.
    {
        const int gAvg = (bayerAt(in, y, x - 1) + g01 + (bayerAt(in, y - 1, x) + g10)) / 4;
        out(y, x) = bayerPack(r00 * redGain, gAvg * greenGain * 5);
    }

    // Pixel on the G site in the red row: red from left/right, own green.
    {
        const int rAvg = (r00 + rRight) / 2;
        out(y, x + 1) = bayerPack(rAvg * redGain, g01 * greenGain * 10);
    }

    // Pixel on the G site in the blue row: red from above/below, own green.
    {
        const int rAvg = (r00 + rDown) / 2;
        out(y + 1, x) = bayerPack(rAvg * redGain, g10 * greenGain * 10);
    }

    // Pixel on the B site: red from the four diagonals, green from the four
    // direct neighbours.
    {
        const int rAvg = (r00 + rRight + rDown + rDiag) / 4;
        const int gAvg = (g10 + bayerAt(in, y + 1, x + 2) + g01 + bayerAt(in, y + 2, x + 1)) / 4;
        out(y + 1, x + 1) = bayerPack(rAvg * redGain, gAvg * greenGain * 10);
    }
}


/*
 * Host-side launcher for the bayerRG kernel.
 *
 * _input  : device matrix holding the raw mosaic; must be CV_8UC1.
 * _output : (re)allocated as CV_8UC3 with the same size as the input.
 * _stream : stream the kernel is enqueued on; the call does not wait for it.
 *
 * One thread is launched per 2x2 cell in 16x16 blocks. Inputs with fewer than
 * two rows or columns contain no complete cell, so nothing is launched for
 * them (a grid dimension of zero is an invalid launch configuration).
 * Throws cv::Exception if the input type is wrong or the launch is rejected.
 */
void cuda_bayer(cv::InputArray _input, cv::OutputArray _output,cv::cuda::Stream _stream)
{
        const cv::cuda::GpuMat input = _input.getGpuMat();
        CV_Assert(input.empty() || input.type() == CV_8UC1);

        _output.create(input.size(), CV_8UC3);
        cv::cuda::GpuMat output = _output.getGpuMat();

        // Number of complete 2x2 cells in each direction.
        const int cellCols = input.cols / 2;
        const int cellRows = input.rows / 2;
        if (cellCols <= 0 || cellRows <= 0)
                return;

        const int blockSide = 16;
        const dim3 cthreads(blockSide, blockSide);
        // Integer ceil-division so partially filled blocks at the edges are covered.
        const dim3 cblocks((cellCols + blockSide - 1) / blockSide,
                           (cellRows + blockSide - 1) / blockSide);

        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        bayerRG<<<cblocks, cthreads, 0, stream>>>(input, output);

        // Launch-configuration errors are only reported through the runtime's
        // last-error state, never by the launch statement itself.
        const cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess)
                CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(launchStatus));
}


/*
 * Per-pixel three-channel range test.
 *
 * Expects a 2D launch with one thread per pixel; threads that fall outside
 * the image do nothing. For every pixel of `input` the kernel writes 1 to
 * `output` when each channel c lies in the inclusive interval [lbc<c>, ubc<c>],
 * and 0 otherwise.
 */
__global__ void inRange_kernel2(cv::cuda::PtrStepSz<uchar3> input, cv::cuda::PtrStepSz<uchar> output, int lbc0, int ubc0, int lbc1, int ubc1, int lbc2, int ubc2)
{
        const int col = blockIdx.x * blockDim.x + threadIdx.x;
        const int row = blockIdx.y * blockDim.y + threadIdx.y;

        // Early-out for threads outside the image.
        if (col < 0 || row < 0 || col > input.cols - 1 || row > input.rows - 1)
                return;

        const uchar3 px = input(row, col);

        const bool ch0Ok = (px.x >= lbc0) && (px.x <= ubc0);
        const bool ch1Ok = (px.y >= lbc1) && (px.y <= ubc1);
        const bool ch2Ok = (px.z >= lbc2) && (px.z <= ubc2);

        // 1 (not 255) marks a hit.
        output(row, col) = (ch0Ok && ch1Ok && ch2Ok) ? 1 : 0;
}

/*
 * Host-side launcher for inRange_kernel2.
 *
 * _input  : device matrix to threshold; must be CV_8UC3.
 * _output : (re)allocated as CV_8UC1 with the input's size; receives 1 where
 *           all three channels are inside their bounds and 0 elsewhere.
 * _stream : stream the kernel is enqueued on; the call does not wait for it.
 * lowerb / upperb : inclusive per-channel bounds; only elements 0..2 are used
 *           and each is truncated to int.
 *
 * An empty input launches nothing (a grid dimension of zero is an invalid
 * launch configuration). Throws cv::Exception if the input type is wrong or
 * the launch is rejected.
 */
void inRange_gpuMat2(cv::InputArray _input,
        cv::OutputArray _output,
        cv::cuda::Stream _stream,
        cv::Scalar lowerb,
        cv::Scalar upperb)
{
        const cv::cuda::GpuMat input = _input.getGpuMat();
        CV_Assert(input.empty() || input.type() == CV_8UC3);

        _output.create(input.size(), CV_8UC1);
        cv::cuda::GpuMat output = _output.getGpuMat();

        if (input.cols <= 0 || input.rows <= 0)
                return;

        const int blockSide = 16;
        const dim3 cthreads(blockSide, blockSide);
        // Integer ceil-division: one thread per pixel, edge blocks partially used.
        const dim3 cblocks((input.cols + blockSide - 1) / blockSide,
                           (input.rows + blockSide - 1) / blockSide);

        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        inRange_kernel2<<<cblocks, cthreads, 0, stream>>>(
                input, output,
                static_cast<int>(lowerb[0]), static_cast<int>(upperb[0]),
                static_cast<int>(lowerb[1]), static_cast<int>(upperb[1]),
                static_cast<int>(lowerb[2]), static_cast<int>(upperb[2]));

        // Launch-configuration errors are only visible through the runtime's
        // last-error state.
        const cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess)
                CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(launchStatus));
}


/*
 * Report a failed system call and terminate the process.
 *
 * msg : prefix passed to perror(), which appends the text for the current errno.
 *
 * Does not return. The process exits with EXIT_FAILURE so that callers such
 * as shells and scripts can tell the run did not succeed.
 */
void error(const char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
}




// Not referenced anywhere in the visible code.
bool go=false;

/*
 * pthread entry point for the capture thread.
 *
 * threadid : thread index smuggled through the void* argument. Only index 0
 *            does any work; every other index returns immediately.
 *
 * Thread 0 loops forever: it grabs and retrieves a frame from the global
 * `cap` into the global `imgOriginalff`, bumps the frame counter `nff` and
 * raises `go0` to tell main() that a new frame is available. A frame is only
 * announced when both grab() and retrieve() succeeded, so main() never
 * processes a stale or empty image after a capture error.
 *
 * Returns NULL (reached only for thread indices other than 0).
 *
 * NOTE(review): `imgOriginalff` and `nff` are shared with main() and `go0` is
 * the only coordination; a frame may be overwritten while main() is still
 * uploading the previous one.
 */
void *Track(void *threadid)
 {
        const long tid = (long)threadid;

        if (tid == 0)
        {
                while (1)
                {
                        if (!cap.grab() || !cap.retrieve(imgOriginalff))
                        {
                                // Back off briefly instead of spinning on a failing device.
                                usleep(1000);
                                continue;
                        }

                        nff = nff + 1;
                        go0.store(true);
                }
        }

        return NULL;
}



/*
 * Program entry point.
 *
 * Usage: <program> hostname frame_limit
 *   hostname    : destination host for the UDP datagrams (port SERVERPORT).
 *   frame_limit : number of frames to process before stopping (parsed with atoi).
 *
 * Steps:
 *   1. validate arguments and the camera, then configure the camera;
 *   2. upload the column/row index vectors used for the weighted sums;
 *   3. open the datagram socket;
 *   4. start the capture thread(s) (Track);
 *   5. for every frame announced through go0: upload, run cuda_bayer, convert
 *      to HSV, build the two 0/1 masks, and for each mask compute
 *      sum(index * count) / total_count along both axes;
 *   6. send the four resulting ints {x1, y1, x3, y3} as one datagram.
 *
 * When a mask contains no pixels its two coordinates are sent as -1 instead
 * of dividing by zero.
 *
 * Returns 0 on success, 1 for usage/camera/lookup/thread failures, 2 when no
 * socket could be created.
 */
int main(int argc, char *argv[])
{
    // Validate the command line before touching the camera or starting threads.
    if (argc != 3) {
        fprintf(stderr, "usage: %s hostname frame_limit\n", argv[0]);
        exit(1);
    }

    if (!cap.isOpened()) {
        fprintf(stderr, "camera: failed to open capture device\n");
        return 1;
    }

    // Camera setup
    float gain = 0.0;
    cap.set(CV_CAP_PROP_XI_SENSOR_FEATURE_VALUE, 1);
    cap.set(CV_CAP_PROP_XI_DOWNSAMPLING_TYPE, 1);
    cap.set(CV_CAP_PROP_XI_DOWNSAMPLING, 2);
    cap.set(CV_CAP_PROP_XI_DATA_FORMAT, 5);
    cap.set(CV_CAP_PROP_XI_AEAG, 0);
    cap.set(CV_CAP_PROP_XI_AUTO_WB, 0);
    cap.set(CV_CAP_PROP_XI_EXPOSURE, 1000);
    cap.set(CV_CAP_PROP_XI_GAIN, gain);
    cap.set(CV_CAP_PROP_XI_OUTPUT_DATA_BIT_DEPTH, 8);
    //cap.set(CV_CAP_PROP_XI_BPC,1);

    H = cap.get(CV_CAP_PROP_FRAME_HEIGHT);
    W = cap.get(CV_CAP_PROP_FRAME_WIDTH);
    if (H <= 0 || W <= 0) {
        fprintf(stderr, "camera: invalid frame size %dx%d\n", W, H);
        return 1;
    }

    // Index vectors: Ix/Ix3 hold 0..W-1 as a row, Iy/Iy3 hold 0..H-1 as a column.
    Mat ix(1, W, CV_32S);
    for (int i = 0; i < W; i++) { ix.at<int>(i) = i; }
    Ix.upload(ix);

    Mat iy(H, 1, CV_32S);
    for (int j = 0; j < H; j++) { iy.at<int>(j) = j; }
    Iy.upload(iy);

    Mat ix3(1, W, CV_32S);
    for (int i = 0; i < W; i++) { ix3.at<int>(i) = i; }
    Ix3.upload(ix3);

    Mat iy3(H, 1, CV_32S);
    for (int j = 0; j < H; j++) { iy3.at<int>(j) = j; }
    Iy3.upload(iy3);

    // Datagram socket towards argv[1]:SERVERPORT.
    int sockfd = -1;
    struct addrinfo hints, *servinfo, *p;
    int rv;
    int numbytes;

    memset(&hints, 0, sizeof hints);
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_DGRAM;

    if ((rv = getaddrinfo(argv[1], SERVERPORT, &hints, &servinfo)) != 0) {
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rv));
        return 1;
    }

    // loop through all the results and make a socket
    for (p = servinfo; p != NULL; p = p->ai_next) {
        if ((sockfd = socket(p->ai_family, p->ai_socktype,
                p->ai_protocol)) == -1) {
            perror("talker: socket");
            continue;
        }

        break;
    }

    if (p == NULL) {
        fprintf(stderr, "talker: failed to create socket\n");
        freeaddrinfo(servinfo);
        return 2;
    }

    // Start the capture thread(s) only once everything else is ready.
    pthread_t threads[NUM_THREADS];
    for (long tt = 0; tt < NUM_THREADS; tt++) {
        printf("In main: creating thread %ld\n", tt);
        int rc = pthread_create(&threads[tt], NULL, Track, (void *)tt);

        if (rc) {
            // Without a capture thread go0 is never raised and the loop
            // below would spin forever.
            fprintf(stderr, "ERROR in Creating threads..\n");
            close(sockfd);
            freeaddrinfo(servinfo);
            return 1;
        }
    }

    int frame = 0;
    const int limit = atoi(argv[2]);

    usleep(10000);

    time_t start, end;
    time(&start);

    // s0/s1: the two range tests; s2..s5: the four weighted-sum chains;
    // s10/s11: the two pixel counts.
    cv::cuda::Stream s0, s1, s2, s3, s4, s5, s10, s11;

    while (frame != limit)
    {
        // Busy-wait until the capture thread announces a frame.
        if (!go0)
            continue;

        frame = frame + 1;
        go0.store(false);
        src.upload(imgOriginalff);

        // Demosaic and colour conversion both run on the default stream, so
        // they execute in order and no stream is created per frame.
        cuda_bayer(src, dst, cv::cuda::Stream::Null());
        cv::cuda::cvtColor(dst, gpu_hsv, cv::COLOR_BGR2HSV);

        // NOTE(review): s0/s1 starting after the conversion relies on the
        // default stream's implicit synchronisation with other streams;
        // confirm the build does not use per-thread default streams.
        inRange_gpuMat2(gpu_hsv, mask, s1,  cv::Scalar(iLowH, iLowS, iLowV), cv::Scalar(iHighH, iHighS, iHighV));
        inRange_gpuMat2(gpu_hsv, mask3, s0, cv::Scalar(iLowH3, iLowS3, iLowV3), cv::Scalar(iHighH3, iHighS3, iHighV3));

        // First mask: pixel count plus per-column / per-row hit counts.
        s1.waitForCompletion();
        cv::cuda::countNonZero(mask, gpucount1, s10);
        cv::cuda::reduce(mask, Vx, 0, CV_REDUCE_SUM, CV_32S, s2);
        cv::cuda::reduce(mask, Vy, 1, CV_REDUCE_SUM, CV_32S, s3);

        // Second mask: same quantities.
        s0.waitForCompletion();
        cv::cuda::countNonZero(mask3, gpucount2, s11);
        cv::cuda::reduce(mask3, Vx3, 0, CV_REDUCE_SUM, CV_32S, s4);
        cv::cuda::reduce(mask3, Vy3, 1, CV_REDUCE_SUM, CV_32S, s5);

        // Weight the counts by their index ...
        cv::cuda::multiply(Vx, Ix, V1, 1, -1, s2);
        cv::cuda::multiply(Vy, Iy, V2, 1, -1, s3);
        cv::cuda::multiply(Vx3, Ix3, V13, 1, -1, s4);
        cv::cuda::multiply(Vy3, Iy3, V23, 1, -1, s5);

        // ... and collapse each weighted vector to a single sum.
        cv::cuda::reduce(V1, V1Sum, 1, CV_REDUCE_SUM, CV_32S, s2);
        cv::cuda::reduce(V2, V2Sum, 0, CV_REDUCE_SUM, CV_32S, s3);
        cv::cuda::reduce(V13, V1Sum3, 1, CV_REDUCE_SUM, CV_32S, s4);
        cv::cuda::reduce(V23, V2Sum3, 0, CV_REDUCE_SUM, CV_32S, s5);

        gpucount1.download(C1, s10);
        gpucount2.download(C2, s11);
        V1Sum.download(XM, s2);
        V2Sum.download(YM, s3);
        V1Sum3.download(XM3, s4);
        V2Sum3.download(YM3, s5);

        s10.waitForCompletion();
        s11.waitForCompletion();
        s2.waitForCompletion();
        s3.waitForCompletion();
        s4.waitForCompletion();
        s5.waitForCompletion();

        // Centroid = weighted sum / pixel count. An empty mask has count 0;
        // report -1 for it rather than dividing by zero.
        const int count1 = C1.at<int>(0, 0);
        const int count3 = C2.at<int>(0, 0);

        int POS[4] = { -1, -1, -1, -1 };
        if (count1 > 0) {
            POS[0] = XM.at<int>(0, 0) / count1;
            POS[1] = YM.at<int>(0, 0) / count1;
        }
        if (count3 > 0) {
            POS[2] = XM3.at<int>(0, 0) / count3;
            POS[3] = YM3.at<int>(0, 0) / count3;
        }

        numbytes = sendto(sockfd, POS, sizeof(POS), 0, p->ai_addr, p->ai_addrlen);
        if (numbytes == -1)
            perror("talker: sendto");
        // cout<<POS[1]<<" "<< POS[3]<<endl;
    }

    time(&end);
    double seconds = difftime(end, start);
    cout << "Time taken : " << seconds << " seconds" << endl;
    // time() has one-second resolution, so short runs can measure 0 seconds.
    double fps = (seconds > 0.0) ? frame / seconds : 0.0;
    cout << nff << endl;
    cout << "Estimated frames per second : " << fps << endl;

    close(sockfd);
    freeaddrinfo(servinfo);

    // NOTE(review): the capture thread is still running at this point and is
    // never joined; the purpose of this delay before exit is not evident
    // from the code — confirm whether it is still needed.
    sleep(100);
    return 0;
}










