RemoteSensingFrank/UAVProduct

View on GitHub
UAVProduct/SiftGPU/PyramidGL.cpp

Summary

Maintainability
Test Coverage
////////////////////////////////////////////////////////////////////////////
//    File:        PyramidGL.cpp
//    Author:        Changchang Wu
//    Description : implementation of PyramidGL/PyramidNaive/PyramidPackdc .
//
//
//    Copyright (c) 2007 University of North Carolina at Chapel Hill
//    All Rights Reserved
//
//    Permission to use, copy, modify and distribute this software and its
//    documentation for educational, research and non-profit purposes, without
//    fee, and without a written agreement is hereby granted, provided that the
//    above copyright notice and the following paragraph appear in all copies.
//    
//    The University of North Carolina at Chapel Hill make no representations
//    about the suitability of this software for any purpose. It is provided
//    'as is' without express or implied warranty. 
//
//    Please send BUG REPORTS to ccwu@cs.unc.edu
//
////////////////////////////////////////////////////////////////////////////

#include "GL/glew.h"
#include <iostream>
#include <iomanip>
#include <vector>
#include <algorithm>
#include <fstream>
#include <math.h>
#include <string.h>
using namespace std;

#include "GlobalUtil.h"
#include "GLTexImage.h"
#include "SiftGPU.h"
#include "ShaderMan.h"
#include "SiftPyramid.h"
#include "ProgramGLSL.h"
#include "PyramidGL.h"
#include "FrameBufferObject.h"


#if defined(__SSE__) || _MSC_VER > 1200
#define USE_SSE_FOR_SIFTGPU
#include <xmmintrin.h>
#else
//#pragma message( "SSE optimization off!\n" )
#endif


#define USE_TIMING()        double t, t0, tt;
#define OCTAVE_START()        if(GlobalUtil::_timingO){    t = t0 = CLOCK();    cout<<"#"<<i+_down_sample_factor<<"\t";    }
#define LEVEL_FINISH()        if(GlobalUtil::_timingL){    glFinish();    tt = CLOCK();cout<<(tt-t)<<"\t";    t = CLOCK();}
#define OCTAVE_FINISH()        if(GlobalUtil::_timingO)cout<<"|\t"<<(CLOCK()-t0)<<endl;


//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
PyramidNaive::PyramidNaive(SiftParam& sp): PyramidGL(sp)
{
    _texPyramid = NULL;
    _auxPyramid = NULL;
}

PyramidNaive::~PyramidNaive()
{
    DestroyPyramidData();
}

//align must be 2^i
void PyramidGL::    GetAlignedStorageSize(int num, int align,  int &fw, int &fh)
{
    if(num <=0)
    {
        fw = fh = 0;
    }else if(num < align*align)
    {
        fw = align;
        fh = (int)ceil(double(num) / fw);
    }else if(GlobalUtil::_NarrowFeatureTex)
    {
        double dn = double(num);
        int nb = (int) ceil(dn/GlobalUtil::_texMaxDim/align);    
        fw = align * nb;    
        fh = (int)ceil(dn /fw);/**/
    }else
    {
        double dn = double(num);
        int nb = (int) ceil(dn/GlobalUtil::_texMaxDim/align);
        fh = align * nb;
        if(nb <=1)
        {
            fw = (int)ceil(dn / fh);
            //align this dimension to blocksize
            fw = ((int) ceil(double(fw) /align))*align;
        }else
        {
            fw = GlobalUtil::_texMaxDim;
        }

    }


}

void PyramidGL::GetTextureStorageSize(int num, int &fw, int& fh)
{
    if(num <=0)
    {
        fw = fh = 0;
    }else if(num <= GlobalUtil::_FeatureTexBlock)
    {
        fw = num;
        fh = 1;
    }else if(GlobalUtil::_NarrowFeatureTex)
    {
        double dn = double(num);
        int nb = (int) ceil(dn/GlobalUtil::_texMaxDim/GlobalUtil::_FeatureTexBlock);    
        fw = GlobalUtil::_FeatureTexBlock * nb;    
        fh = (int)ceil(dn /fw);/**/
    }else
    {
        double dn = double(num);
        int nb = (int) ceil(dn/GlobalUtil::_texMaxDim/GlobalUtil::_FeatureTexBlock);
        fh = GlobalUtil::_FeatureTexBlock * nb;
        if(nb <=1)
        {
            fw = (int)ceil(dn / fh);

            //align this dimension to blocksize

            //
            if( fw < fh)
            {
                int temp = fh;
                fh = fw;
                fw = temp;
            }
        }else
        {
            fw = GlobalUtil::_texMaxDim;
        }
    }
}

void PyramidNaive::DestroyPyramidData()
{
    if(_texPyramid)
    {
        delete [] _texPyramid;
        _texPyramid = NULL;
    }
    if(_auxPyramid)
    {
        delete [] _auxPyramid;  
        _auxPyramid = NULL;
    }
}
PyramidGL::PyramidGL(SiftParam &sp):SiftPyramid(sp)
{
    _featureTex = NULL;
    _orientationTex = NULL;
    _descriptorTex = NULL;
    _histoPyramidTex = NULL;    
    //////////////////////////
    InitializeContext();
}

PyramidGL::~PyramidGL()
{
    DestroyPerLevelData();
    DestroySharedData();
    ShaderMan::DestroyShaders();
}

void PyramidGL::InitializeContext()
{
    GlobalUtil::InitGLParam(0);
    if(!GlobalUtil::_GoodOpenGL) return;

    //////////////////////////////////////////////
    ShaderMan::InitShaderMan(param);
}

void PyramidGL::DestroyPerLevelData()
{
    //integers vector to store the feature numbers.
    if(_levelFeatureNum)
    {
        delete [] _levelFeatureNum;
        _levelFeatureNum = NULL;
    }
    //texture used to store features
    if(    _featureTex)
    {
        delete [] _featureTex;
        _featureTex =    NULL;
    }
    //texture used for multi-orientation 
    if(_orientationTex)
    {
        delete [] _orientationTex;
        _orientationTex = NULL;
    }
    int no = _octave_num* param._dog_level_num;

    //two sets of vbos used to display the features
    if(_featureDisplayVBO)
    {
        glDeleteBuffers(no, _featureDisplayVBO);
        delete [] _featureDisplayVBO;
        _featureDisplayVBO = NULL;
    }
    if( _featurePointVBO)
    {
        glDeleteBuffers(no, _featurePointVBO);
        delete [] _featurePointVBO;
        _featurePointVBO = NULL;
    }

}

void PyramidGL::DestroySharedData()
{
    //histogram reduction
    if(_histoPyramidTex)
    {
        delete[]    _histoPyramidTex;
        _hpLevelNum = 0;
        _histoPyramidTex = NULL;
    }

    //descriptor storage shared by all levels
    if(_descriptorTex)
    {
        delete [] _descriptorTex;
        _descriptorTex = NULL;
    }
    //cpu reduction buffer.
    if(_histo_buffer)
    {
        delete[] _histo_buffer;
        _histo_buffer = 0;
    }
}

void PyramidNaive::FitHistogramPyramid()
{
    GLTexImage * tex, *htex;
    int hist_level_num = _hpLevelNum - _pyramid_octave_first; 

    tex = GetBaseLevel(_octave_min , DATA_KEYPOINT) + 2;
    htex = _histoPyramidTex + hist_level_num - 1;
    int w = tex->GetImgWidth() >> 1;
    int h = tex->GetImgHeight() >> 1;

    for(int k = 0; k <hist_level_num -1; k++, htex--)
    {
        if(htex->GetImgHeight()!= h || htex->GetImgWidth() != w)
        {    
            htex->SetImageSize(w, h);
            htex->ZeroHistoMargin();
        }

        w = (w + 1)>>1; h = (h + 1) >> 1;
    }
}

void PyramidNaive::FitPyramid(int w, int h)
{
    //(w, h) <= (_pyramid_width, _pyramid_height);

    _pyramid_octave_first = 0;
    //
    _octave_num  = GlobalUtil::_octave_num_default;

    int _octave_num_max = GetRequiredOctaveNum(min(w, h));

    if(_octave_num < 1 || _octave_num > _octave_num_max) 
    {
        _octave_num = _octave_num_max;
    }


    int pw = _pyramid_width>>1, ph = _pyramid_height>>1;
    while(_pyramid_octave_first + _octave_num < _pyramid_octave_num &&  
        pw >= w && ph >= h)
    {
        _pyramid_octave_first++;
        pw >>= 1;
        ph >>= 1;
    }

    for(int i = 0; i < _octave_num; i++)
    {
        GLTexImage * tex = GetBaseLevel(i + _octave_min);
        GLTexImage * aux = GetBaseLevel(i + _octave_min, DATA_KEYPOINT);
        for(int j = param._level_min; j <= param._level_max; j++, tex++, aux++)
        {
            tex->SetImageSize(w, h);
            aux->SetImageSize(w, h);
        }
        w>>=1;
        h>>=1;
    }
}
void PyramidNaive::InitPyramid(int w, int h, int ds)
{
    int wp, hp, toobig = 0;
    if(ds == 0)
    {
        _down_sample_factor = 0;
        if(GlobalUtil::_octave_min_default>=0)
        {
            wp = w >> GlobalUtil::_octave_min_default;
            hp = h >> GlobalUtil::_octave_min_default;
        }else 
        {
            wp = w << (-GlobalUtil::_octave_min_default);
            hp = h << (-GlobalUtil::_octave_min_default);
        }
        _octave_min = _octave_min_default;
    }else
    {
        //must use 0 as _octave_min; 
        _octave_min = 0;
        _down_sample_factor = ds;
        w >>= ds;
        h >>= ds;
        wp = w;
        hp = h; 

    }

    while(wp > GlobalUtil::_texMaxDim || hp > GlobalUtil::_texMaxDim)
    {
        _octave_min ++;
        wp >>= 1;
        hp >>= 1;
        toobig = 1;
    }

    while(GlobalUtil::_MemCapGPU > 0 && GlobalUtil::_FitMemoryCap  && (wp >_pyramid_width || hp > _pyramid_height) &&
        max(max(wp, hp), max(_pyramid_width, _pyramid_height)) >  1024 * sqrt(GlobalUtil::_MemCapGPU / 140.0))
    {
        _octave_min ++;
        wp >>= 1;
        hp >>= 1;
        toobig = 2;
    }

    if(toobig && GlobalUtil::_verbose)
    {
        std::cout<<(toobig == 2 ? "[**SKIP OCTAVES**]:\tExceeding Memory Cap (-nomc)\n" :
                    "[**SKIP OCTAVES**]:\tReaching the dimension limit (-maxd)!\n");
    }

    if( wp == _pyramid_width && hp == _pyramid_height && _allocated )
    {
        FitPyramid(wp, hp);
    }else if(GlobalUtil::_ForceTightPyramid || _allocated ==0)
    {
        ResizePyramid(wp, hp);
    }
    else if( wp > _pyramid_width || hp > _pyramid_height )
    {
        ResizePyramid(max(wp, _pyramid_width), max(hp, _pyramid_height));
        if(wp < _pyramid_width || hp < _pyramid_height)  FitPyramid(wp, hp);
    }
    else
    {
        //try use the pyramid allocated for large image on small input images
        FitPyramid(wp, hp);
    }

    //select the initial smoothing filter according to the new _octave_min
    ShaderMan::SelectInitialSmoothingFilter(_octave_min + _down_sample_factor, param);
}

void PyramidNaive::ResizePyramid( int w,  int h)
{
    //
    unsigned int totalkb = 0;
    int _octave_num_new, input_sz;
    int i, j;
    GLTexImage * tex, *aux;
    //

    if(_pyramid_width == w && _pyramid_height == h && _allocated) return;

    if(w > GlobalUtil::_texMaxDim || h > GlobalUtil::_texMaxDim) return ;

    if(GlobalUtil::_verbose && GlobalUtil::_timingS) std::cout<<"[Allocate Pyramid]:\t" <<w<<"x"<<h<<endl;
    //first octave does not change
    _pyramid_octave_first = 0;

    
    //compute # of octaves

    input_sz = min(w,h) ;


    _pyramid_width =  w;
    _pyramid_height =  h;

    //reset to preset parameters
    _octave_num_new  = GlobalUtil::_octave_num_default;

    if(_octave_num_new < 1) _octave_num_new = GetRequiredOctaveNum(input_sz)  ;

    if(_pyramid_octave_num != _octave_num_new)
    {
        //destroy the original pyramid if the # of octave changes
        if(_octave_num >0)
        {
            DestroyPerLevelData();
            DestroyPyramidData();
        }
        _pyramid_octave_num = _octave_num_new;
    }

    _octave_num = _pyramid_octave_num;

    int noct = _octave_num;
    int nlev = param._level_num;

    //    //initialize the pyramid
    if(_texPyramid==NULL)    _texPyramid = new GLTexImage[ noct* nlev ];
    if(_auxPyramid==NULL)    _auxPyramid = new GLTexImage[ noct* nlev ];


    tex = GetBaseLevel(_octave_min, DATA_GAUSSIAN);
    aux = GetBaseLevel(_octave_min, DATA_KEYPOINT);
    for(i = 0; i< noct; i++)
    {
        totalkb += (nlev * w * h * 16 / 1024);
        for( j = 0; j< nlev; j++, tex++)
        {
            tex->InitTexture(w, h);
            //tex->AttachToFBO(0);
        }
        //several auxilary textures are not actually required
        totalkb += ((nlev - 3) * w * h * 16 /1024);
        for( j = 0; j< nlev ; j++, aux++)
        {
            if(j < 2) continue;
            if(j >= nlev - 1) continue;
            aux->InitTexture(w, h, 0);
            //aux->AttachToFBO(0);
        }

        w>>=1;
        h>>=1;
    }

    totalkb += ResizeFeatureStorage();


    //
    _allocated = 1;

    if(GlobalUtil::_verbose && GlobalUtil::_timingS) std::cout<<"[Allocate Pyramid]:\t" <<(totalkb/1024)<<"MB\n";

}


int PyramidGL::ResizeFeatureStorage()
{
    int totalkb = 0;
    if(_levelFeatureNum==NULL)    _levelFeatureNum = new int[_octave_num * param._dog_level_num];
    std::fill(_levelFeatureNum, _levelFeatureNum+_octave_num * param._dog_level_num, 0); 

    int wmax = GetBaseLevel(_octave_min)->GetDrawWidth();
    int hmax = GetBaseLevel(_octave_min)->GetDrawHeight();
    int w ,h, i;

    //use a fbo to initialize textures..
    FrameBufferObject fbo;
    
    //
    if(_histo_buffer == NULL) _histo_buffer = new float[((size_t)1) << (2 + 2 * GlobalUtil::_ListGenSkipGPU)];
    //histogram for feature detection

    int num = (int)ceil(log(double(max(wmax, hmax)))/log(2.0));

    if( _hpLevelNum != num)
    {
        _hpLevelNum = num;
        if(GlobalUtil::_ListGenGPU)
        {
            if(_histoPyramidTex ) delete [] _histoPyramidTex;
            _histoPyramidTex = new GLTexImage[_hpLevelNum];
            w = h = 1 ;
            for(i = 0; i < _hpLevelNum; i++)
            {
                _histoPyramidTex[i].InitTexture(w, h, 0);
                _histoPyramidTex[i].AttachToFBO(0);
                w<<=1;
                h<<=1;
            }
        }
    }

    // (4 ^ (_hpLevelNum) -1 / 3) pixels
    if(GlobalUtil::_ListGenGPU) totalkb += (((1 << (2 * _hpLevelNum)) -1) / 3 * 16 / 1024);



    //initialize the feature texture

    int idx = 0, n = _octave_num * param._dog_level_num;
    if(_featureTex==NULL)    _featureTex = new GLTexImage[n];
    if(GlobalUtil::_MaxOrientation >1 && GlobalUtil::_OrientationPack2==0)
    {
        if(_orientationTex== NULL)        _orientationTex = new GLTexImage[n];
    }


    for(i = 0; i < _octave_num; i++)
    {
        GLTexImage * tex = GetBaseLevel(i+_octave_min);
        int fmax = int(tex->GetImgWidth()*tex->GetImgHeight()*GlobalUtil::_MaxFeaturePercent);
        int fw, fh;
        //
        if(fmax > GlobalUtil::_MaxLevelFeatureNum) fmax = GlobalUtil::_MaxLevelFeatureNum;
        else if(fmax < 32) fmax = 32;    //give it at least a space of 32 feature

        GetTextureStorageSize(fmax, fw, fh);
        
        for(int j = 0; j < param._dog_level_num; j++, idx++)
        {

            _featureTex[idx].InitTexture(fw, fh, 0);
            _featureTex[idx].AttachToFBO(0);
            //
            if(_orientationTex)
            {
                _orientationTex[idx].InitTexture(fw, fh, 0);
                _orientationTex[idx].AttachToFBO(0);
            }
        }
        totalkb += fw * fh * 16 * param._dog_level_num * (_orientationTex? 2 : 1) /1024;
    }


    //this just need be initialized once
    if(_descriptorTex==NULL)
    {
        //initialize feature texture pyramid
        wmax = _featureTex->GetImgWidth();
        hmax = _featureTex->GetImgHeight();

        int nf, ns;
        if(GlobalUtil::_DescriptorPPT)
        {
            //32*4 = 128. 
            nf = 32 / GlobalUtil::_DescriptorPPT;    // how many textures we need
            ns = max(4, GlobalUtil::_DescriptorPPT);            // how many point in one texture for one descriptor
        }else
        {
            //at least one, resue for visualization and other work
            nf = 1; ns = 4;
        }
        //
        _alignment = ns;
        //
        _descriptorTex = new GLTexImage[nf];

        int fw, fh;
        GetAlignedStorageSize(hmax*wmax* max(ns, 10), _alignment, fw, fh);

        if(fh < hmax ) fh = hmax;
        if(fw < wmax ) fw = wmax;

        totalkb += ( fw * fh * nf * 16 /1024);
        for(i =0; i < nf; i++)
        {
            _descriptorTex[i].InitTexture(fw, fh);
        }
    }else
    {
        int nf = GlobalUtil::_DescriptorPPT? 32 / GlobalUtil::_DescriptorPPT: 1;
        totalkb += nf * _descriptorTex[0].GetTexWidth() * _descriptorTex[0].GetTexHeight() * 16 /1024;
    }
    return totalkb;
}


void PyramidNaive::BuildPyramid(GLTexInput *input)
{
    USE_TIMING();
    GLTexPacked * tex;
    FilterProgram** filter;
    FrameBufferObject fbo;

    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    input->FitTexViewPort();

    for (int i = _octave_min; i < _octave_min + _octave_num; i++)
    {

        tex = (GLTexPacked*)GetBaseLevel(i);
        filter = ShaderMan::s_bag->f_gaussian_step;

        OCTAVE_START();

        if( i == _octave_min )
        {
            if(i < 0)    TextureUpSample(tex, input, 1<<(-i)    );            
            else        TextureDownSample(tex, input, 1<<i);
            ShaderMan::FilterInitialImage(tex, NULL);
        }else
        {
            TextureDownSample(tex, GetLevelTexture(i-1, param._level_ds)); 
            ShaderMan::FilterSampledImage(tex, NULL); 
        }
        LEVEL_FINISH();

        for(int j = param._level_min + 1; j <=  param._level_max ; j++, tex++, filter++)
        {
            // filtering
            ShaderMan::FilterImage(*filter, tex+1, tex, NULL);
            LEVEL_FINISH();
        }
        OCTAVE_FINISH();

    }
    if(GlobalUtil::_timingS)    glFinish();
    UnloadProgram();
}






GLTexImage*  PyramidNaive::GetLevelTexture(int octave, int level, int dataName)
{
    if(octave <_octave_min || octave > _octave_min + _octave_num) return NULL;
    switch(dataName)
    {
        case DATA_GAUSSIAN:
        case DATA_DOG:
        case DATA_GRAD:
        case DATA_ROT:
            return _texPyramid+ (_pyramid_octave_first + octave - _octave_min) * param._level_num + (level - param._level_min);
        case DATA_KEYPOINT:
            return _auxPyramid + (_pyramid_octave_first + octave - _octave_min) * param._level_num + (level - param._level_min);
        default:
            return NULL;
    }
}

GLTexImage*  PyramidNaive::GetLevelTexture(int octave, int level)
{
    return _texPyramid+ (_pyramid_octave_first + octave - _octave_min) * param._level_num 
        + (level - param._level_min);
}

//in the packed implementation
// DATA_GAUSSIAN, DATA_DOG, DATA_GAD will be stored in different textures.
GLTexImage*  PyramidNaive::GetBaseLevel(int octave, int dataName)
{
    if(octave <_octave_min || octave > _octave_min + _octave_num) return NULL;
    switch(dataName)
    {
        case DATA_GAUSSIAN:
        case DATA_DOG:
        case DATA_GRAD:
        case DATA_ROT:
            return _texPyramid+ (_pyramid_octave_first + octave - _octave_min) * param._level_num;
        case DATA_KEYPOINT:
            return _auxPyramid + (_pyramid_octave_first + octave - _octave_min) * param._level_num;
        default:
            return NULL;
    }
}









void PyramidNaive::ComputeGradient()
{

    int i, j;
    double  ts, t1;
    GLTexImage * tex;
    FrameBufferObject fbo;


    if(GlobalUtil::_timingS && GlobalUtil::_verbose)ts = CLOCK();
    
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);

    for ( i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        for( j = param._level_min + 1 ; j < param._level_max ; j++)
        {
            tex = GetLevelTexture(i, j);
            tex->FitTexViewPort();
            tex->AttachToFBO(0);
            tex->BindTex();
            ShaderMan::UseShaderGradientPass();
            tex->DrawQuadMT4();
        }
    }        

    if(GlobalUtil::_timingS && GlobalUtil::_verbose)
    {
        glFinish();
        t1 = CLOCK();    
        std::cout<<"<Compute Gradient>\t"<<(t1-ts)<<"\n";
    }

    UnloadProgram();
    GLTexImage::UnbindMultiTex(3);
    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);
}


//keypoint detection with subpixel localization
void PyramidNaive::DetectKeypointsEX()
{
    int i, j;
    double t0, t, ts, t1, t2;
    GLTexImage * tex, *aux;
    FrameBufferObject fbo;

    if(GlobalUtil::_timingS && GlobalUtil::_verbose)ts = CLOCK();
    
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    //extra gradient data required for visualization
    int gradient_only_levels[2] = {param._level_min +1, param._level_max};
    int n_gradient_only_level = GlobalUtil::_UseSiftGPUEX ? 2 : 1;
    for ( i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        for( j =0; j < n_gradient_only_level ; j++)
        {
            tex = GetLevelTexture(i, gradient_only_levels[j]);
            tex->FitTexViewPort();
            tex->AttachToFBO(0);
            tex->BindTex();
            ShaderMan::UseShaderGradientPass();
            tex->DrawQuadMT4();
        }
    }        

    if(GlobalUtil::_timingS && GlobalUtil::_verbose)
    {
        glFinish();
        t1 = CLOCK();
    }

    GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
    glDrawBuffers(2, buffers);
    for ( i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        if(GlobalUtil::_timingO)
        {
            t0 = CLOCK();
            std::cout<<"#"<<(i + _down_sample_factor)<<"\t";
        }
        tex = GetBaseLevel(i) + 2;
        aux = GetBaseLevel(i, DATA_KEYPOINT) +2;
        aux->FitTexViewPort();

        for( j = param._level_min + 2; j <  param._level_max ; j++, aux++, tex++)
        {
            if(GlobalUtil::_timingL)t = CLOCK();        
            tex->AttachToFBO(0);
            aux->AttachToFBO(1);
            glActiveTexture(GL_TEXTURE0);
            tex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            (tex+1)->BindTex();
            glActiveTexture(GL_TEXTURE2);
            (tex-1)->BindTex();
            ShaderMan::UseShaderKeypoint((tex+1)->GetTexID(), (tex-1)->GetTexID());
            aux->DrawQuadMT8();
    
            if(GlobalUtil::_timingL)
            {
                glFinish();
                std::cout<<(CLOCK()-t)<<"\t";
            }
            tex->DetachFBO(0);
            aux->DetachFBO(1);
        }
        if(GlobalUtil::_timingO)
        {
            std::cout<<"|\t"<<(CLOCK()-t0)<<"\n";
        }
    }

    if(GlobalUtil::_timingS)
    {
        glFinish();
        t2 = CLOCK();
        if(GlobalUtil::_verbose) 
            std::cout    <<"<Get Keypoints ..  >\t"<<(t2-t1)<<"\n"
                        <<"<Extra Gradient..  >\t"<<(t1-ts)<<"\n";
    }
    UnloadProgram();
    GLTexImage::UnbindMultiTex(3);
    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);


}

void PyramidNaive::GenerateFeatureList(int i, int j)
{
    int hist_level_num = _hpLevelNum - _pyramid_octave_first; 
    int hist_skip_gpu = GlobalUtil::_ListGenSkipGPU; 
    int idx = i * param._dog_level_num + j;
    GLTexImage* htex, *ftex, *tex;
    tex = GetBaseLevel(_octave_min + i, DATA_KEYPOINT) + 2 + j;
    ftex = _featureTex + idx;
    htex = _histoPyramidTex + hist_level_num - 1 - i;

    ///
    glActiveTexture(GL_TEXTURE0);
    tex->BindTex();
    htex->AttachToFBO(0);
    int tight = ((htex->GetImgWidth() * 2 == tex->GetImgWidth() -1 || tex->GetTexWidth() == tex->GetImgWidth()) &&
                 (htex->GetImgHeight() *2 == tex->GetImgHeight()-1 || tex->GetTexHeight() == tex->GetImgHeight()));
    ShaderMan::UseShaderGenListInit(tex->GetImgWidth(), tex->GetImgHeight(), tight);
    htex->FitTexViewPort();
    //this uses the fact that no feature is on the edge.
    htex->DrawQuadReduction();

    //reduction..
    htex--;

    //this part might have problems on several GPUS
    //because the output of one pass is the input of the next pass
    //need to call glFinish to make it right
    //but too much glFinish makes it slow
    for(int k = 0; k <hist_level_num - i - 1 - hist_skip_gpu; k++, htex--)
    {
        htex->AttachToFBO(0);
        htex->FitTexViewPort();
        (htex+1)->BindTex();
        ShaderMan::UseShaderGenListHisto();
        htex->DrawQuadReduction();                    
    }

    //
    if(hist_skip_gpu == 0)
    {    
        //read back one pixel
        float fn[4], fcount;
        glReadPixels(0, 0, 1, 1, GL_RGBA , GL_FLOAT, fn);
        fcount = (fn[0] + fn[1] + fn[2] + fn[3]);
        if(fcount < 1) fcount = 0;


        _levelFeatureNum[ idx] = (int)(fcount);
        SetLevelFeatureNum(idx, (int)fcount);
        _featureNum += int(fcount);

        //
        if(fcount < 1.0) return;
    

        ///generate the feature texture

        htex=  _histoPyramidTex;

        htex->BindTex();

        //first pass
        ftex->AttachToFBO(0);
        if(GlobalUtil::_MaxOrientation>1)
        {
            //this is very important...
            ftex->FitRealTexViewPort();
            glClear(GL_COLOR_BUFFER_BIT);
            glFinish();
        }else
        {
            ftex->FitTexViewPort();
            //glFinish();
        }


        ShaderMan::UseShaderGenListStart((float)ftex->GetImgWidth(), htex->GetTexID());

        ftex->DrawQuad();
        //make sure it finishes before the next step
        ftex->DetachFBO(0);

        //pass on each pyramid level
        htex++;
    }else
    {

        int tw = htex[1].GetDrawWidth(), th = htex[1].GetDrawHeight();
        int fc = 0;
        glReadPixels(0, 0, tw, th, GL_RGBA , GL_FLOAT, _histo_buffer);    
        _keypoint_buffer.resize(0);
        for(int y = 0, pos = 0; y < th; y++)
        {
            for(int x= 0; x < tw; x++)
            {
                for(int c = 0; c < 4; c++, pos++)
                {
                    int ss =  (int) _histo_buffer[pos]; 
                    if(ss == 0) continue;
                    float ft[4] = {2 * x + (c%2? 1.5f:  0.5f), 2 * y + (c>=2? 1.5f: 0.5f), 0, 1 };
                    for(int t = 0; t < ss; t++)
                    {
                        ft[2] = (float) t; 
                        _keypoint_buffer.insert(_keypoint_buffer.end(), ft, ft+4);
                    }
                    fc += (int)ss; 
                }
            }
        }
        _levelFeatureNum[ idx] = fc;
        SetLevelFeatureNum(idx, fc);
        if(fc == 0)  return;
        _featureNum += fc;
        /////////////////////
        ftex->AttachToFBO(0);
        if(GlobalUtil::_MaxOrientation>1)
        {
            ftex->FitRealTexViewPort();
            glClear(GL_COLOR_BUFFER_BIT);
            glFlush();
        }else
        {                    
            ftex->FitTexViewPort();
            glFlush();
        }
        _keypoint_buffer.resize(ftex->GetDrawWidth() * ftex->GetDrawHeight()*4, 0);
        ///////////
        glActiveTexture(GL_TEXTURE0);
        ftex->BindTex();
        glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, ftex->GetDrawWidth(),
            ftex->GetDrawHeight(), GL_RGBA, GL_FLOAT, &_keypoint_buffer[0]);
        htex += 2;
    }

    for(int lev = 1 + hist_skip_gpu; lev < hist_level_num  - i; lev++, htex++)
    {

        glActiveTexture(GL_TEXTURE0);
        ftex->BindTex();
        ftex->AttachToFBO(0);
        glActiveTexture(GL_TEXTURE1);
        htex->BindTex();
        ShaderMan::UseShaderGenListStep(ftex->GetTexID(), htex->GetTexID());
        ftex->DrawQuad();
        ftex->DetachFBO(0);    
    }
    GLTexImage::UnbindMultiTex(2);

}

//generate feature list on GPU
void PyramidNaive::GenerateFeatureList()
{
    //generate the histogram0pyramid
    FrameBufferObject fbo;
    glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    double t1, t2; 
    int ocount, reverse = (GlobalUtil::_TruncateMethod == 1);
    _featureNum = 0;

    FitHistogramPyramid();

    //for(int i = 0, idx = 0; i < _octave_num; i++)
    FOR_EACH_OCTAVE(i, reverse)
    {
        //output
        if(GlobalUtil::_timingO)
        {
            t1= CLOCK();
            ocount = 0;
            std::cout<<"#"<<i+_octave_min + _down_sample_factor<<":\t";
        }
        //for(int j = 0; j < param._dog_level_num; j++, idx++)
        FOR_EACH_LEVEL(j, reverse)
        {

            if(GlobalUtil::_TruncateMethod && GlobalUtil::_FeatureCountThreshold > 0
                && _featureNum > GlobalUtil::_FeatureCountThreshold) 
            {
                _levelFeatureNum[i * param._dog_level_num + j] = 0;
                continue;
            }else
            {
                GenerateFeatureList(i, j); 
                if(GlobalUtil::_timingO)    
                {
                    int idx = i * param._dog_level_num + j;
                    std::cout<< _levelFeatureNum[idx] <<"\t";
                    ocount += _levelFeatureNum[idx];
                }
            }
        }
        if(GlobalUtil::_timingO)
        {    
            t2 = CLOCK(); 
            std::cout << "| \t" << int(ocount) << " :\t(" << (t2 - t1) << ")\n";
        }
    }
    if(GlobalUtil::_timingS)glFinish();
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features:\t"<<_featureNum<<"\n";
    }
}


void PyramidGL::GenerateFeatureDisplayVBO()
{
    //use a big VBO to save all the SIFT box vertices
    int w, h, esize; GLint bsize;
    int nvbo = _octave_num * param._dog_level_num;
    //initialize the vbos
    if(_featureDisplayVBO==NULL)
    {
        _featureDisplayVBO = new GLuint[nvbo];
        glGenBuffers( nvbo, _featureDisplayVBO );
    }
    if(_featurePointVBO == NULL)
    {
        _featurePointVBO = new GLuint[nvbo];
        glGenBuffers(nvbo, _featurePointVBO);
    }

    FrameBufferObject fbo;
    glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    glActiveTexture(GL_TEXTURE0);
    //
    GLTexImage & tempTex = *_descriptorTex;
    //
    for(int i = 0, idx = 0; i < _octave_num; i++)
    {
        for(int j = 0; j < param._dog_level_num; j ++, idx++)
        {
            GLTexImage * ftex  = _featureTex + idx;
            if(_levelFeatureNum[idx]<=0)continue;

            //copy the texture into vbo
            fbo.BindFBO();
            tempTex.AttachToFBO(0);

            ftex->BindTex();
            ftex->FitTexViewPort();
            ShaderMan::UseShaderCopyKeypoint();
            ftex->DrawQuad();

            glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB,  _featurePointVBO[ idx]);
            glGetBufferParameteriv(GL_PIXEL_PACK_BUFFER_ARB, GL_BUFFER_SIZE, &bsize);
            esize = ftex->GetImgHeight() * ftex->GetImgWidth()*sizeof(float) *4;

            //increase size when necessary
            if(bsize < esize)
            {
                glBufferData(GL_PIXEL_PACK_BUFFER_ARB, esize*3/2 ,    NULL, GL_STATIC_DRAW_ARB);
                glGetBufferParameteriv(GL_PIXEL_PACK_BUFFER_ARB, GL_BUFFER_SIZE, &bsize);
            }

            //read back if we have enough buffer
            if(bsize >= esize) glReadPixels(0, 0, ftex->GetImgWidth(), ftex->GetImgHeight(), GL_RGBA, GL_FLOAT, 0);
            else  glBufferData(GL_PIXEL_PACK_BUFFER_ARB, 0,    NULL, GL_STATIC_DRAW_ARB);
            glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);


            //box display vbo
            int count = _levelFeatureNum[idx]* 10;
            GetAlignedStorageSize(count, _alignment, w, h);
            w = (int)ceil(double(count)/ h);

            //input
            fbo.BindFBO();
            ftex->BindTex();

            //output
            tempTex.AttachToFBO(0);
            GlobalUtil::FitViewPort(w, h);
            //shader
            ShaderMan::UseShaderGenVBO(    (float)ftex->GetImgWidth(),  (float) w, 
                param.GetLevelSigma(j + param._level_min + 1));
            GLTexImage::DrawQuad(0,  (float)w, 0, (float)h);
        
            //
            glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, _featureDisplayVBO[ idx]);
            glGetBufferParameteriv(GL_PIXEL_PACK_BUFFER_ARB, GL_BUFFER_SIZE, &bsize);
            esize = w*h * sizeof(float)*4;
            //increase size when necessary
            if(bsize < esize)
            {
                glBufferData(GL_PIXEL_PACK_BUFFER_ARB, esize*3/2,    NULL, GL_STATIC_DRAW_ARB);
                glGetBufferParameteriv(GL_PIXEL_PACK_BUFFER_ARB, GL_BUFFER_SIZE, &bsize);
            }
            
            //read back if we have enough buffer    
            if(bsize >= esize)    glReadPixels(0, 0, w, h, GL_RGBA, GL_FLOAT, 0);
            else glBufferData(GL_PIXEL_PACK_BUFFER_ARB, 0,    NULL, GL_STATIC_DRAW_ARB);
            glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);



            
        }
    }
    glReadBuffer(GL_NONE);
    glFinish();

}





void PyramidNaive::GetFeatureOrientations()
{
    GLTexImage * gtex;
    GLTexImage * stex = NULL;
    GLTexImage * ftex = _featureTex;
    GLTexImage * otex = _orientationTex;
    int sid = 0; 
    int * count     = _levelFeatureNum;
    float sigma, sigma_step = powf(2.0f, 1.0f/param._dog_level_num);
    FrameBufferObject fbo;
    if(_orientationTex)
    {
        GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
        glDrawBuffers(2, buffers);
    }else
    {
        glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    }
    for(int i = 0; i < _octave_num; i++)
    {
        gtex = GetLevelTexture(i+_octave_min, param._level_min + 1);
        if(GlobalUtil::_SubpixelLocalization || GlobalUtil::_KeepExtremumSign)
            stex = GetBaseLevel(i+_octave_min, DATA_KEYPOINT) + 2;

        for(int j = 0; j < param._dog_level_num; j++, ftex++, otex++, count++, gtex++, stex++)
        {
            if(*count<=0)continue;

            sigma = param.GetLevelSigma(j+param._level_min+1);

            //
            ftex->FitTexViewPort();

            glActiveTexture(GL_TEXTURE0);
            ftex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            gtex->BindTex();
            //
            ftex->AttachToFBO(0);
            if(_orientationTex)        otex->AttachToFBO(1);
            if(!_existing_keypoints && (GlobalUtil::_SubpixelLocalization|| GlobalUtil::_KeepExtremumSign))
            {
                glActiveTexture(GL_TEXTURE2);
                stex->BindTex();
                sid = * stex;
            }
            ShaderMan::UseShaderOrientation(gtex->GetTexID(),
                gtex->GetImgWidth(), gtex->GetImgHeight(),
                sigma, sid, sigma_step, _existing_keypoints);
            ftex->DrawQuad();
    //        glFinish();
            
        }
    }

    GLTexImage::UnbindMultiTex(3);
    if(GlobalUtil::_timingS)glFinish();

    if(_orientationTex)    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);

}



//to compare with GPU feature list generation
void PyramidNaive::GenerateFeatureListCPU()
{

    FrameBufferObject fbo;
    _featureNum = 0;
    GLTexImage * tex = GetBaseLevel(_octave_min);
    float * mem = new float [tex->GetTexWidth()*tex->GetTexHeight()];
    vector<float> list;
    int idx = 0;
    for(int i = 0; i < _octave_num; i++)
    {
        for(int j = 0; j < param._dog_level_num; j++, idx++)
        {
            tex = GetBaseLevel(_octave_min + i, DATA_KEYPOINT) + j + 2;
            tex->BindTex();
            glGetTexImage(GlobalUtil::_texTarget, 0, GL_RED, GL_FLOAT, mem);
            //tex->AttachToFBO(0);
            //tex->FitTexViewPort();
            //glReadPixels(0, 0, tex->GetTexWidth(), tex->GetTexHeight(), GL_RED, GL_FLOAT, mem);
            //
            //make a list of 
            list.resize(0);
            float * p = mem;
            int fcount = 0 ;
            for(int k = 0; k < tex->GetTexHeight(); k++)
            {
                for( int m = 0; m < tex->GetTexWidth(); m ++, p++)
                {
                    if(*p==0)continue;
                    if(m ==0 || k ==0 || k >= tex->GetImgHeight() -1 || m >= tex->GetImgWidth() -1 ) continue;
                    list.push_back(m+0.5f);
                    list.push_back(k+0.5f);
                    list.push_back(0);
                    list.push_back(1);
                    fcount ++;


                }
            }
            if(fcount==0)continue;


            
            GLTexImage * ftex = _featureTex+idx;
            _levelFeatureNum[idx] = (fcount);
            SetLevelFeatureNum(idx, fcount);

            _featureNum += (fcount);


            int fw = ftex->GetImgWidth();
            int fh = ftex->GetImgHeight();

            list.resize(4*fh*fw);

            ftex->BindTex();
            ftex->AttachToFBO(0);
    //        glTexImage2D(GlobalUtil::_texTarget, 0, GlobalUtil::_iTexFormat, fw, fh, 0, GL_BGRA, GL_FLOAT, &list[0]);
            glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, fw, fh, GL_RGBA, GL_FLOAT, &list[0]);
            //
        }
    }
    GLTexImage::UnbindTex();
    delete[] mem;
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features:\t"<<_featureNum<<"\n";
    }
}

#define FEATURELIST_USE_PBO

void PyramidGL::ReshapeFeatureListCPU()
{
    //make a compact feature list, each with only one orientation
    //download orientations and the featue list
    //reshape it and upload it

    FrameBufferObject fbo;
    int i, szmax =0, sz;
    int n = param._dog_level_num*_octave_num;
    for( i = 0; i < n; i++)
    {
        sz = _featureTex[i].GetImgWidth() * _featureTex[i].GetImgHeight();
        if(sz > szmax ) szmax = sz;
    }
    float * buffer = new float[szmax*24];
    float * buffer1 = buffer;
    float * buffer2 = buffer + szmax*4;
    float * buffer3 = buffer + szmax*8;

    glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);

#ifdef FEATURELIST_USE_PBO
    GLuint ListUploadPBO;
    glGenBuffers(1, &ListUploadPBO); 
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB,  ListUploadPBO);
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, szmax * 8 * sizeof(float),    NULL, GL_STREAM_DRAW);
#endif

    _featureNum = 0;

#ifdef NO_DUPLICATE_DOWNLOAD
    const double twopi = 2.0*3.14159265358979323846;
    _keypoint_buffer.resize(0);
    float os = _octave_min>=0? float(1<<_octave_min): 1.0f/(1<<(-_octave_min));
    if(_down_sample_factor>0) os *= float(1<<_down_sample_factor); 
    float offset = GlobalUtil::_LoweOrigin? 0 : 0.5f;
#endif

    for(i = 0; i < n; i++)
    {
        if(_levelFeatureNum[i]==0)continue;

        _featureTex[i].AttachToFBO(0);
        _featureTex[i].FitTexViewPort();
        glReadPixels(0, 0, _featureTex[i].GetImgWidth(), _featureTex[i].GetImgHeight(),GL_RGBA, GL_FLOAT, buffer1);
        
        int fcount =0, ocount;
        float * src = buffer1;
        float * orientation  = buffer2;
        float * des = buffer3;
        if(GlobalUtil::_OrientationPack2 == 0)
        {    
            //read back orientations from another texture
            _orientationTex[i].AttachToFBO(0);
            glReadPixels(0, 0, _orientationTex[i].GetImgWidth(), _orientationTex[i].GetImgHeight(),GL_RGBA, GL_FLOAT, buffer2);
            //make the feature list
            for(int j = 0; j < _levelFeatureNum[i]; j++, src+=4, orientation+=4)
            {
                if(_existing_keypoints) 
                {
                    des[0] = src[0];
                    des[1] = src[1];
                    des[2] = orientation[0];
                    des[3] = src[3];            
                    fcount++;
                    des += 4;
                }else
                {
                    ocount = (int)src[2];
                    for(int k = 0 ; k < ocount; k++, des+=4)
                    {
                        des[0] = src[0];
                        des[1] = src[1];
                        des[2] = orientation[k];
                        des[3] = src[3];            
                        fcount++;
                    }
                }
            }
        }else
        {
            _featureTex[i].DetachFBO(0);
            const static double factor  = 2.0*3.14159265358979323846/65535.0;
            for(int j = 0; j < _levelFeatureNum[i]; j++, src+=4)
            {
                unsigned short * orientations = (unsigned short*) (&src[2]);
                if(_existing_keypoints) 
                {
                    des[0] = src[0];
                    des[1] = src[1];
                    des[2] = float( factor* orientations[0]);
                    des[3] = src[3];            
                    fcount++;
                    des += 4;
                }else
                {
                    if(orientations[0] != 65535)
                    {
                        des[0] = src[0];
                        des[1] = src[1];
                        des[2] = float( factor* orientations[0]);
                        des[3] = src[3];            
                        fcount++;
                        des += 4;

                        if(orientations[1] != 65535)
                        {
                            des[0] = src[0];
                            des[1] = src[1];
                            des[2] = float(factor* orientations[1]);
                            des[3] = src[3];            
                            fcount++;
                            des += 4;
                        }
                    }
                }
            }
        }

        if (fcount == 0){    _levelFeatureNum[i] = 0; continue;    }

        //texture size --------------
        SetLevelFeatureNum(i, fcount);
        int nfw = _featureTex[i].GetImgWidth();
        int nfh = _featureTex[i].GetImgHeight();
        int sz = nfh * nfw;
        if(sz > fcount) memset(des, 0, sizeof(float) * (sz - fcount) * 4);

#ifndef FEATURELIST_USE_PBO
        _featureTex[i].BindTex();
        glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, nfw, nfh, GL_RGBA, GL_FLOAT, buffer3);
        _featureTex[i].UnbindTex();
#else
        float* mem = (float*) glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB,  GL_WRITE_ONLY);
        memcpy(mem, buffer3,  sz * 4 * sizeof(float) );
        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
        _featureTex[i].BindTex();
        glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, nfw, nfh, GL_RGBA, GL_FLOAT, 0);
        _featureTex[i].UnbindTex();
#endif

#ifdef NO_DUPLICATE_DOWNLOAD
        if(fcount > 0)
        {
            float oss = os * (1 << (i / param._dog_level_num));
            _keypoint_buffer.resize((_featureNum + fcount) * 4);
            float* ds = &_keypoint_buffer[_featureNum * 4];
            float* fs = buffer3;
            for(int k = 0;  k < fcount; k++, ds+=4, fs+=4)
            {
                ds[0] = oss*(fs[0]-0.5f) + offset;    //x
                ds[1] = oss*(fs[1]-0.5f) + offset;    //y
                ds[3] = (float)fmod(twopi-fs[2], twopi);    //orientation, mirrored
                ds[2] = oss*fs[3];  //scale
            }
        }
#endif
        _levelFeatureNum[i] = fcount;
        _featureNum += fcount;
    }

    delete[] buffer;
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features MO:\t"<<_featureNum<<endl;
    }
    ///////////////////////////////////
#ifdef FEATURELIST_USE_PBO
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB,  0);
    glDeleteBuffers(1, &ListUploadPBO);
#endif
}



inline void PyramidGL::SetLevelFeatureNum(int idx, int fcount)
{
    int fw, fh;
    GLTexImage * ftex = _featureTex + idx;
    //set feature texture size. normally fh will be one
    GetTextureStorageSize(fcount, fw, fh);
    if(fcount >  ftex->GetTexWidth()*ftex->GetTexHeight())
    {
        ftex->InitTexture(fw, fh, 0);
        if(_orientationTex)            _orientationTex[idx].InitTexture(fw, fh, 0);

    }
    if(GlobalUtil::_NarrowFeatureTex)
        fh = fcount ==0? 0:(int)ceil(double(fcount)/fw);
    else
        fw = fcount ==0? 0:(int)ceil(double(fcount)/fh);
    ftex->SetImageSize(fw, fh);
    if(_orientationTex)        _orientationTex[idx].SetImageSize(fw, fh);
}

void PyramidGL::CleanUpAfterSIFT()
{
    GLTexImage::UnbindMultiTex(3);
    ShaderMan::UnloadProgram();
    FrameBufferObject::DeleteGlobalFBO();
    GlobalUtil::CleanupOpenGL();
}

void PyramidNaive::GetSimplifiedOrientation()
{
    //
    int idx = 0;
//    int n = _octave_num  * param._dog_level_num;
    float sigma, sigma_step = powf(2.0f, 1.0f/param._dog_level_num); 
    GLTexImage * ftex = _featureTex;

    FrameBufferObject fbo;
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    for(int i = 0; i < _octave_num; i++)
    {
        GLTexImage *gtex = GetLevelTexture(i+_octave_min, 2+param._level_min);
        for(int j = 0; j < param._dog_level_num; j++, ftex++,  gtex++, idx ++)
        {
            if(_levelFeatureNum[idx]<=0)continue;
            sigma = param.GetLevelSigma(j+param._level_min+1);

            //
            ftex->AttachToFBO(0);
            ftex->FitTexViewPort();

            glActiveTexture(GL_TEXTURE0);
            ftex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            gtex->BindTex();

            ShaderMan::UseShaderSimpleOrientation(gtex->GetTexID(), sigma, sigma_step);
            ftex->DrawQuad();
        }
    }

    GLTexImage::UnbindMultiTex(2);

}


#ifdef USE_SSE_FOR_SIFTGPU
    static inline float dotproduct_128d(float * p)
    {
        float z = 0.0f;
        __m128 sse =_mm_load_ss(&z);
        float* pf = (float*) (&sse);
        for( int i = 0; i < 32; i++, p+=4)
        {
            __m128 ps = _mm_loadu_ps(p);
            sse = _mm_add_ps(sse,  _mm_mul_ps(ps, ps));
        }
        return pf[0] + pf[1] + pf[2] + pf[3];

    }
    static inline void multiply_and_truncate_128d(float* p, float m)
    {
        float z = 0.2f;
        __m128 t = _mm_load_ps1(&z);
        __m128 r = _mm_load_ps1(&m);
        for(int i = 0; i < 32; i++, p+=4)
        {
            __m128 ps = _mm_loadu_ps(p);
            _mm_storeu_ps(p, _mm_min_ps(_mm_mul_ps(ps, r), t));
        }
    }
    static inline void multiply_128d(float* p, float m)
    {
        __m128 r = _mm_load_ps1(&m);
        for(int i = 0; i < 32; i++, p+=4)
        {
            __m128 ps = _mm_loadu_ps(p);
            _mm_storeu_ps(p, _mm_mul_ps(ps, r));
        }
    }
#endif


inline void PyramidGL::NormalizeDescriptor(int num, float*pd)
{

#ifdef USE_SSE_FOR_SIFTGPU
    for(int k = 0; k < num; k++, pd +=128)
    {
        float sq;
        //normalize and truncate to .2
        sq = dotproduct_128d(pd);        sq = 1.0f / sqrtf(sq);
        multiply_and_truncate_128d(pd, sq);

        //renormalize
        sq = dotproduct_128d(pd);        sq = 1.0f / sqrtf(sq);
        multiply_128d(pd, sq);
    }
#else
    //descriptor normalization runs on cpu for OpenGL implemenations
    for(int k = 0; k < num; k++, pd +=128)
    {
        int v;
        float* ppd, sq = 0;
        //int v;
        //normalize
        ppd = pd;
        for(v = 0 ; v < 128; v++, ppd++)    sq += (*ppd)*(*ppd);
        sq = 1.0f / sqrtf(sq);
        //truncate to .2
        ppd = pd;
        for(v = 0; v < 128; v ++, ppd++)    *ppd = min(*ppd*sq, 0.2f);

        //renormalize
        ppd = pd; sq = 0;
        for(v = 0; v < 128; v++, ppd++)    sq += (*ppd)*(*ppd);
        sq = 1.0f / sqrtf(sq);

        ppd = pd;
        for(v = 0; v < 128; v ++, ppd++)    *ppd = *ppd*sq;
    }

#endif
}

inline void PyramidGL::InterlaceDescriptorF2(int w, int h, float* buf, float* pd, int step)
{
    /*
    if(GlobalUtil::_DescriptorPPR == 8)
    {
        const int dstep = w * 128;
        float* pp1 = buf;
        float* pp2 = buf + step;

        for(int u = 0; u < h ; u++, pd+=dstep)
        {
            int v; 
            float* ppd = pd;
            for(v= 0; v < w; v++)
            {
                for(int t = 0; t < 8; t++)
                {
                    *ppd++ = *pp1++;*ppd++ = *pp1++;*ppd++ = *pp1++;*ppd++ = *pp1++;
                    *ppd++ = *pp2++;*ppd++ = *pp2++;*ppd++ = *pp2++;*ppd++ = *pp2++;
                }
                ppd += 64;
            }
            ppd = pd + 64;
            for(v= 0; v < w; v++)
            {
                for(int t = 0; t < 8; t++)
                {
                    *ppd++ = *pp1++;*ppd++ = *pp1++;*ppd++ = *pp1++;*ppd++ = *pp1++;
                    *ppd++ = *pp2++;*ppd++ = *pp2++;*ppd++ = *pp2++;*ppd++ = *pp2++;
                }
                ppd += 64;
            }
        }

    }else */
    if(GlobalUtil::_DescriptorPPR == 8)
    {
        //interlace
        for(int k = 0; k < 2; k++)
        {
            float* pp = buf + k * step;
            float* ppd = pd + k * 4;
            for(int u = 0; u < h ; u++)
            {
                int v; 
                for(v= 0; v < w; v++)
                {
                    for(int t = 0; t < 8; t++)
                    {
                        ppd[0] = pp[0];
                        ppd[1] = pp[1];
                        ppd[2] = pp[2];
                        ppd[3] = pp[3];
                        ppd += 8;
                        pp+= 4;
                    }
                    ppd += 64;
                }
                ppd += ( 64 - 128 * w );
                for(v= 0; v < w; v++)
                {
                    for(int t = 0; t < 8; t++)
                    {
                        ppd[0] = pp[0];
                        ppd[1] = pp[1];
                        ppd[2] = pp[2];
                        ppd[3] = pp[3];

                        ppd += 8;
                        pp+= 4;
                    }
                    ppd += 64;
                }
                ppd -=64;
            }
        }
    }else if(GlobalUtil::_DescriptorPPR == 4)
    {

    }



}
void PyramidGL::GetFeatureDescriptors()
{
    //descriptors...
    float sigma;
    int idx, i, j, k, w, h;
    int ndf = 32 / GlobalUtil::_DescriptorPPT; //number of textures
    int block_width = GlobalUtil::_DescriptorPPR;
    int block_height = GlobalUtil::_DescriptorPPT/GlobalUtil::_DescriptorPPR;
    float* pd =  &_descriptor_buffer[0], * pbuf  = NULL;
    vector<float>read_buffer, descriptor_buffer2;

    //use another buffer, if we need to re-order the descriptors
    if(_keypoint_index.size() > 0)
    {
        descriptor_buffer2.resize(_descriptor_buffer.size());
        pd = &descriptor_buffer2[0];
    }
    FrameBufferObject fbo;

    GLTexImage * gtex, *otex, * ftex;
    GLenum buffers[8] = { 
        GL_COLOR_ATTACHMENT0_EXT,        GL_COLOR_ATTACHMENT1_EXT ,
        GL_COLOR_ATTACHMENT2_EXT,        GL_COLOR_ATTACHMENT3_EXT ,
        GL_COLOR_ATTACHMENT4_EXT,        GL_COLOR_ATTACHMENT5_EXT ,
        GL_COLOR_ATTACHMENT6_EXT,        GL_COLOR_ATTACHMENT7_EXT ,
    };

    glDrawBuffers(ndf, buffers);
    glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);


    for( i = 0, idx = 0, ftex = _featureTex; i < _octave_num; i++)
    {
        gtex = GetBaseLevel(i + _octave_min, DATA_GRAD) + 1;
        otex = GetBaseLevel(i + _octave_min, DATA_ROT)  + 1;
        for( j = 0; j < param._dog_level_num; j++, ftex++, idx++, gtex++, otex++)
        {
            if(_levelFeatureNum[idx]==0)continue;

            sigma = IsUsingRectDescription()?  0 : param.GetLevelSigma(j+param._level_min+1);
            int count = _levelFeatureNum[idx] * block_width;
            GetAlignedStorageSize(count, block_width, w, h);
            h = ((int)ceil(double(count) / w)) * block_height;

            //not enought space for holding the descriptor data
            if(w > _descriptorTex[0].GetTexWidth() || h > _descriptorTex[0].GetTexHeight())
            {
                for(k = 0; k < ndf; k++)_descriptorTex[k].InitTexture(w, h);
            }
            for(k = 0; k < ndf; k++)    _descriptorTex[k].AttachToFBO(k);
            GlobalUtil::FitViewPort(w, h);
            glActiveTexture(GL_TEXTURE0);
            ftex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            gtex->BindTex();
            if(otex!=gtex)
            {
                glActiveTexture(GL_TEXTURE2);
                otex->BindTex();
            }

            ShaderMan::UseShaderDescriptor(gtex->GetTexID(), otex->GetTexID(), 
                w, ftex->GetImgWidth(), gtex->GetImgWidth(), gtex->GetImgHeight(), sigma);
            GLTexImage::DrawQuad(0, (float)w, 0, (float)h);

             //read back float format descriptors and do normalization on CPU
            int step = w*h*4;
            if((unsigned int)step*ndf > read_buffer.size())
            {
                read_buffer.resize(ndf*step);
            }
            pbuf = &read_buffer[0];
            
            //read back
            for(k = 0; k < ndf; k++, pbuf+=step)
            {
                glReadBuffer(GL_COLOR_ATTACHMENT0_EXT + k);
                if(GlobalUtil::_IsNvidia ||  w * h <=  16384) //were
                {
                    glReadPixels(0, 0, w, h, GL_RGBA, GL_FLOAT, pbuf);
                }else
                {
                    int hstep = 16384 / w; 
                    for(int kk = 0; kk < h; kk += hstep)
                        glReadPixels(0, kk, w, min(hstep, h - kk), GL_RGBA, GL_FLOAT, pbuf + w * kk * 4);
                }
            }
    
            //the following two steps run on cpu, so better cpu better speed
            //and release version can be a lot faster than debug version
            //interlace data on the two texture to get the descriptor
            InterlaceDescriptorF2(w / block_width, h / block_height, &read_buffer[0], pd, step);
            
            //need to do normalization
            //the new version uses SSE to speed up this part
            if(GlobalUtil::_NormalizedSIFT) NormalizeDescriptor(_levelFeatureNum[idx], pd);

            pd += 128*_levelFeatureNum[idx];
            glReadBuffer(GL_NONE);
        }
    }


    //finally, put the descriptor back to their original order for existing keypoint list.
    if(_keypoint_index.size() > 0)
    {
        for(i = 0; i < _featureNum; ++i)
        {
            int index = _keypoint_index[i];
            memcpy(&_descriptor_buffer[index*128], &descriptor_buffer2[i*128], 128 * sizeof(float));
        }
    }

    ////////////////////////
    GLTexImage::UnbindMultiTex(3); 
    glDrawBuffer(GL_NONE);
    ShaderMan::UnloadProgram();
    if(GlobalUtil::_timingS)glFinish();
    for(i = 0; i < ndf; i++) fbo.UnattachTex(GL_COLOR_ATTACHMENT0_EXT +i);

}


void PyramidGL::DownloadKeypoints()
{
    const double twopi = 2.0*3.14159265358979323846;
    int idx = 0;
    float * buffer = &_keypoint_buffer[0];
    vector<float> keypoint_buffer2;
    //use a different keypoint buffer when processing with an exisint features list
    //without orientation information. 
    if(_keypoint_index.size() > 0)
    {
        keypoint_buffer2.resize(_keypoint_buffer.size());
        buffer = &keypoint_buffer2[0];
    }
    float * p = buffer, *ps, sigma;
    GLTexImage * ftex = _featureTex;
    FrameBufferObject fbo;
    ftex->FitRealTexViewPort();
    /////////////////////
    float os = _octave_min>=0? float(1<<_octave_min): 1.0f/(1<<(-_octave_min));
    if(_down_sample_factor>0) os *= float(1<<_down_sample_factor); 
    float offset = GlobalUtil::_LoweOrigin? 0 : 0.5f;
    /////////////////////
    for(int i = 0; i < _octave_num; i++, os *= 2.0f)
    {
        
        for(int j = 0; j  < param._dog_level_num; j++, idx++, ftex++)
        {

            if(_levelFeatureNum[idx]>0)
            {    
                ftex->AttachToFBO(0);
                glReadPixels(0, 0, ftex->GetImgWidth(), ftex->GetImgHeight(),GL_RGBA, GL_FLOAT, p);
                ps = p;
                for(int k = 0;  k < _levelFeatureNum[idx]; k++, ps+=4)
                {
                    ps[0] = os*(ps[0]-0.5f) + offset;    //x
                    ps[1] = os*(ps[1]-0.5f) + offset;    //y
                    sigma = os*ps[3]; 
                    ps[3] = (float)fmod(twopi-ps[2], twopi);    //orientation, mirrored
                    ps[2] = sigma;  //scale
                }
                p+= 4* _levelFeatureNum[idx];
            }
        }
    }

    //put the feature into their original order

    if(_keypoint_index.size() > 0)
    {
        for(int i = 0; i < _featureNum; ++i)
        {
            int index = _keypoint_index[i];
            memcpy(&_keypoint_buffer[index*4], &keypoint_buffer2[i*4], 4 * sizeof(float));
        }
    }
}


void PyramidGL::GenerateFeatureListTex()
{
    //generate feature list texture from existing keypoints
    //do feature sorting in the same time?

    FrameBufferObject fbo;
    vector<float> list;
    int idx = 0;
    const double twopi = 2.0*3.14159265358979323846;
    float sigma_half_step = powf(2.0f, 0.5f / param._dog_level_num);
    float octave_sigma = _octave_min>=0? float(1<<_octave_min): 1.0f/(1<<(-_octave_min));
    float offset = GlobalUtil::_LoweOrigin? 0 : 0.5f; 
    if(_down_sample_factor>0) octave_sigma *= float(1<<_down_sample_factor); 

    
    std::fill(_levelFeatureNum, _levelFeatureNum + _octave_num * param._dog_level_num, 0);

    _keypoint_index.resize(0); // should already be 0
    for(int i = 0; i < _octave_num; i++, octave_sigma*= 2.0f)
    {
        for(int j = 0; j < param._dog_level_num; j++, idx++)
        {
            list.resize(0);
            float level_sigma = param.GetLevelSigma(j + param._level_min + 1) * octave_sigma;
            float sigma_min = level_sigma / sigma_half_step;
            float sigma_max = level_sigma * sigma_half_step;
            int fcount = 0 ;
            for(int k = 0; k < _featureNum; k++)
            {
                float * key = &_keypoint_buffer[k*4];
                float sigmak = key[2]; 

                //////////////////////////////////////
                if(IsUsingRectDescription()) sigmak = min(key[2], key[3]) / 12.0f; 

                if( (sigmak >= sigma_min && sigmak < sigma_max)
                    ||(sigmak < sigma_min && i ==0 && j == 0)
                    ||(sigmak > sigma_max && j == param._dog_level_num - 1&& 
                            (i == _octave_num -1  || GlobalUtil::_KeyPointListForceLevel0)))
                {
                    //add this keypoint to the list
                    list.push_back((key[0] - offset) / octave_sigma + 0.5f);
                    list.push_back((key[1] - offset) / octave_sigma + 0.5f);
                    if(IsUsingRectDescription())
                    {
                        list.push_back(key[2] / octave_sigma);
                        list.push_back(key[3] / octave_sigma);
                    }else
                    {
                        list.push_back((float)fmod(twopi-key[3], twopi));
                        list.push_back(key[2] / octave_sigma);
                    }
                    fcount ++;
                    //save the index of keypoints
                    _keypoint_index.push_back(k);
                }
            }

            _levelFeatureNum[idx] = fcount;
            if(fcount==0)continue;
            GLTexImage * ftex = _featureTex+idx;

            SetLevelFeatureNum(idx, fcount);

            int fw = ftex->GetImgWidth();
            int fh = ftex->GetImgHeight();

            list.resize(4*fh*fw);

            ftex->BindTex();
            ftex->AttachToFBO(0);
            glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, fw, fh, GL_RGBA, GL_FLOAT, &list[0]);

            if( fcount == _featureNum) _keypoint_index.resize(0);
        }
        if( GlobalUtil::_KeyPointListForceLevel0 ) break;
    }
    GLTexImage::UnbindTex();
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features:\t"<<_featureNum<<"\n";
    }
}



PyramidPacked::PyramidPacked(SiftParam& sp): PyramidGL(sp)
{
    _allPyramid = NULL;
}

PyramidPacked::~PyramidPacked()
{
    DestroyPyramidData();
}


//build the gaussian pyrmaid

void PyramidPacked::BuildPyramid(GLTexInput * input)
{
    //
    USE_TIMING();
    GLTexImage * tex, *tmp;
    FilterProgram ** filter;
    FrameBufferObject fbo;

    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    input->FitTexViewPort();

    for (int i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        tex = GetBaseLevel(i);
        tmp = GetBaseLevel(i, DATA_DOG) + 2; //use this as a temperory texture

        
        filter = ShaderMan::s_bag->f_gaussian_step;

        OCTAVE_START();

        if( i == _octave_min )
        {
            if(i < 0)    TextureUpSample(tex, input, 1<<(-i-1));            
            else        TextureDownSample(tex, input, 1<<(i+1));
            ShaderMan::FilterInitialImage(tex, tmp); 
        }else
        {
            TextureDownSample(tex, GetLevelTexture(i-1, param._level_ds)); 
            ShaderMan::FilterSampledImage(tex, tmp); 
        }
        LEVEL_FINISH();        

        for(int j = param._level_min + 1; j <=  param._level_max ; j++, tex++, filter++)
        {
            // filtering
            ShaderMan::FilterImage(*filter, tex+1, tex, tmp);
            LEVEL_FINISH();
        }

        OCTAVE_FINISH();

    }
    if(GlobalUtil::_timingS)    glFinish();
    UnloadProgram();    
}

void PyramidPacked::ComputeGradient()
{
    
    //first pass, compute dog, gradient, orientation
    GLenum buffers[4] = { 
        GL_COLOR_ATTACHMENT0_EXT,        GL_COLOR_ATTACHMENT1_EXT ,
        GL_COLOR_ATTACHMENT2_EXT,        GL_COLOR_ATTACHMENT3_EXT
    };

    int i, j;
    double ts, t1;
    FrameBufferObject fbo;

    if(GlobalUtil::_timingS && GlobalUtil::_verbose)ts = CLOCK();

    for(i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        GLTexImage * gus = GetBaseLevel(i) +  1;
        GLTexImage * dog = GetBaseLevel(i, DATA_DOG) +  1;
        GLTexImage * grd = GetBaseLevel(i, DATA_GRAD) +  1;
        GLTexImage * rot = GetBaseLevel(i, DATA_ROT) +  1;
        glDrawBuffers(3, buffers);
        gus->FitTexViewPort();
        //compute the gradient
        for(j = 0; j <  param._dog_level_num ; j++, gus++, dog++, grd++, rot++)
        {
            //gradient, dog, orientation
            glActiveTexture(GL_TEXTURE0);
            gus->BindTex();
            glActiveTexture(GL_TEXTURE1);
            (gus-1)->BindTex();
            //output
            dog->AttachToFBO(0);
            grd->AttachToFBO(1);
            rot->AttachToFBO(2);
            ShaderMan::UseShaderGradientPass((gus-1)->GetTexID());
            //compute
            dog->DrawQuadMT4();
        }
    }
    if(GlobalUtil::_timingS)
    {
        glFinish();
        if(GlobalUtil::_verbose)
        {
            t1 = CLOCK();
            std::cout    <<"<Gradient, DOG  >\t"<<(t1-ts)<<"\n";
        }
    }
    GLTexImage::DetachFBO(1);
    GLTexImage::DetachFBO(2);
    UnloadProgram();
    GLTexImage::UnbindMultiTex(3);
    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);
}

void PyramidPacked::DetectKeypointsEX()
{

    //first pass, compute dog, gradient, orientation
    GLenum buffers[4] = { 
        GL_COLOR_ATTACHMENT0_EXT,        GL_COLOR_ATTACHMENT1_EXT ,
        GL_COLOR_ATTACHMENT2_EXT,        GL_COLOR_ATTACHMENT3_EXT
    };

    int i, j;
    double t0, t, ts, t1, t2;
    FrameBufferObject fbo;

    if(GlobalUtil::_timingS && GlobalUtil::_verbose)ts = CLOCK();

    for(i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        GLTexImage * gus = GetBaseLevel(i) + 1;
        GLTexImage * dog = GetBaseLevel(i, DATA_DOG) + 1;
        GLTexImage * grd = GetBaseLevel(i, DATA_GRAD) + 1;
        GLTexImage * rot = GetBaseLevel(i, DATA_ROT) + 1;
        glDrawBuffers(3, buffers);
        gus->FitTexViewPort();
        //compute the gradient
        for(j = param._level_min +1; j <=  param._level_max ; j++, gus++, dog++, grd++, rot++)
        {
            //gradient, dog, orientation
            glActiveTexture(GL_TEXTURE0);
            gus->BindTex();
            glActiveTexture(GL_TEXTURE1);
            (gus-1)->BindTex();
            //output
            dog->AttachToFBO(0);
            grd->AttachToFBO(1);
            rot->AttachToFBO(2);
            ShaderMan::UseShaderGradientPass((gus-1)->GetTexID());
            //compute
            dog->DrawQuadMT4();
        }
    }
    if(GlobalUtil::_timingS && GlobalUtil::_verbose)
    {
        glFinish();
        t1 = CLOCK();
    }
    GLTexImage::DetachFBO(1);
    GLTexImage::DetachFBO(2);
    //glDrawBuffers(1, buffers);
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    

    GlobalUtil::CheckErrorsGL();

    for ( i = _octave_min; i < _octave_min + _octave_num; i++)
    {
        if(GlobalUtil::_timingO)
        {
            t0 = CLOCK();
            std::cout<<"#"<<(i + _down_sample_factor)<<"\t";
        }
        GLTexImage * dog = GetBaseLevel(i, DATA_DOG) + 2;
        GLTexImage * key = GetBaseLevel(i, DATA_KEYPOINT) +2;
        GLTexImage * gus = GetBaseLevel(i) +  2;
        key->FitTexViewPort();

        for( j = param._level_min +2; j <  param._level_max ; j++, dog++, key++, gus++)
        {
            if(GlobalUtil::_timingL)t = CLOCK();        
            key->AttachToFBO(0);
            glActiveTexture(GL_TEXTURE0);
            dog->BindTex();
            glActiveTexture(GL_TEXTURE1);
            (dog+1)->BindTex();
            glActiveTexture(GL_TEXTURE2);
            (dog-1)->BindTex();
            if(GlobalUtil::_DarknessAdaption)
            {
                glActiveTexture(GL_TEXTURE3);
                gus->BindTex();
            }
            ShaderMan::UseShaderKeypoint((dog+1)->GetTexID(), (dog-1)->GetTexID());
            key->DrawQuadMT8();
            if(GlobalUtil::_timingL)
            {
                glFinish();
                std::cout<<(CLOCK()-t)<<"\t";
            }
        }
        if(GlobalUtil::_timingO)
        {
            glFinish();
            std::cout<<"|\t"<<(CLOCK()-t0)<<"\n";
        }
    }

    if(GlobalUtil::_timingS)
    {
        glFinish();
        if(GlobalUtil::_verbose) 
        {    
            t2 = CLOCK();
            std::cout    <<"<Gradient, DOG  >\t"<<(t1-ts)<<"\n"
                        <<"<Get Keypoints  >\t"<<(t2-t1)<<"\n";
        }
                        
    }
    UnloadProgram();
    GLTexImage::UnbindMultiTex(3);
    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);
}


void PyramidPacked::GenerateFeatureList(int i, int j)
{
    float fcount = 0.0f; 
    int hist_skip_gpu = GlobalUtil::_ListGenSkipGPU;
    int idx = i * param._dog_level_num + j; 
    int hist_level_num = _hpLevelNum - _pyramid_octave_first;
    GLTexImage * htex, * ftex, * tex;
    htex = _histoPyramidTex + hist_level_num - 1 - i;
    ftex = _featureTex + idx;
    tex = GetBaseLevel(_octave_min + i, DATA_KEYPOINT) + 2 + j;


    //fill zero to an extra row/col if the height/width is odd
    glActiveTexture(GL_TEXTURE0);
    tex->BindTex();
    htex->AttachToFBO(0);
    int tight = (htex->GetImgWidth() * 4 == tex->GetImgWidth() -1 && htex->GetImgHeight() *4 == tex->GetImgHeight()-1 );
    ShaderMan::UseShaderGenListInit(tex->GetImgWidth(), tex->GetImgHeight(), tight);
    htex->FitTexViewPort();
    //this uses the fact that no feature is on the edge.
    htex->DrawQuadReduction();
    //reduction..
    htex--;
    
    //this part might have problems on several GPUS
    //because the output of one pass is the input of the next pass
    //may require glFinish to make it right, but too much glFinish makes it slow
    for(int k = 0; k <hist_level_num - i-1 - hist_skip_gpu; k++, htex--)
    {
        htex->AttachToFBO(0);
        htex->FitTexViewPort();
        (htex+1)->BindTex();
        ShaderMan::UseShaderGenListHisto();
        htex->DrawQuadReduction();        
    }

    if(hist_skip_gpu == 0)
    {        
        //read back one pixel
        float fn[4];
        glReadPixels(0, 0, 1, 1, GL_RGBA , GL_FLOAT, fn);
        fcount = (fn[0] + fn[1] + fn[2] + fn[3]);
        if(fcount < 1) fcount = 0;

        _levelFeatureNum[ idx] = (int)(fcount);
        SetLevelFeatureNum(idx, (int)fcount);

        //save  number of features
        _featureNum += int(fcount);

        //
        if(fcount < 1.0)                 return;;


        ///generate the feature texture
        htex=  _histoPyramidTex;

        htex->BindTex();

        //first pass
        ftex->AttachToFBO(0);
        if(GlobalUtil::_MaxOrientation>1)
        {
            //this is very important...
            ftex->FitRealTexViewPort();
            glClear(GL_COLOR_BUFFER_BIT);
            glFinish();
        }else
        {    
            ftex->FitTexViewPort();
            //glFinish();
        }


        ShaderMan::UseShaderGenListStart((float)ftex->GetImgWidth(), htex->GetTexID());

        ftex->DrawQuad();
        //make sure it finishes before the next step
        ftex->DetachFBO(0);
        //pass on each pyramid level
        htex++;
    }else
    {

        int tw = htex[1].GetDrawWidth(), th = htex[1].GetDrawHeight();
        int fc = 0;
        glReadPixels(0, 0, tw, th, GL_RGBA , GL_FLOAT, _histo_buffer);    
        _keypoint_buffer.resize(0);
        for(int y = 0, pos = 0; y < th; y++)
        {
            for(int x= 0; x < tw; x++)
            {
                for(int c = 0; c < 4; c++, pos++)
                {
                    int ss =  (int) _histo_buffer[pos]; 
                    if(ss == 0) continue;
                    float ft[4] = {2 * x + (c%2? 1.5f:  0.5f), 2 * y + (c>=2? 1.5f: 0.5f), 0, 1 };
                    for(int t = 0; t < ss; t++)
                    {
                        ft[2] = (float) t; 
                        _keypoint_buffer.insert(_keypoint_buffer.end(), ft, ft+4);
                    }
                    fc += (int)ss; 
                }
            }
        }
        _levelFeatureNum[ idx] = fc;
        SetLevelFeatureNum(idx, fc);
        if(fc == 0)  return; 

        fcount = (float) fc;     
        _featureNum += fc;
        /////////////////////
        ftex->AttachToFBO(0);
        if(GlobalUtil::_MaxOrientation>1)
        {
            ftex->FitRealTexViewPort();
            glClear(GL_COLOR_BUFFER_BIT);
            glFlush();
        }else
        {                    
            ftex->FitTexViewPort();
            glFlush();
        }
        _keypoint_buffer.resize(ftex->GetDrawWidth() * ftex->GetDrawHeight()*4, 0);
        ///////////
        glActiveTexture(GL_TEXTURE0);
        ftex->BindTex();
        glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, ftex->GetDrawWidth(),
            ftex->GetDrawHeight(), GL_RGBA, GL_FLOAT, &_keypoint_buffer[0]);
        htex += 2; 
    }

    for(int lev = 1 + hist_skip_gpu; lev < hist_level_num  - i; lev++, htex++)
    {
        glActiveTexture(GL_TEXTURE0);
        ftex->BindTex();
        ftex->AttachToFBO(0);
        glActiveTexture(GL_TEXTURE1);
        htex->BindTex();
        ShaderMan::UseShaderGenListStep(ftex->GetTexID(), htex->GetTexID());
        ftex->DrawQuad();
        ftex->DetachFBO(0);    
    }

    ftex->AttachToFBO(0);
    glActiveTexture(GL_TEXTURE1);
    tex->BindTex();
    ShaderMan::UseShaderGenListEnd(tex->GetTexID());
    ftex->DrawQuad();
    GLTexImage::UnbindMultiTex(2);

}

void PyramidPacked::GenerateFeatureList()
{
    //generate the histogram pyramid
    FrameBufferObject fbo;
    glReadBuffer(GL_COLOR_ATTACHMENT0_EXT);
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    double t1, t2; 
    int ocount= 0, reverse = (GlobalUtil::_TruncateMethod == 1);
    _featureNum = 0;

    FitHistogramPyramid();
    //for(int i = 0, idx = 0; i < _octave_num; i++)
    FOR_EACH_OCTAVE(i, reverse)
    {
        if(GlobalUtil::_timingO)
        {
            t1= CLOCK();
            ocount = 0;
            std::cout<<"#"<<i+_octave_min + _down_sample_factor<<":\t";
        }
        //for(int j = 0; j < param._dog_level_num; j++, idx++)
        FOR_EACH_LEVEL(j, reverse)
        {
            if(GlobalUtil::_TruncateMethod && GlobalUtil::_FeatureCountThreshold > 0
                && _featureNum > GlobalUtil::_FeatureCountThreshold) 
            {
                _levelFeatureNum[i * param._dog_level_num + j] = 0;
                continue;
            }
            
            GenerateFeatureList(i, j); 

            if(GlobalUtil::_timingO)
            {
                int idx = i * param._dog_level_num + j;
                ocount += _levelFeatureNum[idx];
                std::cout<< _levelFeatureNum[idx] <<"\t";
            }
        }
        if(GlobalUtil::_timingO)
        {    
            t2 = CLOCK();
            std::cout << "| \t" << int(ocount) << " :\t(" << (t2 - t1) << ")\n";
        }
    }
    if(GlobalUtil::_timingS)glFinish();
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features:\t"<<_featureNum<<"\n";
    }

}

void PyramidPacked::GenerateFeatureListCPU()
{
    FrameBufferObject fbo;
    _featureNum = 0;
    GLTexImage * tex = GetBaseLevel(_octave_min);
    float * mem = new float [tex->GetTexWidth()*tex->GetTexHeight()*4];
    vector<float> list;
    int idx = 0;
    for(int i = 0; i < _octave_num; i++)
    {
        for(int j = 0; j < param._dog_level_num; j++, idx++)
        {
            tex = GetBaseLevel(_octave_min + i, DATA_KEYPOINT) + j + 2;
            tex->BindTex();
            glGetTexImage(GlobalUtil::_texTarget, 0, GL_RGBA, GL_FLOAT, mem);
            //tex->AttachToFBO(0);
            //tex->FitTexViewPort();
            //glReadPixels(0, 0, tex->GetTexWidth(), tex->GetTexHeight(), GL_RED, GL_FLOAT, mem);
            //
            //make a list of 
            list.resize(0);
            float *pl = mem;
            int fcount = 0 ;
            for(int k = 0; k < tex->GetDrawHeight(); k++)
            {
                float * p = pl; 
                pl += tex->GetTexWidth() * 4;
                for( int m = 0; m < tex->GetDrawWidth(); m ++, p+=4)
                {
                //    if(m ==0 || k ==0 || k == tex->GetDrawHeight() -1 || m == tex->GetDrawWidth() -1) continue;
                //    if(*p == 0) continue;
                    int t = ((int) fabs(p[0])) - 1;
                    if(t < 0) continue;
                    int xx = m + m + ( (t %2)? 1 : 0);
                    int yy = k + k + ( (t <2)? 0 : 1);
                    if(xx ==0 || yy == 0) continue;
                    if(xx >= tex->GetImgWidth() - 1 || yy >= tex->GetImgHeight() - 1)continue;
                    list.push_back(xx + 0.5f + p[1]);
                    list.push_back(yy + 0.5f + p[2]);
                    list.push_back(GlobalUtil::_KeepExtremumSign && p[0] < 0 ? -1.0f : 1.0f);
                    list.push_back(p[3]);
                    fcount ++;
                }
            }
            if(fcount==0)continue;

            if(GlobalUtil::_timingL) std::cout<<fcount<<".";
            
            GLTexImage * ftex = _featureTex+idx;
            _levelFeatureNum[idx] = (fcount);
            SetLevelFeatureNum(idx, fcount);

            _featureNum += (fcount);


            int fw = ftex->GetImgWidth();
            int fh = ftex->GetImgHeight();

            list.resize(4*fh*fw);

            ftex->BindTex();
            ftex->AttachToFBO(0);
    //        glTexImage2D(GlobalUtil::_texTarget, 0, GlobalUtil::_iTexFormat, fw, fh, 0, GL_BGRA, GL_FLOAT, &list[0]);
            glTexSubImage2D(GlobalUtil::_texTarget, 0, 0, 0, fw, fh, GL_RGBA, GL_FLOAT, &list[0]);
            //
        }
    }
    GLTexImage::UnbindTex();
    delete[] mem;
    if(GlobalUtil::_verbose)
    {
        std::cout<<"#Features:\t"<<_featureNum<<"\n";
    }
}



void PyramidPacked::GetFeatureOrientations()
{
    GLTexImage * gtex, * otex;
    GLTexImage * ftex = _featureTex;
    GLTexImage * fotex = _orientationTex; 
    int * count     = _levelFeatureNum;
    float sigma, sigma_step = powf(2.0f, 1.0f/param._dog_level_num);


    FrameBufferObject fbo;
    if(_orientationTex)
    {
        GLenum buffers[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
        glDrawBuffers(2, buffers);
    }else
    {
        glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    }

    for(int i = 0; i < _octave_num; i++)
    {
        gtex = GetBaseLevel(i+_octave_min, DATA_GRAD) + 1;
        otex = GetBaseLevel(i+_octave_min, DATA_ROT) + 1;

        for(int j = 0; j < param._dog_level_num; j++, ftex++, otex++, count++, gtex++, fotex++)
        {
            if(*count<=0)continue;

            sigma = param.GetLevelSigma(j+param._level_min+1);


            ftex->FitTexViewPort();

            glActiveTexture(GL_TEXTURE0);
            ftex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            gtex->BindTex();
            glActiveTexture(GL_TEXTURE2);
            otex->BindTex();
            //
            ftex->AttachToFBO(0);
            if(_orientationTex)        fotex->AttachToFBO(1);

            GlobalUtil::CheckFramebufferStatus();

            ShaderMan::UseShaderOrientation(gtex->GetTexID(),
                gtex->GetImgWidth(), gtex->GetImgHeight(),
                sigma, otex->GetTexID(), sigma_step, _existing_keypoints);
            ftex->DrawQuad();
        }
    }

    GLTexImage::UnbindMultiTex(3);
    if(GlobalUtil::_timingS)glFinish();
    if(_orientationTex)    fbo.UnattachTex(GL_COLOR_ATTACHMENT1_EXT);

}


void PyramidPacked::GetSimplifiedOrientation()
{
    //
    int idx = 0;
//    int n = _octave_num  * param._dog_level_num;
    float sigma, sigma_step = powf(2.0f, 1.0f/param._dog_level_num); 
    GLTexImage * ftex = _featureTex;

    FrameBufferObject fbo;
    glDrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
    for(int i = 0; i < _octave_num; i++)
    {
        GLTexImage *otex = GetBaseLevel(i + _octave_min, DATA_ROT) + 2;
        for(int j = 0; j < param._dog_level_num; j++, ftex++,  otex++, idx ++)
        {
            if(_levelFeatureNum[idx]<=0)continue;
            sigma = param.GetLevelSigma(j+param._level_min+1);
            //
            ftex->AttachToFBO(0);
            ftex->FitTexViewPort();

            glActiveTexture(GL_TEXTURE0);
            ftex->BindTex();
            glActiveTexture(GL_TEXTURE1);
            otex->BindTex();

            ShaderMan::UseShaderSimpleOrientation(otex->GetTexID(), sigma, sigma_step);
            ftex->DrawQuad();
        }
    }
    GLTexImage::UnbindMultiTex(2);
}

void PyramidPacked::InitPyramid(int w, int h, int ds)
{
    int wp, hp, toobig = 0;
    if(ds == 0)
    {
        _down_sample_factor = 0;
        if(GlobalUtil::_octave_min_default>=0)
        {
            wp = w >> GlobalUtil::_octave_min_default;
            hp = h >> GlobalUtil::_octave_min_default;
        }else 
        {
            wp = w << (-GlobalUtil::_octave_min_default);
            hp = h << (-GlobalUtil::_octave_min_default);
        }
        _octave_min = _octave_min_default;
    }else
    {
        //must use 0 as _octave_min; 
        _octave_min = 0;
        _down_sample_factor = ds;
        w >>= ds;
        h >>= ds;
        wp = w;
        hp = h; 
    }

    while(wp > GlobalUtil::_texMaxDim  || hp > GlobalUtil::_texMaxDim )
    {
        _octave_min ++;
        wp >>= 1;
        hp >>= 1;
        toobig = 1;
    }

    while(GlobalUtil::_MemCapGPU > 0 && GlobalUtil::_FitMemoryCap && (wp >_pyramid_width || hp > _pyramid_height) &&
        max(max(wp, hp), max(_pyramid_width, _pyramid_height)) >  1024 * sqrt(GlobalUtil::_MemCapGPU / 96.0) )
    {
        _octave_min ++;
        wp >>= 1;
        hp >>= 1;
        toobig = 2;
    }

    if(toobig && GlobalUtil::_verbose)
    {
        std::cout<<(toobig == 2 ? "[**SKIP OCTAVES**]:\tExceeding Memory Cap (-nomc)\n" :
                    "[**SKIP OCTAVES**]:\tReaching the dimension limit (-maxd)!\n");
    }

    if( wp == _pyramid_width && hp == _pyramid_height && _allocated )
    {
        FitPyramid(wp, hp);
    }else if(GlobalUtil::_ForceTightPyramid || _allocated ==0)
    {
        ResizePyramid(wp, hp);
    }
    else if( wp > _pyramid_width || hp > _pyramid_height )
    {
        ResizePyramid(max(wp, _pyramid_width), max(hp, _pyramid_height));
        if(wp < _pyramid_width || hp < _pyramid_height)  FitPyramid(wp, hp);
    }
    else
    {
        //try use the pyramid allocated for large image on small input images
        FitPyramid(wp, hp);
    }

    //select the initial smoothing filter according to the new _octave_min
    ShaderMan::SelectInitialSmoothingFilter(_octave_min + _down_sample_factor, param);
}



void PyramidPacked::FitPyramid(int w, int h)
{
    //(w, h) <= (_pyramid_width, _pyramid_height);

    _pyramid_octave_first = 0;
    //
    _octave_num  = GlobalUtil::_octave_num_default;

    int _octave_num_max = GetRequiredOctaveNum(min(w, h));

    if(_octave_num < 1 || _octave_num > _octave_num_max) 
    {
        _octave_num = _octave_num_max;
    }


    int pw = _pyramid_width>>1, ph = _pyramid_height>>1;
    while(_pyramid_octave_first + _octave_num < _pyramid_octave_num &&  
        pw >= w && ph >= h)
    {
        _pyramid_octave_first++;
        pw >>= 1;
        ph >>= 1;
    }

    for(int i = 0; i < _octave_num; i++)
    {
        GLTexImage * tex = GetBaseLevel(i + _octave_min);
        GLTexImage * dog = GetBaseLevel(i + _octave_min, DATA_DOG);
        GLTexImage * grd = GetBaseLevel(i + _octave_min, DATA_GRAD);
        GLTexImage * rot = GetBaseLevel(i + _octave_min, DATA_ROT);
        GLTexImage * key = GetBaseLevel(i + _octave_min, DATA_KEYPOINT);
        for(int j = param._level_min; j <= param._level_max; j++, tex++, dog++, grd++, rot++, key++)
        {
            tex->SetImageSize(w, h);
            if(j == param._level_min) continue;
            dog->SetImageSize(w, h);
            grd->SetImageSize(w, h);
            rot->SetImageSize(w, h);
            if(j == param._level_min + 1 || j == param._level_max) continue;
            key->SetImageSize(w, h);
        }
        w>>=1;
        h>>=1;
    }
}


void PyramidPacked::ResizePyramid( int w,  int h)
{
    //
    unsigned int totalkb = 0;
    int _octave_num_new, input_sz, i, j;
    //

    if(_pyramid_width == w && _pyramid_height == h && _allocated) return;

    if(w > GlobalUtil::_texMaxDim || h > GlobalUtil::_texMaxDim) return ;

    if(GlobalUtil::_verbose && GlobalUtil::_timingS) std::cout<<"[Allocate Pyramid]:\t" <<w<<"x"<<h<<endl;
    //first octave does not change
    _pyramid_octave_first = 0;

    
    //compute # of octaves

    input_sz = min(w,h) ;
    _pyramid_width =  w;
    _pyramid_height =  h;

    //reset to preset parameters
    _octave_num_new  = GlobalUtil::_octave_num_default;

    if(_octave_num_new < 1) _octave_num_new = GetRequiredOctaveNum(input_sz) ;


    if(_pyramid_octave_num != _octave_num_new)
    {
        //destroy the original pyramid if the # of octave changes
        if(_octave_num >0) 
        {
            DestroyPerLevelData();
            DestroyPyramidData();
        }
        _pyramid_octave_num = _octave_num_new;
    }

    _octave_num = _pyramid_octave_num;

    int noct = _octave_num;
    int nlev = param._level_num;

    //    //initialize the pyramid
    if(_allPyramid==NULL)    _allPyramid = new GLTexPacked[ noct* nlev * DATA_NUM];


    GLTexPacked * gus = (GLTexPacked *) GetBaseLevel(_octave_min, DATA_GAUSSIAN);
    GLTexPacked * dog = (GLTexPacked *) GetBaseLevel(_octave_min, DATA_DOG);
    GLTexPacked * grd = (GLTexPacked *) GetBaseLevel(_octave_min, DATA_GRAD);
    GLTexPacked * rot = (GLTexPacked *) GetBaseLevel(_octave_min, DATA_ROT);
    GLTexPacked * key = (GLTexPacked *) GetBaseLevel(_octave_min, DATA_KEYPOINT);


    ////////////there could be "out of memory" happening during the allocation

    for(i = 0; i< noct; i++)
    {
        for( j = 0; j< nlev; j++, gus++, dog++, grd++, rot++, key++)
        {
            gus->InitTexture(w, h);
            if(j==0)continue;
            dog->InitTexture(w, h);
            grd->InitTexture(w, h, 0);
            rot->InitTexture(w, h);
            if(j<=1 || j >=nlev -1) continue;
            key->InitTexture(w, h, 0);
        }
        int tsz = (gus -1)->GetTexPixelCount() * 16;
        totalkb += ((nlev *5 -6)* tsz / 1024);
        //several auxilary textures are not actually required
        w>>=1;
        h>>=1;
    }

    totalkb += ResizeFeatureStorage();

    _allocated = 1;

    if(GlobalUtil::_verbose && GlobalUtil::_timingS) std::cout<<"[Allocate Pyramid]:\t" <<(totalkb/1024)<<"MB\n";

}

void PyramidPacked::DestroyPyramidData()
{
    if(_allPyramid)
    {
        delete [] _allPyramid;
        _allPyramid = NULL;
    }
}


GLTexImage*  PyramidPacked::GetLevelTexture(int octave, int level, int dataName)
{
    return _allPyramid+ (_pyramid_octave_first + octave - _octave_min) * param._level_num 
        + param._level_num * _pyramid_octave_num * dataName
        + (level - param._level_min);

}

GLTexImage*  PyramidPacked::GetLevelTexture(int octave, int level)
{
    return _allPyramid+ (_pyramid_octave_first + octave - _octave_min) * param._level_num 
        + (level - param._level_min);
}

//in the packed implementation( still in progress)
// DATA_GAUSSIAN, DATA_DOG, DATA_GAD will be stored in different textures.

GLTexImage*  PyramidPacked::GetBaseLevel(int octave, int dataName)
{
    if(octave <_octave_min || octave > _octave_min + _octave_num) return NULL;
    int offset = (_pyramid_octave_first + octave - _octave_min) * param._level_num;
    int num = param._level_num * _pyramid_octave_num;
    return _allPyramid + num *dataName + offset;
}


void PyramidPacked::FitHistogramPyramid()
{
    GLTexImage * tex, *htex;
    int hist_level_num = _hpLevelNum - _pyramid_octave_first; 

    tex = GetBaseLevel(_octave_min , DATA_KEYPOINT) + 2;
    htex = _histoPyramidTex + hist_level_num - 1;
    int w = (tex->GetImgWidth() + 2) >> 2;
    int h = (tex->GetImgHeight() + 2)>> 2;


    //4n+1 -> n; 4n+2,2, 3 -> n+1
    for(int k = 0; k <hist_level_num -1; k++, htex--)
    {
        if(htex->GetImgHeight()!= h || htex->GetImgWidth() != w)
        {    
            htex->SetImageSize(w, h);
            htex->ZeroHistoMargin();
        }

        w = (w + 1)>>1; h = (h + 1) >> 1;
    }
}