// NetActivation.cc
// copyright 6 August 1997 Kevin Karplus

#include <iostream>     // for cerr, endl, flush
#include <math.h>       // for M_LN2, isfinite
#include <assert.h>     // for assert
using namespace std;

#include "Alphabet/Alphabet.h"
#include "AlphabetTuple/AlphabetTuple.h"
#include "Regularizer/BackgroundProbs.h"
#include "NetActivation.h"
#include "ActivationRecord.h"
#include "OneChain.h"
#include "InterfaceDescription.h"
#include "LearningParam.h"
#include "Globals.h"

//SUGATO : 7/1/99
#include "PredictMix.h"

NetActivation::NetActivation(const NeuralNet *n, int init_length)
{
    Net=n;
    Chain=0;
    Records = 0;
    Dummies = 0;
    Inputs = 0;
    ReAlloc(init_length);
}

void NetActivation::DeleteAll(void)
{
    if (!Records) return;
    Length=AllocLength;
    for (int lay=Net->num_layers()-1; lay>=0; lay--)
    {
        for (int j=layer_length(lay)-1; j>=0; j--)
            delete Records[lay][j];
        delete [] Records[lay];
        delete Dummies[lay];
        delete [] Inputs[lay];
    }
    delete [] Records;
    delete [] Dummies;
    delete [] Inputs;
    Records=0;
    AllocLength = Length = 0;
}

void NetActivation::ReAlloc(int new_length)
{
    DeleteAll();
    NumLayers = Net->num_layers();

    typedef ActivationRecord* arp;
    typedef arp* arpp;
    Dummies = new arp[NumLayers];
    Records = new arpp[NumLayers];

    Length=AllocLength=new_length;
    for (int lay=NumLayers-1; lay>=0; lay--)
    {
        Dummies[lay] = new ActivationRecord(Net->layer(lay));
        Records[lay] = new arp[layer_length(lay)];
        for (int j=layer_length(lay)-1; j>=0; j--)
        {
            Records[lay][j] = new ActivationRecord(Net->layer(lay));
            Records[lay][j]->tell_position(j);
        }
    }

    // Set up the Inputs array of arrays of arrays.
    // Each layer has length padded_layer_length(lay-1).
    Inputs = new const float** [NumLayers];
    for (int iolay=0; iolay<NumLayers; iolay++)
    {
        Inputs[iolay] = new const float* [padded_layer_length(iolay-1)];
    }
}

void NetActivation::set_chain(const OneChain *newchain)
{
    int chainlength = newchain->num_cols();
    if (chainlength > AllocLength)
    {
        ReAlloc(chainlength);
    }
    Chain = newchain;
    Length = chainlength;

    // Fill in each layer with pointers to the correct arrays in
    // the input chain or the records array.
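    // Layout note: for layer iolay, Inputs[iolay] holds
    // padded_layer_length(iolay-1) pointers.  The first
    // num_dummies = (size-len)/2 entries and every entry from
    // len+num_dummies onward all point at one shared "dummy" probability
    // vector (the chain's out-of-range column for the first layer,
    // Dummies[iolay-1]->probs() for the others), so windows that
    // overhang the ends of the chain still read valid vectors;
    // the len entries in between point at the real columns.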
    for (int iolay=0; iolay<NumLayers; iolay++)
    {
        int size = padded_layer_length(iolay-1);
        int len = layer_length(iolay-1);
        const float* dummy = iolay==0 ?
            Chain->probs_for_layer(0,-1) : Dummies[iolay-1]->probs();
        int num_dummies = (size-len)/2;
        int i;
        for (i=0; i<num_dummies; i++)
            Inputs[iolay][i] = dummy;
        for (i=size-1; i>=len+num_dummies; i--)
            Inputs[iolay][i] = dummy;
        // now fill in the real positions
        for (i=0; i<len; i++)
            Inputs[iolay][i+num_dummies] = iolay==0 ?
                Chain->probs_for_layer(0,i) : Records[iolay-1][i]->probs();
    }
}

ActivationRecord* NetActivation::record(int lay, int pos) const
{
    assert(lay>=0);
    int subscr = pos + Net->overhang(lay);
    if (subscr<0 || subscr>=layer_length(lay)) return Dummies[lay];
    return Records[lay][subscr];
}

void NetActivation::activate()
{
    assert(NumLayers == Net->num_layers());
    int lay;        // counter for layers
    // int w;       // counter for elements of window
    typedef const float* cfloatp;

    // activate one layer at a time, starting at primary inputs
    for (lay=0; lay<NumLayers; lay++)
    {
        int overhang=Net->overhang(lay);
        // activate one record at each position in the layer,
        // passing each such record its own window of inputs.
        // The first input window is Inputs[lay][0..windowsize-1].
        const float ** window = Inputs[lay];
        for (int j=-overhang; j<Length+overhang; j++)
        {
            record(lay,j)->compute_probs(window++);
        }
        Dummies[lay]->compute_dummy_probs();
    }
}

void NetActivation::test_outputs()
{
    // go backwards through layers
    for (int lay=NumLayers-1; lay>=0; lay--)
    {
        int overhang=Net->overhang(lay);
        const InterfaceDescription* ifd=Net->interface(lay+1);
        //const AlphabetTuple *A = ifd->Alpha;

        if (ifd->is_TrainTo() && !ifd->is_hidden()
            && Chain->structure(lay+1)==NULL
            && Chain->probs_for_layer(lay+1)==NULL)
        {
            // Chain has no training data, so set outputs to background
            cerr << "DEBUG: chain has no training data\n" << flush;
            const BackgroundProbs* NullProbs =
                Globals::background_probs(ifd->Alpha, ZeroIfNew);
            if (NullProbs == NULL)
            {
                cerr << "ERROR: need BackgroundProbs for "
                     << ifd->Alpha->name()
                     << " for chain with no training data\n";
                assert(NullProbs);
            }
            const float * null_probs = NullProbs->probs();
            for (int j=-overhang; j<Length+overhang; j++)
            {
                record(lay,j)->cost(null_probs);
            }
            cerr << "DEBUG: Cost is " << record(lay,0)->cost() << endl << flush;
        }
        else if (ifd->train_to_unique() && !ifd->is_hidden())
        {
            assert(overhang==0);
            for (int j=-overhang; j<Length+overhang; j++)
            {
                if (ifd->NetRegularizer)
                {
                    //int correct_out = Chain->correct_value(lay+1, j);
                    //const float * mix = record(lay,j)->probs();
                    //const float * in_counts = Chain->counts_for_layer(0,j);

                    // BAD CODE: The routine for computing cost_mix_output
                    // for a single correct output has to be added to
                    // PredictMix.cc
                    // double c = cost_mix_output(mix,ifd->NetRegularizer,in_counts,correct_out);
                    // record(lay,j)->set_cost(c*M_LN2);
                    // cost_mix_output returns the cost in bits, which has
                    // to be converted to nats
                    // record(lay,j)->set_outsum(1);
                }
                else
                {
                    int correct_out = Chain->correct_value(lay+1, j);
                    // If the correct output is less than zero, it signifies
                    // that at that structure position there is a gap rather
                    // than one correct output value.  If there is a gap
                    // column, we want to skip it rather than training or
                    // testing with it.  If there is a non-gap column, then
                    // we want to continue training.
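                    // Example (alphabet chosen only for illustration):
                    // with a three-state secondary-structure alphabet such
                    // as E/H/L, correct_value() returns an index 0..2 for a
                    // real column and a negative value for an all-gap
                    // column, so only non-gap columns contribute a cost.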
                    if (correct_out >= 0)
                    {
                        record(lay,j)->cost(correct_out);
                    }
                }
            }
        }
        else if (ifd->is_TrainTo() && !ifd->is_hidden())
        {
            for (int j=-overhang; j<Length+overhang; j++)
            {
                if (ifd->NetRegularizer)
                {
                    // cout << "lay: " << lay << endl << flush;
                    const float * out_probs = Chain->probs_for_layer(lay+1,j);
                    const float * in_counts = Chain->counts_for_layer(0,j);
                    const float * mix = record(lay,j)->probs();

                    //cerr << "Mixture Probs:" << endl << flush;
                    //for(int i=0; i<record(lay,j)->layer->num_out(); i++)
                    //    cerr << i << ": " << mix[i] << endl << flush;

                    //cerr << "Input Counts:" << endl << flush;
                    //for(int i=0; i<record(lay,j)->layer->input_interface()->num_units(); i++)
                    //    cerr << i << ": " << in_counts[i] << endl << flush;

                    //cerr << "Output Probs:" << endl << flush;
                    //for(int i=0; i<record(lay,j)->layer->input_interface()->num_units(); i++)
                    //    cerr << i << ": " << out_probs[i] << endl << flush;

                    double sum=0;
                    for (int i=0; i<record(lay,j)->layer()->input_interface()->num_units(); i++)
                        sum += out_probs[i];
                    double c = cost_mix_output(mix, ifd->NetRegularizer,
                                               in_counts, out_probs);
                    record(lay,j)->set_cost(c*M_LN2);
                    // cost_mix_output returns the cost in bits, which has
                    // to be converted to nats
                    record(lay,j)->set_outsum(sum);
                    //cerr << "Cost is :" << c << endl << endl << flush;
                    //cerr << "OutSum is :" << sum << endl << endl << flush;
                }
                else
                {
                    const float * correct_probs = Chain->probs_for_layer(lay+1,j);
                    record(lay,j)->cost(correct_probs);
                    //cerr << "Output Counts:" << endl << flush;
                    //for(int i=0; i<record(lay,j)->layer->output_interface()->num_units(); i++)
                    //    cerr << correct_probs[i] << endl << flush;
                    //cerr << "Cost is:" << record(lay,j)->cost() << endl << flush;
                }
            }
        }
    }
}

void NetActivation::back_propagate(double weight)
{
    const AlphabetTuple *A;
    int correct_out;
    int some_layer_trained=0;

    //cerr << Chain->name() << " weight= " << weight << "\n" << flush;

    // go backwards through layers
    for (int lay=NumLayers-1; lay>=0; lay--)
    {
        int overhang=Net->overhang(lay);
        const InterfaceDescription* ifd=Net->interface(lay+1);
        A = ifd->Alpha;

        if (ifd->train_to_unique() && !ifd->is_hidden())
        {
            assert(overhang==0);
            int prev_correct_out, next_correct_out;
            prev_correct_out = correct_out = next_correct_out =
                Chain->correct_value(lay+1, -overhang);
            for (int j=-overhang; j<Length+overhang; j++)
            {
                prev_correct_out = correct_out;
                correct_out = next_correct_out;
                if (j+1 < Length+overhang)
                {
                    next_correct_out = Chain->correct_value(lay+1, j+1);
                }
                double multiplier = weight
                    * ((correct_out==next_correct_out && correct_out==prev_correct_out)?
                        1.0 : Net->learning_params()->NetActChangeCorrectWeight)
                    * (record(lay,j)->most_out_weight()==0?
                        Net->learning_params()->NetActWrongWeight : 1.0);
                if (correct_out >= 0)
                {
                    // not a gap position, train normally
                    record(lay,j)->set_share_from_entropy(correct_out, multiplier);
                }
                else
                {
                    // GAP in training data, propagate nothing back
                    record(lay,j)->clear_partials();
                }
            }
            some_layer_trained=1;
        }
        else if (ifd->is_TrainTo() && !ifd->is_hidden())
        {
            for (int j=-overhang; j<Length+overhang; j++)
            {
                if (ifd->NetRegularizer)
                {
                    double * partials =
                        new double[ifd->NetRegularizer->num_components()];
                    const float * mix = record(lay,j)->probs();
                    const float * in_counts = Chain->counts_for_layer(0,j);
                    const float * out_probs = Chain->probs_for_layer(lay+1,j);
                    partials_of_cost_wrt_mix(partials, mix, ifd->NetRegularizer,
                                             in_counts, out_probs);
                    record(lay,j)->set_share_from_partials(partials);
                    delete [] partials;
                }
                else
                {
                    const float * correct_probs = Chain->probs_for_layer(lay+1,j);
                    record(lay,j)->set_share_from_entropy(correct_probs, 1);
                }
            }
            some_layer_trained=1;
        }
        else if (some_layer_trained)
        {
            // propagate from next layer

            // clear the partials for this layer
            for (int pos=-overhang; pos<Length+overhang; pos++)
                record(lay,pos)->clear_partials();

            // for each record in the next layer,
            // add to the partials for the inputs for this layer
            int next_overhang=Net->overhang(lay+1);
            int next_wind=Net->layer(lay+1)->num_wind();
            for (int next_pos= -next_overhang; next_pos<Length+next_overhang; next_pos++)
            {
                ActivationRecord *next_rec = record(lay+1, next_pos);
                for (int w=next_wind-1; w>=0; w--)
                {
                    int pos = next_pos + w - (next_wind-1)/2;
                    record(lay,pos)->add_partials_from_next(next_rec, w);
                }
            }

            // now use the partials that were computed to set the shares.
            for (int pos=-overhang; pos<Length+overhang; pos++)
                record(lay,pos)->set_share_from_partials();
        }
    }
}

// Added by Jes Frellsen
// For an already back-propagated set of activation records,
// calculate the partial derivatives with respect to the inputs
void NetActivation::add_InputPartials(double **partials) const
{
    // This layer is -1
    int next_lay = 0;
    const InterfaceDescription* next_ifd = Net->interface(next_lay);
    const NeuralLayer* next_layer = Net->layer(next_lay);

    int num_units = next_ifd->num_units();
    assert(num_units == next_layer->num_in());

    // Look up overhang and window size for the next layer
    int next_overhang=Net->overhang(next_lay);
    int next_wind=Net->layer(next_lay)->num_wind();

    // For each record in the next layer,
    // add to the partials for the inputs for this layer
    for (int next_pos = -next_overhang; next_pos < Length+next_overhang; next_pos++)
    {
        ActivationRecord* next_rec=record(next_lay, next_pos);

        // Add to the partials that the window covers in this layer
        for (int w=next_wind-1; w>=0; w--)
        {
            int pos = w + next_pos-(next_wind-1)/2;

            // Don't add partials for positions in the overhang
            if (0 <= pos && pos < Length)
            {
                // Set the partials for the inputs in this column
                int num_out = next_layer->num_out();
                for (int o=num_out-1; o>=0; o--)
                {
                    double in_share = next_layer->gain(o) * next_rec->exp_share(o);
                    assert(isfinite(in_share));
                    for (int i=num_units-1; i>=0; i--)
                    {
                        partials[pos][i] += in_share * next_layer->weight(i,w,o);
                        assert(isfinite(partials[pos][i]));
                    }
                }
            }
        }
    }
}

short int *NetActivation::psec(int layer) const
{
    short int *str = new short int[Length];
    for (int i=0; i<Length; i++)
    {
        str[i] = record(layer,i)->highest_prob_output();
    }
    /* str[Length]=0; */
    return str;
}

//Change Log
// 21 March 1998 Kevin Karplus
//    Restored allocation to original intent (I hope),
//    and clarified documentation in .h files.
// 30 April 1998 Kevin Karplus
//    Added NetActChangeCorrectWeight
//    Moved ActWrongWeight to NetActWrongWeight
// 5 May 1998 Melissa Cline
//    Added functionality in back_propagate to skip over any columns
//    in which there's no correct answer - in which the correct output
//    is a gap character.
// 11 May 1998 Melissa Cline
//    Debugged functionality to skip over any columns in which
//    there's no correct answer.
// 28 July 1998 Kevin Karplus
//    Added weight to back_propagate, removed cost computation from
//    back_propagate.
// 15 September 1999 Sugato Basu
//    Added code to compute cost for ALIGNMENTS output format
// 15 September 1999 Sugato Basu
//    Added code to compute cost when NetRegularizer is used to predict
//    mixture components
// 12 October 2001 Kevin Karplus
//    Added missing "some_layer_trained" in back_propagate for
//    TrainTo but not train_to_unique.
// 20 Apr 2004 Sol Katzman
//    Make ActivationRecord data member Layer private, uppercase;
//    add public layer() function.
// Mon Jun 13 03:31:29 PDT 2005 Kevin Karplus
//    Picked up Jes Frellsen's additions of
//    calc_InputPartials and CorrectPredictedLetters
//    Modified indenting style to match my code.
// Mon Jun 13 05:37:58 PDT 2005 Kevin Karplus
//    Added isfinite assertions to calc_InputPartials
// Mon Jun 13 09:19:11 PDT 2005 Kevin Karplus
//    Eliminated CorrectPredictedLetters (using QualityRecords now)
// Sat Jul 9 07:39:49 PDT 2005 Kevin Karplus
//    Split calc_InputPartials into clear_InputPartials and
//    add_InputPartials
// Fri Jul 22 14:41:29 PDT 2005 Kevin Karplus
//    Added handling of chain without training data to test_outputs
//    (assumes background probabilities desired)
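
// Usage sketch (an assumption inferred from the methods defined above,
// not something specified by this file): a single training pass over
// one chain presumably looks like
//
//     NetActivation act(net, chain->num_cols());   // net is a const NeuralNet*
//     act.set_chain(chain);         // point Inputs at the chain's columns
//     act.activate();               // forward pass, layer by layer
//     act.test_outputs();           // record per-position costs
//     act.back_propagate(weight);   // push error shares back toward the inputs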