29

I have an array A with zeros and ones. I want to find the sum of all numbers in A. I want to test two functions:

First function

void test1(int curIndex){
    if(curIndex == size) return;
    test1(curIndex+1);
    s+=A[curIndex];
}

Second function

void test2(int curIndex){
    if(curIndex == size) return;
    s+=A[curIndex];
    test2(curIndex+1);
}

I used the PAPI library to count the number of instructions, here is the entire experiment:

#include <iostream>
#include <fstream>
#include "Statistics.h"

using namespace std;


int size;
int *A;
int s;

void test3(int curIndex){
    if(curIndex == size) return;
    test3(curIndex+1);
    s+=A[curIndex];
}

int main(int argc, char* argv[]){

    size = atoi(argv[1]);
    if(argc!=2){
        cout<<"type ./executable size{odd integer}"<<endl;
        return 1;
    }
    if(size%2!=1){
        cout<<"size must be an odd number"<<endl;
        return 1;
    }
    A = new int[size];
    int i;
    for(i=0;i<size;i++){
        if(i%2==0){
            A[i] = false;
        }
        else{
            A[i] = true;
        }
    }

    Statistics stat(1);
    stat.start();
    test3(0);
    stat.stop();
    stat.printWithHelp();
    cout<<s<<endl;

    return 0;
}

Here is the Statistics.h file:

#ifndef TRIPLETPARSER_STATISTICS_H
#define TRIPLETPARSER_STATISTICS_H

#include <time.h>
#include <unistd.h>
#include <fstream>
#include <papi.h>
#include <iostream>
#include <iostream>

#define BILLION  1000000000LL

using namespace std;

class Statistics {

private:
    timespec s, e;
    /*
    PAPI_BR_CN  Conditional branch instructions
    PAPI_BR_INS     Branch instructions
    PAPI_BR_MSP     Conditional branch instructions mispredicted
    PAPI_BR_NTK     Conditional branch instructions not taken
    PAPI_BR_PRC     Conditional branch instructions correctly predicted
    PAPI_BR_TKN     Conditional branch instructions taken
    PAPI_BR_UCN     Unconditional branch instructions
    PAPI_BRU_IDL    Cycles branch units are idle
    PAPI_BTAC_M     Branch target address cache misses

    PAPI_TLB_DM     Data translation lookaside buffer misses
    */
    int events[10];  // , PAPI_L2_TCA,PAPI_L3_TCM,PAPI_L3_TCA PAPI_BR_CN, PAPI_BR_PRC}; //type of events we are interested in
    int num_hwcntrs;  //total amount of events stored in 'events' array
    long long values[10];
    long long counters[10];

    void handle_error(int err){
        std::cerr << "PAPI error: " << err << std::endl;
    }

public:
    Statistics(int papi){
        for(size_t i = 0; i< 10; i++)
            counters[i]=0.0;

        switch(papi){
            case 0:
                num_hwcntrs = 0;
                break;
            case 1:
                num_hwcntrs = 6;
                events[0] = PAPI_L2_TCA;
                events[1] = PAPI_L3_TCA;
                events[2] = PAPI_L3_TCM;
                events[3] = PAPI_TOT_INS;
                events[4] = PAPI_BR_INS;
                events[5] = PAPI_BR_MSP;
                break;
        }

    }

    void start(){

        for(size_t i = 0; i< 10; i++)
            counters[i]=0.0;

        if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
            handle_error(1);
    }


    void start(float ratio){

        if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
            handle_error(1);
    }


    void stop(){
        if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
            handle_error(1);
        update();
    }

    void stop(float ratio){
        if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
            handle_error(1);
        update();
    }


    void update(){
        for(size_t i = 0; i < num_hwcntrs; i++)
            counters[i] += values[i];
    }

    void print(){
        for(int i=0;i<num_hwcntrs;i++)
            std::cout << counters[i] << "\t";
        //cout<<"L2 cache miss ratio: "<<counters[1]/(double)counters[0]<<endl;
        //cout<<"L3 cache miss ratio: "<<counters[3]/(double)counters[2]<<endl;
    }

    void printWithHelp(){
        cout<<"L2 accesses: "<<counters[0]<<endl;
        cout<<"L2 miss/access ratio: "<<(double)counters[1]/counters[0]<<endl;
        cout<<"L3 accesses: "<<counters[1]<<endl;
        cout<<"L3 misses: "<<counters[2]<<endl;
        cout<<"L3 miss/access ratio: "<<(double)counters[2]/counters[1]<<endl;
        cout<<"Instructions: "<<counters[3]<<endl;
        cout<<"Branches: "<<counters[4]<<endl;
        cout<<"Branch mispredictions: "<<counters[5]<<endl;
        cout<<"Branch miss/predict ratio: "<<(double)counters[5]/counters[4]<<endl;
    }

    void print(float avg_ratio){
        for (int i = 0; i<num_hwcntrs; i++)
            std::cout << (double)(avg_ratio*counters[i]) << "\t";
    }

};

#endif //TRIPLETPARSER_STATISTICS_H

This is the output that I get from the first function when the size of A is 111,111

L2 accesses: 24126
L2 miss/access ratio: 0.131559
L3 accesses: 3174
L3 misses: 587
L3 miss/access ratio: 0.18494
Instructions: 1022776
Branches: 178113
Branch mispredictions: 6976
Branch miss/predict ratio: 0.0391661

This is the output that I get from the second function when the size of A is 111,111

L2 accesses: 7090
L2 miss/access ratio: 0.163752
L3 accesses: 1161
L3 misses: 507
L3 miss/access ratio: 0.436693
Instructions: 555860
Branches: 111189
Branch mispredictions: 25
Branch miss/predict ratio: 0.000224842

Why the difference in the results? The instructions reduce by half, the branch mispredictions are almost eliminated. What is happening here?

10
  • 11
    @PSIAlt, the second is also recursive, albeit eligible for tail-call optimization (I think). Commented Sep 19, 2016 at 13:01
  • 5
    google "tail recursion". Because the recursive call is at the end, the compiler can skip the stack and not bother keeping track of each instance of calling the next step. (Recognizing that some code can be reduced to a special pattern is a lot harder than recognizing that it is already in the special pattern.) Commented Sep 19, 2016 at 13:06
  • 19
    "Why isn't the G++ compiler smart enough to realize that these two functions are exactly the same?" Because they're not... Commented Sep 19, 2016 at 13:06
  • 8
    The functions don't perform the additions in the same order. Deducing that they still produce the same result is pretty difficult. (Here is a string version that illustrates the difference.)
    – molbdnilo
    Commented Sep 19, 2016 at 13:40
  • 4
    @molbdnilo: for that matter, if you're not using -fwrapv than integer addition isn't necessarily associative. Adding together INT_MAX, 1 and -1 in one order results in an overflow with undefined behaviour, in the other order has a defined result. So in some cases (maybe not this one) gcc needs to be cautious in order to avoid violating the assumptions made in its own optimisations elsewhere, turning good code into bad. Commented Sep 19, 2016 at 18:22

1 Answer 1

55
+100

Your second function is tail-recursive. That means the compiler can optimize it to something like:

void test2(int curIndex){
  while(true)
  {
    if(curIndex == size) return;
    s+=A[curIndex];
    curIndex = curIndex + 1;
  }
}

That reduces the number of instructions significantly. It also reduces the number of stack frames needed to (at most) one. As a result, it uses a lot less memory, which results in the reduction of cache misses.

The compiler is not able to make this optimization for the first function.

UPDATE: Some people have asked why the compiler cannot make that optimization on the first function.

Let's start by observing the function is not tail-recursive. A function is tail recursive if the very last thing that happens is a recursive call to the same function, followed by returning the result of that recursive call (if any).

Clearly, that is not the case for the first function, s+=A[curIndex]; is executed after the recursive call.

So then people have asked why the compiler cannot turn the first function into the second one.

The question is "why does G++ not have this feature?" The answer to that question is always the same. Features are unimplemented by default; G++ does not have that feature because no one designed, implemented and shipped the feature to customers.

That should be the end of it, but of course people will want to know why nobody designed, implemented and tested this feature. Well, maybe nobody thought of doing it. But more importantly, the feature would be far from trivial.

First of all, the compiler would have to understand that

test1(curIndex+1);
s+=A[curIndex];

and

s+=A[curIndex];
test1(curIndex+1);

are equivalent. That is a non trivial observation, given that from a mechanical point of view they are not equivalent! Indeed, the first one effectively loops from the end of the array to the start, whereas the second one loops from the start to the end. Is that the same? It yields the same result when A is an int* (and s in an int), but it doesn't in other cases (e.g. when A is a double* and s is a double). Do we expect a compiler to be that smart?

So here we have a potential feature with a high cost to implement. But the cost may be worth it, if the benefit is high. Is the benefit high? I would guess that this happens very little in real code, i.e. developers are likely to write the second form anyway. So there you have it: an expensive feature with little benefit. IMHO, compiler developers are wise to spend their valuable time on more useful features.

9
  • 2
    But the big question is: Why is the compiler not able to optimize the first function? Or: Is there a compiler that do this optimization? Is the optimisation consistent with the C++ standard?
    – knivil
    Commented Sep 19, 2016 at 18:03
  • 2
    @knivil - the question for optimization is always "can a conforming program tell the difference?" Cache hits or misses are not detectable in a conforming program. Commented Sep 19, 2016 at 18:21
  • 12
    @Yakk In languages like C++, proving that you can interchange any operation with a non-inlinable function call is very difficult, to the point where many compilers don't even bother. I know GCC can do this in at least some cases for functions that it has been explicitly informed (with __attribute__) have no side effects, but this is not such a function.
    – zwol
    Commented Sep 19, 2016 at 19:03
  • 16
    @zwol Exactly! This question is less an example about how bad gcc is at reasoning on code, and more an example of how simple a problem can be and still look very much like the halting problem to a compiler
    – Steve Cox
    Commented Sep 19, 2016 at 19:55
  • 2
    @knivil please note that I am talking only about the optimisation of a tail recursive function, which is a special case of tail call optimisation. In the recursive case, only one function is involved, and it is easy to demonstrate the optimization in source. General TCO replaces a call with a jump, possibly into the body of a different function, and cannot always be demonstrated in source. Depending on the calling convention, it is still relatively easy at the assembly level though. Commented Sep 22, 2016 at 12:00

Not the answer you're looking for? Browse other questions tagged or ask your own question.