I have an array A
with zeros and ones. I want to find the sum of all numbers in A
. I want to test two functions:
First function
void test1(int curIndex){
if(curIndex == size) return;
test1(curIndex+1);
s+=A[curIndex];
}
Second function
void test2(int curIndex){
if(curIndex == size) return;
s+=A[curIndex];
test2(curIndex+1);
}
I used the PAPI library to count the number of instructions, here is the entire experiment:
#include <iostream>
#include <fstream>
#include "Statistics.h"
using namespace std;
int size;
int *A;
int s;
void test3(int curIndex){
if(curIndex == size) return;
test3(curIndex+1);
s+=A[curIndex];
}
int main(int argc, char* argv[]){
size = atoi(argv[1]);
if(argc!=2){
cout<<"type ./executable size{odd integer}"<<endl;
return 1;
}
if(size%2!=1){
cout<<"size must be an odd number"<<endl;
return 1;
}
A = new int[size];
int i;
for(i=0;i<size;i++){
if(i%2==0){
A[i] = false;
}
else{
A[i] = true;
}
}
Statistics stat(1);
stat.start();
test3(0);
stat.stop();
stat.printWithHelp();
cout<<s<<endl;
return 0;
}
Here is the Statistics.h
file:
#ifndef TRIPLETPARSER_STATISTICS_H
#define TRIPLETPARSER_STATISTICS_H
#include <time.h>
#include <unistd.h>
#include <fstream>
#include <papi.h>
#include <iostream>
#include <iostream>
#define BILLION 1000000000LL
using namespace std;
class Statistics {
private:
timespec s, e;
/*
PAPI_BR_CN Conditional branch instructions
PAPI_BR_INS Branch instructions
PAPI_BR_MSP Conditional branch instructions mispredicted
PAPI_BR_NTK Conditional branch instructions not taken
PAPI_BR_PRC Conditional branch instructions correctly predicted
PAPI_BR_TKN Conditional branch instructions taken
PAPI_BR_UCN Unconditional branch instructions
PAPI_BRU_IDL Cycles branch units are idle
PAPI_BTAC_M Branch target address cache misses
PAPI_TLB_DM Data translation lookaside buffer misses
*/
int events[10]; // , PAPI_L2_TCA,PAPI_L3_TCM,PAPI_L3_TCA PAPI_BR_CN, PAPI_BR_PRC}; //type of events we are interested in
int num_hwcntrs; //total amount of events stored in 'events' array
long long values[10];
long long counters[10];
void handle_error(int err){
std::cerr << "PAPI error: " << err << std::endl;
}
public:
Statistics(int papi){
for(size_t i = 0; i< 10; i++)
counters[i]=0.0;
switch(papi){
case 0:
num_hwcntrs = 0;
break;
case 1:
num_hwcntrs = 6;
events[0] = PAPI_L2_TCA;
events[1] = PAPI_L3_TCA;
events[2] = PAPI_L3_TCM;
events[3] = PAPI_TOT_INS;
events[4] = PAPI_BR_INS;
events[5] = PAPI_BR_MSP;
break;
}
}
void start(){
for(size_t i = 0; i< 10; i++)
counters[i]=0.0;
if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
handle_error(1);
}
void start(float ratio){
if (num_hwcntrs != 0 && PAPI_start_counters(events, num_hwcntrs) != PAPI_OK)
handle_error(1);
}
void stop(){
if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
handle_error(1);
update();
}
void stop(float ratio){
if (num_hwcntrs != 0 && PAPI_stop_counters(values, num_hwcntrs) != PAPI_OK)
handle_error(1);
update();
}
void update(){
for(size_t i = 0; i < num_hwcntrs; i++)
counters[i] += values[i];
}
void print(){
for(int i=0;i<num_hwcntrs;i++)
std::cout << counters[i] << "\t";
//cout<<"L2 cache miss ratio: "<<counters[1]/(double)counters[0]<<endl;
//cout<<"L3 cache miss ratio: "<<counters[3]/(double)counters[2]<<endl;
}
void printWithHelp(){
cout<<"L2 accesses: "<<counters[0]<<endl;
cout<<"L2 miss/access ratio: "<<(double)counters[1]/counters[0]<<endl;
cout<<"L3 accesses: "<<counters[1]<<endl;
cout<<"L3 misses: "<<counters[2]<<endl;
cout<<"L3 miss/access ratio: "<<(double)counters[2]/counters[1]<<endl;
cout<<"Instructions: "<<counters[3]<<endl;
cout<<"Branches: "<<counters[4]<<endl;
cout<<"Branch mispredictions: "<<counters[5]<<endl;
cout<<"Branch miss/predict ratio: "<<(double)counters[5]/counters[4]<<endl;
}
void print(float avg_ratio){
for (int i = 0; i<num_hwcntrs; i++)
std::cout << (double)(avg_ratio*counters[i]) << "\t";
}
};
#endif //TRIPLETPARSER_STATISTICS_H
This is the output that I get from the first function when the size of A
is 111,111
L2 accesses: 24126
L2 miss/access ratio: 0.131559
L3 accesses: 3174
L3 misses: 587
L3 miss/access ratio: 0.18494
Instructions: 1022776
Branches: 178113
Branch mispredictions: 6976
Branch miss/predict ratio: 0.0391661
This is the output that I get from the second function when the size of A
is 111,111
L2 accesses: 7090
L2 miss/access ratio: 0.163752
L3 accesses: 1161
L3 misses: 507
L3 miss/access ratio: 0.436693
Instructions: 555860
Branches: 111189
Branch mispredictions: 25
Branch miss/predict ratio: 0.000224842
Why the difference in the results? The instructions reduce by half, the branch mispredictions are almost eliminated. What is happening here?
-fwrapv
than integer addition isn't necessarily associative. Adding togetherINT_MAX
,1
and-1
in one order results in an overflow with undefined behaviour, in the other order has a defined result. So in some cases (maybe not this one) gcc needs to be cautious in order to avoid violating the assumptions made in its own optimisations elsewhere, turning good code into bad.