I am working on optimizing a Single Producer Single Consumer (SPSC) queue in C++. The following is the miniman reproducible example for my implementation:
#include <atomic>
#include <cstddef>
#include <iostream>
#include <thread>
struct Queue {
alignas(64) std::atomic<size_t> write_index; // Align to cache line size
alignas(64) std::atomic<size_t> read_index; // Align to cache line size
int* data;
size_t capacity;
Queue(size_t cap) : write_index(0), read_index(0), capacity(cap + 1) {
data = new int[capacity];
}
~Queue() {
delete[] data;
}
bool push(int val) {
size_t write_index_local = write_index.load(std::memory_order_relaxed);
size_t next_write_index = (write_index_local + 1) % capacity;
if (next_write_index == read_index.load(std::memory_order_acquire)) {
// Queue is full
return false;
}
data[write_index_local] = val;
write_index.store(next_write_index, std::memory_order_release);
return true;
}
bool pop(int& val) {
const size_t write_index_local = write_index.load(std::memory_order_acquire);
const size_t read_index_local = read_index.load(std::memory_order_relaxed);
if (read_index_local == write_index_local) {
// Queue is empty
return false;
}
val = data[read_index_local];
size_t next_read_index = (read_index_local + 1) % capacity;
read_index.store(next_read_index, std::memory_order_release);
return true;
}
};
void producer(Queue& queue) {
for (int i = 0; i < 20; ++i) {
while (!queue.push(i)) {
// Busy-wait until there is space in the queue
}
std::cout << "Pushed: " << i << std::endl;
}
}
void consumer(Queue& queue) {
for (int i = 0; i < 20; ++i) {
int val;
while (!queue.pop(val)) {
// Busy-wait until there is an item to pop
}
std::cout << "Popped: " << val << std::endl;
}
}
int main() {
Queue queue(10); // Create a queue with capacity 10 (internally 11)
std::thread producer_thread(producer, std::ref(queue));
std::thread consumer_thread(consumer, std::ref(queue));
producer_thread.join();
consumer_thread.join();
return 0;
}
I'm observing two cache misses in pop function:
When fetching the write_index. When fetching the data at read_index. I want to optimize this to reduce the number of cache misses. Whenever write_index is changed, pop function has to fetch both write_index and the data. Ideally, I would like to fetch both the write_index and the data at the same time, to avoid cache miss while fetching data.
I thought about using prefetching techniques, but I'm not sure about the best approach for this scenario.
Is it possible to fetch both write_index and data at read_index together? Are there any specific data layout strategies or memory order considerations that can help in minimizing cache misses in this context?
data[read_index_local]
is going to be in cache if you're repeatedly callingpop
, why do you think that's not the case? Have you ran benchmarks?