C++ Performance Optimization Techniques
C++ is known for its high performance, but fully unleashing that potential requires mastering a range of optimization techniques and best practices.
Compiler Optimization
Compilation options:
```bash
# Basic optimization
g++ -O2 program.cpp -o program

# Maximum optimization level
g++ -O3 program.cpp -o program

# Link-time optimization (LTO)
g++ -O3 -flto program.cpp -o program

# Architecture-specific optimization
g++ -O3 -march=native program.cpp -o program

# Enable vectorization (already implied by -O3 in modern GCC)
g++ -O3 -ftree-vectorize program.cpp -o program
```
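Profile-guided optimization (PGO) can go a step further by letting the compiler optimize for observed run-time behavior. A minimal sketch with GCC (the two-step flags are standard; the program name is a placeholder):

```bash
# Step 1: build with instrumentation, then run a representative workload
g++ -O3 -fprofile-generate program.cpp -o program
./program

# Step 2: rebuild using the collected profile data
g++ -O3 -fprofile-use program.cpp -o program
```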
Inline functions:
```cpp
// Explicit inline request
inline int add(int a, int b) { return a + b; }

// The compiler will usually auto-inline small functions anyway
int multiply(int a, int b) { return a * b; }

// Force inlining (GCC/Clang extension; combine with inline)
__attribute__((always_inline)) inline int divide(int a, int b) { return a / b; }

// Prevent inlining
__attribute__((noinline)) void heavyFunction() {
    // Complex computation
}
```
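On MSVC, the equivalent annotations are `__forceinline` and `__declspec(noinline)`; a brief sketch for comparison:

```cpp
// MSVC equivalents of the GCC/Clang attributes above
__forceinline int divideMsvc(int a, int b) { return a / b; }

__declspec(noinline) void heavyFunctionMsvc() {
    // Complex computation
}
```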
Memory Optimization
Data locality:
```cpp
// Not recommended: hot fields spread across cache lines
struct BadLayout {
    int a;
    char padding1[64];  // Pushes b onto another cache line
    int b;
    char padding2[64];
    int c;
};

// Recommended: hot fields packed together
struct GoodLayout {
    int a, b, c;  // Fit within a single cache line
};

// Array access patterns
void process(int value);  // Defined elsewhere
size_t randomIndex();     // Defined elsewhere

void processArray(int* arr, size_t size) {
    // Recommended: sequential access (hardware-prefetcher friendly)
    for (size_t i = 0; i < size; ++i) {
        process(arr[i]);
    }

    // Not recommended: random access defeats caching and prefetching
    for (size_t i = 0; i < size; ++i) {
        process(arr[randomIndex()]);
    }
}
```
Memory alignment:
```cpp
// Use alignas to specify alignment
struct alignas(64) AlignedData {
    int data[16];
};

// Use aligned_alloc (<cstdlib>, C++17) for aligned heap memory
void* alignedMemory = std::aligned_alloc(64, 1024);

// Query alignment requirements
constexpr size_t alignment = alignof(double);
```
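Two details worth knowing: `aligned_alloc` requires the size to be a multiple of the alignment and the memory must be released with `free()`, and since C++17 plain `new` also respects over-aligned types. A small sketch illustrating both:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>

struct alignas(64) AlignedData { int data[16]; };

int main() {
    // C++17: operator new honors the over-aligned type
    AlignedData* p = new AlignedData;
    assert(reinterpret_cast<std::uintptr_t>(p) % 64 == 0);
    delete p;

    // aligned_alloc: size (1024) must be a multiple of alignment (64)
    void* mem = std::aligned_alloc(64, 1024);
    std::free(mem);  // Release with free(), not delete
}
```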
Avoid memory fragmentation:
```cpp
#include <array>
#include <bitset>
#include <cstddef>

// Recommended: object pool to avoid repeated heap allocation
// (requires T to be default-constructible)
template <typename T, size_t PoolSize>
class ObjectPool {
private:
    std::array<T, PoolSize> pool;
    std::bitset<PoolSize> used;

public:
    T* allocate() {
        // std::bitset has no "find first unset" member; scan for a free slot
        for (size_t index = 0; index < PoolSize; ++index) {
            if (!used.test(index)) {
                used.set(index);
                return &pool[index];
            }
        }
        return nullptr;  // Pool exhausted
    }

    void deallocate(T* ptr) {
        size_t index = static_cast<size_t>(ptr - pool.data());
        used.reset(index);
    }
};
```
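Usage is straightforward; `Particle` here is a hypothetical payload type:

```cpp
struct Particle { float x, y, z; };

ObjectPool<Particle, 1024> pool;
Particle* p = pool.allocate();  // nullptr if the pool is exhausted
if (p) {
    p->x = 1.0f;
    pool.deallocate(p);  // Return the slot; no heap traffic involved
}
```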
Algorithm Optimization
Avoid unnecessary copies:
```cpp
// Returning a local by value is cheap in modern C++: the compiler applies
// copy elision (RVO) where it can, and otherwise the result is moved,
// not copied (C++11 and later). Only pre-C++11 code paid for a deep copy.
std::vector<int> createVector() {
    std::vector<int> temp;
    // Fill data
    return temp;  // Elided or moved, not copied
}

std::vector<int> vec = createVector();  // Move construction at worst
```
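When the source is a named lvalue you no longer need, `std::move` makes the transfer explicit; a small sketch:

```cpp
#include <utility>
#include <vector>

std::vector<int> a(1000000, 42);
std::vector<int> b = std::move(a);  // Steals a's buffer in O(1);
                                    // a is left valid but unspecified
```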
Use reference passing:
```cpp
// Not recommended: pass by value copies the entire vector
void process(std::vector<int> data) {
    // Process data
}

// Recommended: pass by const reference
void process(const std::vector<int>& data) {
    // Process data
}

// Use a non-const reference when the callee must modify the argument
void modify(std::vector<int>& data) {
    // Modify data
}
```
Choose appropriate containers:
```cpp
// Need random access: vector
std::vector<int> vec;

// Need frequent insertion at both ends: deque
std::deque<int> dq;

// Need frequent middle insertion/deletion: list
std::list<int> lst;

// Need key-based lookup: map (ordered) / unordered_map (hashed, average O(1))
std::map<std::string, int> mp;
std::unordered_map<std::string, int> ump;

// Need fast membership tests: set / unordered_set
std::set<int> s;
std::unordered_set<int> us;
```
Concurrency Optimization
Use thread pools:
```cpp
#include <condition_variable>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <vector>

class ThreadPool {
private:
    std::vector<std::thread> workers;
    std::queue<std::function<void()>> tasks;
    std::mutex queueMutex;
    std::condition_variable condition;
    bool stop;

public:
    explicit ThreadPool(size_t threads) : stop(false) {
        for (size_t i = 0; i < threads; ++i) {
            workers.emplace_back([this] {
                while (true) {
                    std::function<void()> task;
                    {
                        std::unique_lock<std::mutex> lock(queueMutex);
                        condition.wait(lock,
                                       [this] { return stop || !tasks.empty(); });
                        if (stop && tasks.empty()) return;
                        task = std::move(tasks.front());
                        tasks.pop();
                    }
                    task();
                }
            });
        }
    }

    // Note: std::result_of is deprecated in C++17 and removed in C++20;
    // use std::invoke_result_t there instead.
    template <class F, class... Args>
    auto enqueue(F&& f, Args&&... args)
        -> std::future<typename std::result_of<F(Args...)>::type> {
        using return_type = typename std::result_of<F(Args...)>::type;
        auto task = std::make_shared<std::packaged_task<return_type()>>(
            std::bind(std::forward<F>(f), std::forward<Args>(args)...));
        std::future<return_type> res = task->get_future();
        {
            std::unique_lock<std::mutex> lock(queueMutex);
            if (stop) throw std::runtime_error("enqueue on stopped ThreadPool");
            tasks.emplace([task]() { (*task)(); });
        }
        condition.notify_one();
        return res;
    }

    ~ThreadPool() {
        {
            std::unique_lock<std::mutex> lock(queueMutex);
            stop = true;
        }
        condition.notify_all();
        for (auto& worker : workers) {
            worker.join();
        }
    }
};
```
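A quick usage sketch for the pool above:

```cpp
#include <iostream>

int main() {
    ThreadPool pool(4);
    auto fut = pool.enqueue([](int x, int y) { return x + y; }, 2, 3);
    std::cout << fut.get() << '\n';  // Prints 5
}   // Destructor signals stop and joins all workers
```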
Use atomic operations:
```cpp
#include <atomic>
#include <mutex>

// Not recommended: a mutex for a simple counter
std::mutex mtx;
int counter = 0;
void increment() {
    std::lock_guard<std::mutex> lock(mtx);
    ++counter;
}

// Recommended: atomic operations avoid the lock entirely
std::atomic<int> atomicCounter(0);
void incrementAtomic() {
    atomicCounter.fetch_add(1, std::memory_order_relaxed);
}
```
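`memory_order_relaxed` is sufficient for a pure counter that is only read after all threads have joined; a self-contained sketch:

```cpp
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

int main() {
    std::atomic<int> hits(0);
    std::vector<std::thread> threads;
    for (int t = 0; t < 4; ++t) {
        threads.emplace_back([&] {
            for (int i = 0; i < 100000; ++i) {
                // Relaxed ordering: atomicity only, no synchronization needed
                hits.fetch_add(1, std::memory_order_relaxed);
            }
        });
    }
    for (auto& th : threads) th.join();
    std::cout << hits.load() << '\n';  // Always 400000
}
```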
Reduce lock contention:
```cpp
// Not recommended: one global lock serializes all access
std::mutex globalMutex;
std::map<int, int> globalMap;

// Recommended: sharded locks spread contention across shards
class ShardedMap {
private:
    static constexpr size_t NUM_SHARDS = 16;
    std::array<std::mutex, NUM_SHARDS> mutexes;
    std::array<std::map<int, int>, NUM_SHARDS> maps;

    size_t getShard(int key) const {
        return static_cast<size_t>(key) % NUM_SHARDS;
    }

public:
    void insert(int key, int value) {
        size_t shard = getShard(key);
        std::lock_guard<std::mutex> lock(mutexes[shard]);
        maps[shard][key] = value;
    }

    bool find(int key, int& value) {
        size_t shard = getShard(key);
        std::lock_guard<std::mutex> lock(mutexes[shard]);
        auto it = maps[shard].find(key);
        if (it != maps[shard].end()) {
            value = it->second;
            return true;
        }
        return false;
    }
};
```
SIMD Optimization
Use compiler intrinsics:
```cpp
#include <immintrin.h>

void addArrays(const float* a, const float* b, float* result, size_t size) {
    size_t i = 0;
    // AVX: process 8 floats per iteration (compile with -mavx or -march=native)
    for (; i + 8 <= size; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        __m256 vr = _mm256_add_ps(va, vb);
        _mm256_storeu_ps(&result[i], vr);
    }
    // Scalar tail for the remaining elements
    for (; i < size; ++i) {
        result[i] = a[i] + b[i];
    }
}
```
Use automatic vectorization:
```cpp
// The compiler can auto-vectorize simple loops. __restrict__ (a GCC/Clang
// extension) promises the arrays don't alias, and #pragma omp simd requests
// vectorization explicitly (compile with -fopenmp-simd or -fopenmp).
void addArrays(float* __restrict__ a, float* __restrict__ b,
               float* __restrict__ result, size_t size) {
#pragma omp simd
    for (size_t i = 0; i < size; ++i) {
        result[i] = a[i] + b[i];
    }
}
```
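To confirm the loop was actually vectorized, both major compilers can print a vectorization report (exact output wording varies by version):

```bash
# GCC: report which loops were vectorized
g++ -O3 -fopt-info-vec-optimized program.cpp -o program

# Clang equivalent
clang++ -O3 -Rpass=loop-vectorize program.cpp -o program
```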
Cache Optimization
Prefetch data:
```cpp
#include <xmmintrin.h>

void process(int value);  // Defined elsewhere

void processArray(int* arr, size_t size) {
    for (size_t i = 0; i < size; ++i) {
        // Prefetch one cache line ahead (16 ints = 64 bytes);
        // the cast keeps the call portable across compilers
        if (i + 16 < size) {
            _mm_prefetch(reinterpret_cast<const char*>(&arr[i + 16]),
                         _MM_HINT_T0);
        }
        process(arr[i]);
    }
}
```
Cache line alignment:
```cpp
// Avoid false sharing: give each counter its own cache line.
// alignas(64) already pads the struct to 64 bytes; the explicit
// padding member just makes the intent visible.
struct alignas(64) Counter {
    std::atomic<int> value;
    char padding[64 - sizeof(std::atomic<int>)];
};

class SharedCounters {
private:
    Counter counters[4];

public:
    void increment(int threadId) {
        counters[threadId % 4].value.fetch_add(1);
    }
};
```
Performance Profiling
Use profiling tools:
```bash
# Use perf (Linux)
perf record -g ./your_program
perf report

# Use gprof
g++ -pg program.cpp -o program
./program
gprof program gmon.out > analysis.txt

# Use Valgrind's callgrind
valgrind --tool=callgrind ./your_program
kcachegrind callgrind.out.<pid>
```
Use timers:
```cpp
#include <chrono>
#include <iostream>

class Timer {
private:
    std::chrono::high_resolution_clock::time_point start;

public:
    Timer() : start(std::chrono::high_resolution_clock::now()) {}

    double elapsed() const {
        auto end = std::chrono::high_resolution_clock::now();
        return std::chrono::duration<double>(end - start).count();
    }
};

// Usage
Timer timer;
// Execute code
std::cout << "Elapsed time: " << timer.elapsed() << " seconds" << std::endl;
```
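One caveat: `high_resolution_clock` is often just an alias for `system_clock` and is not guaranteed to be monotonic, so `steady_clock` is the safer choice for measuring intervals:

```cpp
#include <chrono>

class SteadyTimer {
private:
    std::chrono::steady_clock::time_point start;

public:
    SteadyTimer() : start(std::chrono::steady_clock::now()) {}

    double elapsed() const {
        // steady_clock never jumps backwards, even if the wall clock is adjusted
        return std::chrono::duration<double>(
                   std::chrono::steady_clock::now() - start).count();
    }
};
```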
Best Practices
1. Pre-allocate memory
```cpp
// Recommended: reserve capacity up front
std::vector<int> vec;
vec.reserve(10000);
for (int i = 0; i < 10000; ++i) {
    vec.push_back(i);  // No reallocations
}

// Not recommended: grow on demand
std::vector<int> vec2;
for (int i = 0; i < 10000; ++i) {
    vec2.push_back(i);  // Multiple reallocations as capacity grows
}
```
2. Use emplace instead of push
```cpp
// Recommended: construct the element in place
vec.emplace_back(arg1, arg2);

// Not recommended: may construct a temporary, then move/copy it
vec.push_back(Type(arg1, arg2));
```
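With a concrete element type, the difference looks like this:

```cpp
#include <string>
#include <utility>
#include <vector>

std::vector<std::pair<int, std::string>> pairs;

pairs.emplace_back(1, "one");               // Pair constructed in place
pairs.push_back(std::make_pair(2, "two"));  // Temporary pair, then moved
```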
3. Avoid premature optimization
```cpp
// Write clear code first
int sum = 0;
for (int val : data) {
    sum += val;
}

// Optimize only after profiling shows this is a bottleneck
if (isPerformanceCritical) {
    // Use SIMD or parallelization
}
```
4. Use constexpr for compile-time computation
```cpp
// Recommended: shift work to compile time
constexpr int factorial(int n) {
    return n <= 1 ? 1 : n * factorial(n - 1);
}

constexpr int result = factorial(10);  // Computed at compile time
```
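Since C++14, constexpr functions may also contain loops, and `static_assert` can verify the result at compile time:

```cpp
constexpr int factorialIter(int n) {
    int result = 1;
    for (int i = 2; i <= n; ++i) result *= i;  // Allowed in C++14 constexpr
    return result;
}

static_assert(factorialIter(10) == 3628800, "evaluated at compile time");
```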
5. Avoid virtual function call overhead
```cpp
// Avoid virtual dispatch in performance-critical inner loops
class FastProcessor {
public:
    void process(int value) {  // Non-virtual: easily inlined
        // Fast processing
    }
};

// Use CRTP for static polymorphism: the call resolves at compile time
template <typename Derived>
class Base {
public:
    void interface() {
        static_cast<Derived*>(this)->implementation();
    }
};

class Concrete : public Base<Concrete> {
public:
    void implementation() {
        // Concrete behavior, dispatched without a vtable
    }
};
```
Notes
- Performance optimization should be based on actual measurements, not guesses
- Prioritize optimizing algorithms and data structures over micro-optimizations
- Balance readability and maintainability against speed; don't over-optimize
- Use profiling tools to identify real bottlenecks
- Be aware of optimization differences across compilers and platforms
- Consider using modern C++ features (like move semantics) to improve performance
- Pay attention to cache coherence and memory barriers in multithreaded environments