I am working with this assignment of optimizing a radix sort code in C++ and I need to reduce the execution time, my code is working and it looks like this:

```
void RadixSort::RadixSortNaive(std::vector<long> &Arr) {
long Max_Value = findMax(Arr);
int Max_Radix = 1;
while (1) {
if (Max_Radix >= Max_Value) break;
Max_Radix = Max_Radix*radix_;
}
for (int i = 1; i < Max_Radix; i = i*radix_) {
for (int j = 0; j < key_length_; j++) {
int K;
if (Arr[j] < i) K = 0;
else K = (Arr[j] / i) % radix_;
Q[K].push(Arr[j]);
}
int idx = 0;
for (int j = 0; j < radix_; j++) {
while (Q[j].empty() == 0) {
Arr[idx] = Q[j].front();
Q[j].pop();
idx++;
}
}
}
class RadixSort{
public :
void setConfig(int key_length, int radix) {
key_length_ = key_length;
radix_ = radix;
for (int i = 0; i < radix_; i++) {
Q.push_back(std::queue<long>());
}
}
long findMax(std::vector<long> Arr) const {
long Max = 0;
for (int i = 0; i < key_length_; i++) {
if (Max < Arr[i])
Max = Arr[i];
}
return Max;
}
void RadixSortNaive(std::vector<long> &Arr);
void RadixSortStudent(std::vector<long> &Arr);
private:
int key_length_;
int radix_;
std::vector<std::queue<long>> Q;
};
}
```

However, I am sure that there is still room for improvement. I have been trying to implement parallelization with OMP library but nothings seems to work. Is there any way where I can improve the previous code? Maybe improving the loops or any other code optimization technique.

Source: Windows Questions C++