This post marks the beginning of my ventures into calculating square roots. However, I have no idea how the processor, more specifically the FPU with the fsqrt instruction, performs the process. I have included a better method for measuring time and also made some improvements to the method I wanted to demonstrate in that post, achieving fewer iterations than the original. However, due to the many divisions, it is still a bit slower than the incredible Newton’s method. Therefore, I will post two codes I developed during this time, for you to give your opinion and see what can be improved. Especially regarding the calculation of Carmack’s method.
Here is the improved code from the initial post:
#include <iostream>
#include <iomanip>
#include <cmath>
#include <chrono>
#include <limits>
#include <vector>
#include <random>
#include <algorithm>
#include <numeric>
// Constants
const int NUM_EXECUTIONS = 1000000;
const double precision = 1e-12; // Adjusted precision
bool showNewtonIteration = true;
bool showAnothersIteration = true;
struct IterationData {
double number;
double sqrtResult;
int iterations;
};
// Function to calculate a good initial approximation
double calculateInitial(double number) {
double initial = (number + 1) * 0.5; // Use multiplication instead of division
for (int i = 0; i < 5; i++) {
initial = 0.5 * (initial + number / initial);
}
return initial;
}
// Optimized Newton's method for finding the square root
double newtonSqrt(double number, double precision, int &iterations, bool showIteration) {
double guess = number * 0.5; // Starting guess
double nextGuess;
iterations = 0;
while (true) {
nextGuess = 0.5 * (guess + number / guess);
iterations++;
if (showIteration && showNewtonIteration) {
std::cout << "Newton's method - ";
std::cout << std::setprecision(std::numeric_limits<double>::max_digits10) << nextGuess << " and more...";
if (std::fabs(nextGuess - guess) < precision) {
std::cout << " reached desired precision";
}
std::cout << std::endl;
}
if (std::fabs(nextGuess - guess) < precision) {
break;
}
guess = nextGuess;
}
return nextGuess;
}
// Iterative another's method with dynamic initial approximation
double anothersMethod(double number, double precision, int &iterations, bool showIteration) {
double f = calculateInitial(number); // Using the initial approximation
double e;
double prev_f = 0;
iterations = 0;
while (true) {
f = 0.5 * (f + number / f); // Recalculate initial dynamically
e = f - (f * f - number) / (f + number / f); // New formula
// Check if the value repeats within the precision
if (std::fabs(f - prev_f) < precision) {
break;
}
prev_f = f;
f = e;
iterations++;
if (showIteration && showAnothersIteration) {
std::cout << "Another's method - ";
std::cout << std::setprecision(std::numeric_limits<double>::max_digits10) << f << " and more...";
if (std::fabs(f - prev_f) < precision) {
std::cout << " reached desired precision";
}
std::cout << std::endl;
}
}
return f;
}
// Function to calculate the error
double calculateError(double root, double number) {
double square = root * root;
return std::fabs(square - number);
}
void showIterations(double number, double &resultNewton, double &resultAnothers, int &iterationsNewton, int &iterationsAnothers) {
resultNewton = newtonSqrt(number, precision, iterationsNewton, true);
std::cout << std::setprecision(std::numeric_limits<double>::max_digits10)
<< "Newton's method final result: " << resultNewton << " reached desired precision, Iterations: " << iterationsNewton << "nn";
resultAnothers = anothersMethod(number, precision, iterationsAnothers, true);
std::cout << std::setprecision(std::numeric_limits<double>::max_digits10)
<< "Another's method final result: " << resultAnothers << " reached desired precision, Iterations: " << iterationsAnothers << "nn";
}
void measureTime(double* randomNumbers, std::vector<double> ×Sqrt, std::vector<double> ×Newton, std::vector<double> ×Anothers,
std::vector<IterationData> &newtonData, std::vector<IterationData> &anothersData) {
// Measuring time for standard sqrt function
std::cout << "Calculating times...n";
auto startSqrt = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_EXECUTIONS; i++) {
volatile double result = std::sqrt(randomNumbers[i]);
}
auto endSqrt = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> durationSqrt = endSqrt - startSqrt;
timesSqrt.push_back(durationSqrt.count());
// Measuring time for Newton's pure method
auto startNewton = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_EXECUTIONS; i++) {
int iterationsNewton = 0;
double result = newtonSqrt(randomNumbers[i], precision, iterationsNewton, false);
newtonData.push_back({randomNumbers[i], result, iterationsNewton});
}
auto endNewton = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> durationNewton = endNewton - startNewton;
timesNewton.push_back(durationNewton.count());
// Measuring time for the another's method
auto startAnothers = std::chrono::high_resolution_clock::now();
for (int i = 0; i < NUM_EXECUTIONS; i++) {
int iterationsAnothers = 0;
double result = anothersMethod(randomNumbers[i], precision, iterationsAnothers, false);
anothersData.push_back({randomNumbers[i], result, iterationsAnothers});
}
auto endAnothers = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> durationAnothers = endAnothers - startAnothers;
timesAnothers.push_back(durationAnothers.count());
}
void displayAverageTimes(const std::vector<double> ×Sqrt, const std::vector<double> ×Newton, const std::vector<double> ×Anothers) {
double avgSqrt = std::accumulate(timesSqrt.begin(), timesSqrt.end(), 0.0) / timesSqrt.size();
double avgNewton = std::accumulate(timesNewton.begin(), timesNewton.end(), 0.0) / timesNewton.size();
double avgAnothers = std::accumulate(timesAnothers.begin(), timesAnothers.end(), 0.0) / timesAnothers.size();
std::cout << "nAverage Results:n";
std::cout << std::scientific << std::setprecision(6); // Ensure values are displayed in scientific notation
std::cout << "Standard sqrt average time (ms): " << avgSqrt << std::endl;
std::cout << "Newton's method average time (ms): " << avgNewton << std::endl;
std::cout << "Another's method average time (ms): " << avgAnothers << std::endl;
}
void calculateAndDisplayErrors(double number, const std::vector<double> &roots) {
double actualSqrt = std::sqrt(number);
std::cout << "nError comparison with actual sqrt(" << number << "): " << std::setprecision(std::numeric_limits<double>::max_digits10) << actualSqrt << "n";
for (size_t i = 0; i < roots.size(); ++i) {
double error = calculateError(roots[i], number);
std::cout << "Error for root " << i + 1 << " (" << std::setprecision(std::numeric_limits<double>::max_digits10) << roots[i] << "): " << error << "n";
}
}
void displaySomeIterations(const std::vector<IterationData> &newtonData, const std::vector<IterationData> &anothersData) {
std::vector<IterationData> sortedNewtonData = newtonData;
std::vector<IterationData> sortedAnothersData = anothersData;
// Sort the data based on the number of iterations
std::sort(sortedNewtonData.begin(), sortedNewtonData.end(), [](const IterationData &a, const IterationData &b) {
return a.iterations > b.iterations;
});
std::sort(sortedAnothersData.begin(), sortedAnothersData.end(), [](const IterationData &a, const IterationData &b) {
return a.iterations > b.iterations;
});
std::cout << "nSome random numbers and their iterations for Newton's method:n";
for (size_t i = 0; i < 10 && i < sortedNewtonData.size(); ++i) {
std::cout << "Number: " << sortedNewtonData[i].number
<< ", Iterations: " << sortedNewtonData[i].iterations
<< ", Sqrt: " << sortedNewtonData[i].sqrtResult << "n";
}
std::cout << "nSome random numbers and their iterations for Another's method:n";
for (size_t i = 0; i < 10 && i < sortedAnothersData.size(); ++i) {
std::cout << "Number: " << sortedAnothersData[i].number
<< ", Iterations: " << sortedAnothersData[i].iterations
<< ", Sqrt: " << sortedAnothersData[i].sqrtResult << "n";
}
}
void clearScreen() {
#ifdef _WIN32
system("cls");
#else
system("clear");
#endif
}
int main() {
double number;
std::cout << "Enter the number to find the square root: ";
std::cin >> number;
// Show iterations for each method with user input
double resultNewton, resultAnothers;
int iterationsNewton, iterationsAnothers;
showIterations(number, resultNewton, resultAnothers, iterationsNewton, iterationsAnothers);
std::cout << "Press any key to continue . . .";
std::cin.get();
std::cin.get();
std::vector<double> timesSqrt, timesNewton, timesAnothers;
std::vector<IterationData> newtonData, anothersData;
double* randomNumbers = new double[NUM_EXECUTIONS];
std::mt19937_64 rng;
std::uniform_real_distribution<double> dist(1.0, 100.0); // Random numbers between 1 and 100
for (int i = 0; i < NUM_EXECUTIONS; i++) {
randomNumbers[i] = dist(rng);
}
measureTime(randomNumbers, timesSqrt, timesNewton, timesAnothers, newtonData, anothersData);
delete[] randomNumbers;
std::cout << "Press any key to continue to error calculations . . .";
std::cin.get();
// Display the error calculations
std::vector<double> roots = {resultNewton, resultAnothers};
calculateAndDisplayErrors(number, roots);
std::cout << "Press any key to continue to time measurements . . .";
std::cin.get();
// Display the average times
displayAverageTimes(timesSqrt, timesNewton, timesAnothers);
std::cout << "Press any key to continue to top iterations display . . .";
std::cin.get();
// Display the top iterations
std::cout << "Some random numbers and their iterations:n";
displaySomeIterations(newtonData, anothersData);
std::cout << "Press any key to exit . . .";
std::cin.get();
return 0;
}
Here is the code to calculate the magic number:
#include <iostream>
#include <iomanip>
#include <cmath>
#include <limits>
#include <vector>
#include <thread>
#include <mutex>
#include <chrono>
#include <atomic>
#include <sstream>
// Uncomment the following line to compile for Linux
// #define LINUX
#ifdef LINUX
#include <unistd.h>
#define MOVE_CURSOR_TO(x, y) std::cout << "33[" << (x) << ";" << (y) << "H"
#else
#include <windows.h>
#include <conio.h>
void moveCursorTo(int x, int y) {
COORD coord = {static_cast<SHORT>(y), static_cast<SHORT>(x)};
SetConsoleCursorPosition(GetStdHandle(STD_OUTPUT_HANDLE), coord);
}
#define MOVE_CURSOR_TO(x, y) moveCursorTo((x), (y))
#endif
const int BAR_WIDTH = 70; // Width of the progress bar
const int SLEEP_DURATION = 1; // Duration to sleep between progress updates in seconds
std::mutex mtx;
float globalMinError = std::numeric_limits<float>::max();
unsigned int globalBestMagicNumber = 0;
std::atomic<bool> done(false);
unsigned int numThreads = std::thread::hardware_concurrency(); // Determine the number of threads based on the processor
std::vector<unsigned int> threadProgress(numThreads, 0);
// Function to calculate the inverse square root using Carmack's method with an adjustable magic number
float fastInverseSqrt(float number, unsigned int magicNumber) {
float x2 = number * 0.5F;
float y = number;
union {
float f;
int i;
} u;
u.f = y;
u.i = magicNumber - (u.i >> 1);
y = u.f;
y = y * (1.5F - (x2 * y * y));
return y;
}
// Function to calculate the square root using the inverse square root
float calculateSqrt(float number, unsigned int magicNumber) {
float invSqrt = fastInverseSqrt(number, magicNumber);
return number * invSqrt;
}
// Function to display a single progress bar
void showProgressBar(unsigned int current, unsigned int total) {
float progress = static_cast<float>(current) / total;
int pos = static_cast<int>(BAR_WIDTH * progress);
std::ostringstream bar;
bar << "[";
for (int i = 0; i < BAR_WIDTH; ++i) {
if (i < pos) bar << "=";
else if (i == pos) bar << ">";
else bar << " ";
}
bar << "] " << std::setw(3) << static_cast<int>(progress * 100.0) << " %";
std::cout << bar.str();
}
// Function to test a range of magic numbers and find the best one
void testMagicNumbersRange(unsigned int start, unsigned int end, const std::vector<float>& testValues, unsigned int threadIndex) {
unsigned int totalIterations = end - start;
for (unsigned int magicNumber = start; magicNumber < end; ++magicNumber) {
float totalError = 0.0f;
for (float value : testValues) {
float approxSqrt = calculateSqrt(value, magicNumber);
float exactSqrt = std::sqrt(value);
float error = std::fabs(approxSqrt - exactSqrt);
totalError += error;
}
float averageError = totalError / testValues.size();
{
std::lock_guard<std::mutex> guard(mtx);
if (averageError < globalMinError) {
globalMinError = averageError;
globalBestMagicNumber = magicNumber;
}
}
// Update progress
threadProgress[threadIndex] = magicNumber - start + 1;
// Show progress for this thread
if ((magicNumber - start) % 0x10000 == 0) {
std::lock_guard<std::mutex> guard(mtx);
MOVE_CURSOR_TO(threadIndex + 8, 1); // Move cursor to the appropriate line
std::cout << "Thread " << threadIndex + 1 << ": ";
showProgressBar(threadProgress[threadIndex], totalIterations);
std::cout.flush();
}
}
threadProgress[threadIndex] = totalIterations; // Mark thread as done
}
// Function to display the header
void showHeader() {
std::cout << "=========================================n";
std::cout << " Inverse Square Root Calculation Progressn";
std::cout << "=========================================n";
std::cout << "Calculating the optimal magic number forn";
std::cout << "fast inverse square root approximation.n";
std::cout << "Processor threads available: " << std::thread::hardware_concurrency() << std::endl;
std::cout << "Threads used: " << numThreads << std::endl;
std::cout << "-----------------------------------------n";
}
// Function to display overall progress
void showProgress(unsigned int totalIterations) {
while (!done) {
std::this_thread::sleep_for(std::chrono::seconds(SLEEP_DURATION));
// Display progress for all threads
std::lock_guard<std::mutex> guard(mtx);
for (unsigned int i = 0; i < numThreads; ++i) {
MOVE_CURSOR_TO(i + 8, 1); // Move cursor to the appropriate line
std::cout << "Thread " << i + 1 << ": ";
showProgressBar(threadProgress[i], totalIterations);
std::cout << std::endl;
}
}
}
int main() {
auto start = std::chrono::high_resolution_clock::now(); // Start timing
std::vector<float> testValues = { 0.5f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 10.0f, 100.0f, 1000.0f };
//unsigned int startMagicNumber = 0x00000000; // Begin with no idea what it can be
//unsigned int endMagicNumber = 0xFFFFFFFF; // End with no idea what it can be
unsigned int startMagicNumber = 0x5F000000; // Approximately close to the real value
unsigned int endMagicNumber = 0x5FFFFFFF; // Approximately close to the real value
unsigned int rangeSize = (endMagicNumber - startMagicNumber) / numThreads;
std::vector<std::thread> threads;
unsigned int totalIterations = rangeSize;
// Display initial header
showHeader();
std::thread progressThread(showProgress, totalIterations);
for (unsigned int i = 0; i < numThreads; ++i) {
unsigned int start = startMagicNumber + i * rangeSize;
unsigned int end = (i == numThreads - 1) ? endMagicNumber : start + rangeSize;
threads.emplace_back(testMagicNumbersRange, start, end, std::ref(testValues), i);
}
for (auto& t : threads) {
t.join();
}
done = true;
progressThread.join();
auto end = std::chrono::high_resolution_clock::now(); // End timing
std::chrono::duration<double> duration = end - start; // Calculate duration
std::cout << "nBest magic number: 0x" << std::hex << globalBestMagicNumber << std::dec << std::endl;
// Test the best magic number found
for (float value : testValues) {
float approxSqrt = calculateSqrt(value, globalBestMagicNumber);
float exactSqrt = std::sqrt(value);
std::cout << "Value: " << value
<< ", Approximated sqrt: " << approxSqrt
<< ", Exact sqrt: " << exactSqrt
<< ", Error: " << std::fabs(approxSqrt - exactSqrt) << std::endl;
}
std::cout << "nTotal execution time: " << duration.count() << " seconds" << std::endl;
std::cout << "Press any key to exit . . .";
std::cin.get();
return 0;
}
I am not sure if there is another way to calculate the magic number without brute force, and I would like to know.
vanzuita is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.