-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinvert.cpp
198 lines (167 loc) · 6.03 KB
/
invert.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#include <iostream>
#include <chrono>
#include <iomanip>
#include <bitset>
#include <array>
#include <thread>
#include "eigen/Dense"
#include "gray.h"
#include "matrix.h"
#include "benchmark.h"
// A single threaded naive approach that computes the inverse for every combination
bool eigen_random()
{
const auto size = 7;
auto success = true;
for (int i = 0; i < 35*4; ++i)
{
auto randomMatrix = Eigen::MatrixXd::Random(size, size);
Eigen::MatrixXd inverse = randomMatrix.inverse();
success = success && inverse.allFinite();
}
return success;
}
// Directly compute the inverse for every combination, but use OpenMP to make use of
// the embarrassingly parallel nature of the problem.
bool eigen_random_openmp()
{
const auto size = 7;
auto success = true;
#pragma omp parallel for reduction(&&: success)
for (int i = 0; i < 35 * 4; ++i)
{
auto randomMatrix = Eigen::MatrixXd::Random(size, size);
Eigen::MatrixXd inverse = randomMatrix.inverse();
success = success && inverse.allFinite();
}
return success;
}
// This approach computes an inverse directly for the initial combination, but then
// uses the Sherman-Morrison formula to update the inverse. The combinations are
// generated in a way that each differs from the last by the replacement of one item,
// which corresponds to replaceing one row and one column in the combination matrix.
bool eigen_sherman()
{
auto success = true;
const auto size = 11;
const auto select_large = 4;
const auto size_large = 7;
const auto select_small = 3;
const auto size_small = 4;
const auto comb_size = select_small + select_large;
// Matrix for all items
Eigen::MatrixXd main = Eigen::MatrixXd::Random(size, size);
// Generate the first combination
gray_join_t gray;
std::bitset<size> selected = gray.next();
// Matrix for this combination
Eigen::MatrixXd combination(size_large, size_large);
// Maps index into main matrix onto an index into the combination matrix
// and its inverse
std::array<int, 11> main_to_comb;
std::array<int, comb_size> comb_to_main;
auto comb_index = 0;
for (size_t main_index = 0; main_index < selected.size(); ++main_index)
{
if (selected[main_index])
{
main_to_comb[main_index] = comb_index;
comb_to_main[comb_index] = main_index;
++comb_index;
}
else
{
main_to_comb[main_index] = -1;
}
}
// Generate the initial combination
auto ci = 0;
for (size_t i = 0; i < selected.size(); ++i)
{
auto cj = 0;
for (size_t j = 0; j < selected.size(); ++j)
{
if (selected[i] && selected[j]) {
combination(ci, cj) = main(i, j);
cj++;
}
}
if (selected[i])
{
ci++;
}
}
// Compute the inverse of the initial combination directly
Eigen::MatrixXd inverse = combination.inverse();
// From now on, update the inverse using two rank-2 updates, to replace a row and column
for (int n = 1; n < 35*4; ++n)
{
// Generate the next combination by removing one item and adding another
uint32_t selected_next = gray.next();
// Index into main matrix of removed item
uint32_t removed = set_bit(selected.to_ulong() &~selected_next);
// Index into main matrix of added item
uint32_t added = set_bit(selected_next & ~selected.to_ulong());
// Index into combination matrix of row/column to swap
auto comb_swap_index = main_to_comb[removed];
// Update the mapping between the main matrix and the combination
main_to_comb[removed] = -1;
main_to_comb[added] = comb_swap_index;
comb_to_main[comb_swap_index] = added;
// Replacement row and column
auto new_row = row_map(main, added, comb_to_main);
auto new_col = col_map(main, added, comb_to_main);
// Sherman-Morrison u, v vectors for row replacement
auto u_row = Eigen::VectorXd::Unit(comb_size, comb_swap_index);
Eigen::RowVectorXd v_row = new_row - combination.row(comb_swap_index);
// Update the combination matrix and its inverse for the row replacement
combination.row(comb_swap_index) = new_row;
inverse = sherman_morrison_update_inverse(inverse, u_row, v_row);
// Vectors for row replacement
Eigen::VectorXd u_col = new_col - combination.col(comb_swap_index);
auto v_col = Eigen::RowVectorXd::Unit(comb_size, comb_swap_index);
// Update the combination matrix and its inverse for the column replacement
combination.col(comb_swap_index) = new_col;
inverse = sherman_morrison_update_inverse(inverse, u_col, v_col);
success = success && inverse.allFinite();
selected = selected_next;
}
return success;
}
int main()
{
// Benchmark a series of approaches to the problem
auto iterations = 10000;
benchmark_t benchmarks[] = {
{"eigen_random", eigen_random},
{"eigen_random_openmp", eigen_random_openmp},
{"eigen_sherman", eigen_sherman},
};
for (const auto& benchmark : benchmarks)
{
std::cout << std::left << std::setw(30) << benchmark.name;
std::cout << time_func(benchmark.func, iterations).count() << "s" << std::endl;
}
// Processor: Intel(R) Core(TM) i7-3770K CPU @ 3.50GHz, 3901 Mhz, 4 Core(s), 8 Logical Processor(s)
// Compiler: Visual C++ 2017 RC 64-bit
// Results for 10000 iterations:
// eigen_random 6.23933s
// eigen_random_openmp 1.58229s
// eigen_sherman 3.14105s
// Possible improvements:
//
// * The Sherman update method is promising in a single threaded context, so
// I'd want to split the work across the available hardware threads while still using this method.
// Each thread would compute an initial inverse directly to avoid dependencies betweeen threads.
// This could end up being faster than eigen_random_openmp.
//
// * Try the CUDA version of BLAS for matrix multiplication to parallelise further. It depends whether
// the cost of copying between system RAM and GPU RAM is worth the gain in speed.
//
// * Investigate block inverse, because the matrix naturally splits into 4x4 and 3x3 diagonal blocks
// and these sizes have closed form inverses which might be quicker to compute. One of the four blocks
// remains the same between adjacent combinations so its inverse could be cached.
//
// * Benchmark different matrix libraries (LAPACK etc).
return 0;
}