autotuned_index.h

/***********************************************************************
 * Software License Agreement (BSD License)
 *
 * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
 * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
 *
 * THE BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *************************************************************************/

#ifndef OPENCV_FLANN_AUTOTUNED_INDEX_H_
#define OPENCV_FLANN_AUTOTUNED_INDEX_H_

// Standard headers used directly in this file (assert, std::ostringstream,
// std::min, std::vector, FILE*); upstream they arrive transitively.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <sstream>
#include <vector>

#include "general.h"
#include "nn_index.h"
#include "ground_truth.h"
#include "index_testing.h"
#include "sampling.h"
#include "kdtree_index.h"
#include "kdtree_single_index.h"
#include "kmeans_index.h"
#include "composite_index.h"
#include "linear_index.h"
#include "logger.h"

namespace cvflann
{

template<typename Distance>
NNIndex<Distance>* create_index_by_type(const Matrix<typename Distance::ElementType>& dataset, const IndexParams& params, const Distance& distance);

struct AutotunedIndexParams : public IndexParams
{
    AutotunedIndexParams(float target_precision = 0.8, float build_weight = 0.01, float memory_weight = 0, float sample_fraction = 0.1)
    {
        (*this)["algorithm"] = FLANN_INDEX_AUTOTUNED;
        // precision desired (used for autotuning, -1 otherwise)
        (*this)["target_precision"] = target_precision;
        // build tree time weighting factor
        (*this)["build_weight"] = build_weight;
        // index memory weighting factor
        (*this)["memory_weight"] = memory_weight;
        // what fraction of the dataset to use for autotuning
        (*this)["sample_fraction"] = sample_fraction;
    }
};
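
// Example (illustrative sketch, not part of the original header): request 90%
// precision, weight index build time at 1% of search time, ignore memory
// cost, and autotune on a 10% sample of the dataset:
//
//   AutotunedIndexParams params(0.9f, 0.01f, 0.0f, 0.1f);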

template <typename Distance>
class AutotunedIndex : public NNIndex<Distance>
{
public:
    typedef typename Distance::ElementType ElementType;
    typedef typename Distance::ResultType DistanceType;

    AutotunedIndex(const Matrix<ElementType>& inputData, const IndexParams& params = AutotunedIndexParams(), Distance d = Distance()) :
        dataset_(inputData), distance_(d)
    {
        target_precision_ = get_param(params, "target_precision", 0.8f);
        build_weight_ = get_param(params, "build_weight", 0.01f);
        memory_weight_ = get_param(params, "memory_weight", 0.0f);
        sample_fraction_ = get_param(params, "sample_fraction", 0.1f);
        bestIndex_ = NULL;
    }

    // Copying is disabled: declared but intentionally left undefined (pre-C++11 idiom).
    AutotunedIndex(const AutotunedIndex&);
    AutotunedIndex& operator=(const AutotunedIndex&);

    virtual ~AutotunedIndex()
    {
        if (bestIndex_ != NULL) {
            delete bestIndex_;
            bestIndex_ = NULL;
        }
    }

    /**
     * Method responsible for building the index.
     */
    virtual void buildIndex()
    {
        std::ostringstream stream;
        bestParams_ = estimateBuildParams();
        print_params(bestParams_, stream);
        Logger::info("----------------------------------------------------\n");
        Logger::info("Autotuned parameters:\n");
        Logger::info("%s", stream.str().c_str());
        Logger::info("----------------------------------------------------\n");

        bestIndex_ = create_index_by_type(dataset_, bestParams_, distance_);
        bestIndex_->buildIndex();
        speedup_ = estimateSearchParams(bestSearchParams_);

        stream.str(std::string());
        print_params(bestSearchParams_, stream);
        Logger::info("----------------------------------------------------\n");
        Logger::info("Search parameters:\n");
        Logger::info("%s", stream.str().c_str());
        Logger::info("----------------------------------------------------\n");
    }

    /**
     * Saves the index to a stream.
     */
    virtual void saveIndex(FILE* stream)
    {
        save_value(stream, (int)bestIndex_->getType());
        bestIndex_->saveIndex(stream);
        save_value(stream, get_param<int>(bestSearchParams_, "checks"));
    }

    /**
     * Loads the index from a stream.
     */
    virtual void loadIndex(FILE* stream)
    {
        int index_type;
        load_value(stream, index_type);
        IndexParams params;
        params["algorithm"] = (flann_algorithm_t)index_type;
        bestIndex_ = create_index_by_type<Distance>(dataset_, params, distance_);
        bestIndex_->loadIndex(stream);
        int checks;
        load_value(stream, checks);
        bestSearchParams_["checks"] = checks;
    }

    /**
     * Method that searches for nearest neighbors.
     */
    virtual void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams)
    {
        int checks = get_param<int>(searchParams, "checks", FLANN_CHECKS_AUTOTUNED);
        if (checks == FLANN_CHECKS_AUTOTUNED) {
            bestIndex_->findNeighbors(result, vec, bestSearchParams_);
        }
        else {
            bestIndex_->findNeighbors(result, vec, searchParams);
        }
    }
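
    // Note (illustrative, not in the original header): SearchParams(FLANN_CHECKS_AUTOTUNED)
    // defers to the tuned check count stored in bestSearchParams_, while an
    // explicit value such as SearchParams(64) overrides it for that call.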

    IndexParams getParameters() const
    {
        return bestIndex_->getParameters();
    }

    SearchParams getSearchParameters() const
    {
        return bestSearchParams_;
    }

    float getSpeedup() const
    {
        return speedup_;
    }

    /**
     * Number of features in this index.
     */
    virtual size_t size() const
    {
        return bestIndex_->size();
    }

    /**
     * The length of each vector in this index.
     */
    virtual size_t veclen() const
    {
        return bestIndex_->veclen();
    }

    /**
     * The amount of memory (in bytes) this index uses.
     */
    virtual int usedMemory() const
    {
        return bestIndex_->usedMemory();
    }

    /**
     * Algorithm name.
     */
    virtual flann_algorithm_t getType() const
    {
        return FLANN_INDEX_AUTOTUNED;
    }

private:

    // Cost bookkeeping for one evaluated parameter combination.
    struct CostData
    {
        float searchTimeCost;   // time to reach target_precision_ on the test set
        float buildTimeCost;    // time to build the index on the sampled dataset
        float memoryCost;       // (index memory + dataset memory) / dataset memory
        float totalCost;        // aggregate cost (unused in this header)
        IndexParams params;
    };

    void evaluate_kmeans(CostData& cost)
    {
        StartStopTimer t;
        int checks;
        const int nn = 1;

        Logger::info("KMeansTree using params: max_iterations=%d, branching=%d\n",
                     get_param<int>(cost.params, "iterations"),
                     get_param<int>(cost.params, "branching"));
        KMeansIndex<Distance> kmeans(sampledDataset_, cost.params, distance_);

        // measure index build time
        t.start();
        kmeans.buildIndex();
        t.stop();
        float buildTime = (float)t.value;

        // measure search time
        float searchTime = test_index_precision(kmeans, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);

        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
        cost.memoryCost = (kmeans.usedMemory() + datasetMemory) / datasetMemory;
        cost.searchTimeCost = searchTime;
        cost.buildTimeCost = buildTime;
        Logger::info("KMeansTree buildTime=%g, searchTime=%g, build_weight=%g\n", buildTime, searchTime, build_weight_);
    }

    void evaluate_kdtree(CostData& cost)
    {
        StartStopTimer t;
        int checks;
        const int nn = 1;

        Logger::info("KDTree using params: trees=%d\n", get_param<int>(cost.params, "trees"));
        KDTreeIndex<Distance> kdtree(sampledDataset_, cost.params, distance_);

        // measure index build time
        t.start();
        kdtree.buildIndex();
        t.stop();
        float buildTime = (float)t.value;

        // measure search time
        float searchTime = test_index_precision(kdtree, sampledDataset_, testDataset_, gt_matches_, target_precision_, checks, distance_, nn);

        float datasetMemory = float(sampledDataset_.rows * sampledDataset_.cols * sizeof(float));
        cost.memoryCost = (kdtree.usedMemory() + datasetMemory) / datasetMemory;
        cost.searchTimeCost = searchTime;
        cost.buildTimeCost = buildTime;
        Logger::info("KDTree buildTime=%g, searchTime=%g\n", buildTime, searchTime);
    }

    //    struct KMeansSimpleDownhillFunctor {
    //
    //        Autotune& autotuner;
    //        KMeansSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
    //
    //        float operator()(int* params) {
    //
    //            float maxFloat = numeric_limits<float>::max();
    //
    //            if (params[0]<2) return maxFloat;
    //            if (params[1]<0) return maxFloat;
    //
    //            CostData c;
    //            c.params["algorithm"] = KMEANS;
    //            c.params["centers-init"] = CENTERS_RANDOM;
    //            c.params["branching"] = params[0];
    //            c.params["max-iterations"] = params[1];
    //
    //            autotuner.evaluate_kmeans(c);
    //
    //            return c.timeCost;
    //
    //        }
    //    };
    //
    //    struct KDTreeSimpleDownhillFunctor {
    //
    //        Autotune& autotuner;
    //        KDTreeSimpleDownhillFunctor(Autotune& autotuner_) : autotuner(autotuner_) {}
    //
    //        float operator()(int* params) {
    //            float maxFloat = numeric_limits<float>::max();
    //
    //            if (params[0]<1) return maxFloat;
    //
    //            CostData c;
    //            c.params["algorithm"] = KDTREE;
    //            c.params["trees"] = params[0];
    //
    //            autotuner.evaluate_kdtree(c);
    //
    //            return c.timeCost;
    //
    //        }
    //    };

    void optimizeKMeans(std::vector<CostData>& costs)
    {
        Logger::info("KMEANS, Step 1: Exploring parameter space\n");

        // explore kmeans parameters space using combinations of the parameters below
        int maxIterations[] = { 1, 5, 10, 15 };
        int branchingFactors[] = { 16, 32, 64, 128, 256 };

        int kmeansParamSpaceSize = FLANN_ARRAY_LEN(maxIterations) * FLANN_ARRAY_LEN(branchingFactors);
        costs.reserve(costs.size() + kmeansParamSpaceSize);

        // evaluate kmeans for all parameter combinations
        for (size_t i = 0; i < FLANN_ARRAY_LEN(maxIterations); ++i) {
            for (size_t j = 0; j < FLANN_ARRAY_LEN(branchingFactors); ++j) {
                CostData cost;
                cost.params["algorithm"] = FLANN_INDEX_KMEANS;
                cost.params["centers_init"] = FLANN_CENTERS_RANDOM;
                cost.params["iterations"] = maxIterations[i];
                cost.params["branching"] = branchingFactors[j];

                evaluate_kmeans(cost);

                costs.push_back(cost);
            }
        }

        //        Logger::info("KMEANS, Step 2: simplex-downhill optimization\n");
        //
        //        const int n = 2;
        //        // choose initial simplex points as the best parameters so far
        //        int kmeansNMPoints[n*(n+1)];
        //        float kmeansVals[n+1];
        //        for (int i=0;i<n+1;++i) {
        //            kmeansNMPoints[i*n] = (int)kmeansCosts[i].params["branching"];
        //            kmeansNMPoints[i*n+1] = (int)kmeansCosts[i].params["max-iterations"];
        //            kmeansVals[i] = kmeansCosts[i].timeCost;
        //        }
        //        KMeansSimpleDownhillFunctor kmeans_cost_func(*this);
        //        // run optimization
        //        optimizeSimplexDownhill(kmeansNMPoints,n,kmeans_cost_func,kmeansVals);
        //        // store results
        //        for (int i=0;i<n+1;++i) {
        //            kmeansCosts[i].params["branching"] = kmeansNMPoints[i*2];
        //            kmeansCosts[i].params["max-iterations"] = kmeansNMPoints[i*2+1];
        //            kmeansCosts[i].timeCost = kmeansVals[i];
        //        }
    }

    void optimizeKDTree(std::vector<CostData>& costs)
    {
        Logger::info("KD-TREE, Step 1: Exploring parameter space\n");

        // explore kd-tree parameters space using the parameters below
        int testTrees[] = { 1, 4, 8, 16, 32 };

        // evaluate kdtree for all parameter combinations
        for (size_t i = 0; i < FLANN_ARRAY_LEN(testTrees); ++i) {
            CostData cost;
            cost.params["algorithm"] = FLANN_INDEX_KDTREE;
            cost.params["trees"] = testTrees[i];

            evaluate_kdtree(cost);

            costs.push_back(cost);
        }

        //        Logger::info("KD-TREE, Step 2: simplex-downhill optimization\n");
        //
        //        const int n = 1;
        //        // choose initial simplex points as the best parameters so far
        //        int kdtreeNMPoints[n*(n+1)];
        //        float kdtreeVals[n+1];
        //        for (int i=0;i<n+1;++i) {
        //            kdtreeNMPoints[i] = (int)kdtreeCosts[i].params["trees"];
        //            kdtreeVals[i] = kdtreeCosts[i].timeCost;
        //        }
        //        KDTreeSimpleDownhillFunctor kdtree_cost_func(*this);
        //        // run optimization
        //        optimizeSimplexDownhill(kdtreeNMPoints,n,kdtree_cost_func,kdtreeVals);
        //        // store results
        //        for (int i=0;i<n+1;++i) {
        //            kdtreeCosts[i].params["trees"] = kdtreeNMPoints[i];
        //            kdtreeCosts[i].timeCost = kdtreeVals[i];
        //        }
    }

    /**
     * Chooses the best nearest-neighbor algorithm and estimates the optimal
     * parameters to use when building the index (for a given precision).
     * Returns a dictionary with the optimal parameters.
     */
    IndexParams estimateBuildParams()
    {
        std::vector<CostData> costs;

        int sampleSize = int(sample_fraction_ * dataset_.rows);
        int testSampleSize = std::min(sampleSize / 10, 1000);

        Logger::info("Entering autotuning, dataset size: %d, sampleSize: %d, testSampleSize: %d, target precision: %g\n", (int)dataset_.rows, sampleSize, testSampleSize, target_precision_);

        // For a very small dataset, it makes no sense to build any fancy index;
        // just use linear search
        if (testSampleSize < 10) {
            Logger::info("Choosing linear, dataset too small\n");
            return LinearIndexParams();
        }

        // We use a fraction of the original dataset to speed up the autotune algorithm
        sampledDataset_ = random_sample(dataset_, sampleSize);
        // We use a cross-validation approach: first we sample a test set from the dataset
        testDataset_ = random_sample(sampledDataset_, testSampleSize, true);

        // We compute the ground truth using linear search
        Logger::info("Computing ground truth... \n");
        gt_matches_ = Matrix<int>(new int[testDataset_.rows], testDataset_.rows, 1);
        StartStopTimer t;
        t.start();
        compute_ground_truth<Distance>(sampledDataset_, testDataset_, gt_matches_, 0, distance_);
        t.stop();

        CostData linear_cost;
        linear_cost.searchTimeCost = (float)t.value;
        linear_cost.buildTimeCost = 0;
        linear_cost.memoryCost = 0;
        linear_cost.params["algorithm"] = FLANN_INDEX_LINEAR;

        costs.push_back(linear_cost);

        // Start parameter autotune process
        Logger::info("Autotuning parameters...\n");

        optimizeKMeans(costs);
        optimizeKDTree(costs);

        // Find the lowest weighted time cost; it is used below to normalize each
        // candidate's time cost before the memory penalty is added.
        float bestTimeCost = costs[0].searchTimeCost;
        for (size_t i = 0; i < costs.size(); ++i) {
            float timeCost = costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost;
            if (timeCost < bestTimeCost) {
                bestTimeCost = timeCost;
            }
        }

        // Pick the candidate minimizing:
        //   (buildTime * build_weight + searchTime) / bestTimeCost + memory_weight * memoryCost
        float bestCost = costs[0].searchTimeCost / bestTimeCost;
        IndexParams bestParams = costs[0].params;
        if (bestTimeCost > 0) {
            for (size_t i = 0; i < costs.size(); ++i) {
                float crtCost = (costs[i].buildTimeCost * build_weight_ + costs[i].searchTimeCost) / bestTimeCost +
                                memory_weight_ * costs[i].memoryCost;
                if (crtCost < bestCost) {
                    bestCost = crtCost;
                    bestParams = costs[i].params;
                }
            }
        }

        delete[] gt_matches_.data;
        delete[] testDataset_.data;
        delete[] sampledDataset_.data;

        return bestParams;
    }
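
    // Worked example (hypothetical numbers, for illustration only): with
    // build_weight_ = 0.01 and memory_weight_ = 0, a k-means candidate with
    // buildTime = 2.0 s and searchTime = 0.05 s has weighted time cost
    // 2.0 * 0.01 + 0.05 = 0.07, while a kd-tree candidate with buildTime = 0.5 s
    // and searchTime = 0.06 s scores 0.5 * 0.01 + 0.06 = 0.065 and would
    // therefore be preferred.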

    /**
     * Estimates the search-time parameters needed to get the desired precision.
     * Precondition: the index is built.
     * Postcondition: searchParams holds the optimum parameters; the speedup
     * obtained over linear search is returned.
     */
    float estimateSearchParams(SearchParams& searchParams)
    {
        const int nn = 1;
        const size_t SAMPLE_COUNT = 1000;

        assert(bestIndex_ != NULL); // must have a valid index

        float speedup = 0;

        int samples = (int)std::min(dataset_.rows / 10, SAMPLE_COUNT);
        if (samples > 0) {
            Matrix<ElementType> testDataset = random_sample(dataset_, samples);

            Logger::info("Computing ground truth\n");

            // we need to compute the ground truth first
            Matrix<int> gt_matches(new int[testDataset.rows], testDataset.rows, 1);
            StartStopTimer t;
            t.start();
            compute_ground_truth<Distance>(dataset_, testDataset, gt_matches, 1, distance_);
            t.stop();
            float linear = (float)t.value;

            int checks;
            Logger::info("Estimating number of checks\n");

            float searchTime;
            float cb_index;
            if (bestIndex_->getType() == FLANN_INDEX_KMEANS) {
                Logger::info("KMeans algorithm, estimating cluster border factor\n");
                KMeansIndex<Distance>* kmeans = (KMeansIndex<Distance>*)bestIndex_;
                float bestSearchTime = -1;
                float best_cb_index = -1;
                int best_checks = -1;
                // try a range of cluster-boundary factors and keep the fastest one
                for (cb_index = 0; cb_index < 1.1f; cb_index += 0.2f) {
                    kmeans->set_cb_index(cb_index);
                    searchTime = test_index_precision(*kmeans, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
                    if ((searchTime < bestSearchTime) || (bestSearchTime == -1)) {
                        bestSearchTime = searchTime;
                        best_cb_index = cb_index;
                        best_checks = checks;
                    }
                }
                searchTime = bestSearchTime;
                cb_index = best_cb_index;
                checks = best_checks;

                kmeans->set_cb_index(best_cb_index);
                Logger::info("Optimum cb_index: %g\n", cb_index);
                bestParams_["cb_index"] = cb_index;
            }
            else {
                searchTime = test_index_precision(*bestIndex_, dataset_, testDataset, gt_matches, target_precision_, checks, distance_, nn, 1);
            }

            Logger::info("Required number of checks: %d \n", checks);
            searchParams["checks"] = checks;

            speedup = linear / searchTime;

            delete[] gt_matches.data;
            delete[] testDataset.data;
        }

        return speedup;
    }

private:
    NNIndex<Distance>* bestIndex_;

    IndexParams bestParams_;
    SearchParams bestSearchParams_;

    Matrix<ElementType> sampledDataset_;
    Matrix<ElementType> testDataset_;
    Matrix<int> gt_matches_;

    float speedup_;

    /**
     * The dataset used by this index
     */
    const Matrix<ElementType> dataset_;

    /**
     * Index parameters
     */
    float target_precision_;
    float build_weight_;
    float memory_weight_;
    float sample_fraction_;

    Distance distance_;
};

}

#endif /* OPENCV_FLANN_AUTOTUNED_INDEX_H_ */
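
// ---------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of the original header; kept as a
// comment so the file still compiles). It assumes the L2 distance functor from
// dist.h and the knnSearch() interface inherited from NNIndex; data_ptr and
// query_ptr are hypothetical pointers to row-major float data.
//
//   #include "autotuned_index.h"
//   #include "dist.h"
//
//   using namespace cvflann;
//
//   Matrix<float> features(data_ptr, n_rows, n_cols);
//   AutotunedIndex< L2<float> > index(features, AutotunedIndexParams(0.9f));
//   index.buildIndex();   // runs the autotuning and builds the chosen index
//
//   Matrix<float> query(query_ptr, 1, n_cols);
//   Matrix<int> indices(new int[1], 1, 1);
//   Matrix<float> dists(new float[1], 1, 1);
//   // FLANN_CHECKS_AUTOTUNED makes findNeighbors() use the tuned parameters.
//   index.knnSearch(query, indices, dists, 1, SearchParams(FLANN_CHECKS_AUTOTUNED));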