
/*
 * fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *	Allison Henderson <achender@linux.vnet.ibm.com>
 *	Hugh Dickins <hughd@google.com>
 *	Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/rbtree.h>
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"
#include "extents_status.h"

#include <trace/events/ext4.h>
/*
 * According to previous discussion at the Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. reservation space warning), and to provide extent-level locking.
 * The delayed extent tree is the first step towards this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called the
 * delay extent tree, whose only goal was to track delayed extents in
 * memory to simplify the implementation of fiemap and bigalloc, and to
 * introduce lseek SEEK_DATA/SEEK_HOLE support.  That is why it is still
 * called the delay extent tree in the first commit.  But to better
 * describe what it does, it has been renamed to the extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  The tree maintains the delayed extent when a
 * delayed allocation is issued, and the delayed extent is written out
 * or invalidated.  Therefore the implementations of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status is tracked by the extent status tree.
 * Thus, we can first try to look up a block mapping in this tree before
 * searching the extent tree.  Hence, the single extent cache can be
 * removed because the extent status tree can do a better job.  Extents
 * in the status tree are loaded on demand, so the extent status tree
 * may not contain all of the extents in a file.  Meanwhile we define a
 * shrinker to reclaim memory from the extent status tree because a
 * fragmented extent tree will make the status tree cost too much memory.
 * Written/unwritten/hole extents in the tree will be reclaimed by this
 * shrinker when we are under high memory pressure.  Delayed extents will
 * not be reclaimed because fiemap, bigalloc, and seek_data/hole need them.
 */
/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why do we need to implement the extent status tree?
 *
 * Without the extent status tree, ext4 identifies a delayed extent by
 * looking up the page cache, which has several deficiencies - complicated,
 * buggy, and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they do it without the extent status tree.
 *
 * -- FIEMAP
 *	FIEMAP looks up the page cache to identify delayed allocations
 *	from holes.
 *
 * -- SEEK_HOLE/DATA
 *	SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 * -- bigalloc
 *	bigalloc looks up the page cache to figure out if a block is
 *	already under delayed allocation or not, to determine whether
 *	quota reservation is needed for the cluster.
 *
 * -- writeout
 *	Writeout looks up the whole page cache to see if a buffer is
 *	mapped.  If there are not very many delayed buffers, this is
 *	time consuming.
 *
 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of blocks
 * is under delayed allocation (belongs to a delayed extent) or not by
 * searching the extent status tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 * -- extent
 *	An extent is a range of blocks which are contiguous logically and
 *	physically.  Unlike an extent in the extent tree, this extent is
 *	an in-memory struct; there is no corresponding on-disk data.
 *	There is no limit on the length of an extent, so an extent can
 *	contain as many blocks as are contiguous logically and physically.
 *
 * -- extent status tree
 *	Every inode has an extent status tree and all allocated blocks
 *	are added to the tree with different status.  The extents in the
 *	tree are ordered by logical block number.
 *
 * -- operations on an extent status tree
 *	There are three important operations on an extent status tree:
 *	finding the next extent, adding an extent (a range of blocks),
 *	and removing an extent.
 *
 * -- race on an extent status tree
 *	The extent status tree is protected by inode->i_es_lock.
 *
 * -- memory consumption
 *	A fragmented extent tree will make the extent status tree cost
 *	too much memory.  Hence, we will reclaim written/unwritten/hole
 *	extents from the tree under heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 * -- overhead
 *	1. There is a cached extent for write access, so if writes are
 *	not very random, adding-space operations run in O(1) time.
 *
 * -- gain
 *	2. Code is much simpler, more readable, more maintainable and
 *	more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */
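/*
 * A rough orientation for readers (a summary drawn from the comments
 * above, not text from the original sources): each extent in the tree
 * carries one of four statuses - written, unwritten, delayed, or hole -
 * and callers consult the tree with ext4_es_lookup_extent() before
 * falling back to the on-disk extent tree, caching what they find there
 * with ext4_es_insert_extent() or ext4_es_cache_extent().
 */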
static struct kmem_cache *ext4_es_cachep;

static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan);
static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
			    struct ext4_inode_info *locked_ei);

int __init ext4_init_es(void)
{
	ext4_es_cachep = kmem_cache_create("ext4_extent_status",
					   sizeof(struct extent_status),
					   0, (SLAB_RECLAIM_ACCOUNT), NULL);
	if (ext4_es_cachep == NULL)
		return -ENOMEM;
	return 0;
}

void ext4_exit_es(void)
{
	if (ext4_es_cachep)
		kmem_cache_destroy(ext4_es_cachep);
}

void ext4_es_init_tree(struct ext4_es_tree *tree)
{
	tree->root = RB_ROOT;
	tree->cache_es = NULL;
}
#ifdef ES_DEBUG__
static void ext4_es_print_tree(struct inode *inode)
{
	struct ext4_es_tree *tree;
	struct rb_node *node;

	printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
	tree = &EXT4_I(inode)->i_es_tree;
	node = rb_first(&tree->root);
	while (node) {
		struct extent_status *es;

		es = rb_entry(node, struct extent_status, rb_node);
		printk(KERN_DEBUG " [%u/%u) %llu %x",
		       es->es_lblk, es->es_len,
		       ext4_es_pblock(es), ext4_es_status(es));
		node = rb_next(node);
	}
	printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif
static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
	BUG_ON(es->es_lblk + es->es_len < es->es_lblk);
	return es->es_lblk + es->es_len - 1;
}

/*
 * Search through the tree for a delayed extent with a given offset.  If
 * it can't be found, try to find the next extent.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
					      ext4_lblk_t lblk)
{
	struct rb_node *node = root->rb_node;
	struct extent_status *es = NULL;

	while (node) {
		es = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es))
			node = node->rb_right;
		else
			return es;
	}

	if (es && lblk < es->es_lblk)
		return es;

	if (es && lblk > ext4_es_end(es)) {
		node = rb_next(&es->rb_node);
		return node ? rb_entry(node, struct extent_status, rb_node) :
			      NULL;
	}

	return NULL;
}
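/*
 * Put differently: __es_tree_search() returns the extent containing @lblk
 * if one exists, otherwise the first extent starting after @lblk, or NULL
 * if @lblk lies beyond the last extent.  For example (illustrative values
 * only), with extents [0/10) and [20/5) in the tree, a search for block 3
 * returns [0/10), a search for block 12 returns [20/5), and a search for
 * block 30 returns NULL.
 */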
/*
 * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
 * @lblk if it exists, otherwise, the next delayed extent after @lblk.
 *
 * @inode: the inode which owns delayed extents
 * @lblk: the offset where we start to search
 * @end: the offset where we stop to search
 * @es: delayed extent that we found
 */
void ext4_es_find_delayed_extent_range(struct inode *inode,
				       ext4_lblk_t lblk, ext4_lblk_t end,
				       struct extent_status *es)
{
	struct ext4_es_tree *tree = NULL;
	struct extent_status *es1 = NULL;
	struct rb_node *node;

	BUG_ON(es == NULL);
	BUG_ON(end < lblk);
	trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);

	read_lock(&EXT4_I(inode)->i_es_lock);
	tree = &EXT4_I(inode)->i_es_tree;

	/* check the cached extent first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u) %llu %x\n",
				 lblk, es1->es_lblk, es1->es_len,
				 ext4_es_pblock(es1), ext4_es_status(es1));
			goto out;
		}
	}

	es1 = __es_tree_search(&tree->root, lblk);

out:
	if (es1 && !ext4_es_is_delayed(es1)) {
		while ((node = rb_next(&es1->rb_node)) != NULL) {
			es1 = rb_entry(node, struct extent_status, rb_node);
			if (es1->es_lblk > end) {
				es1 = NULL;
				break;
			}
			if (ext4_es_is_delayed(es1))
				break;
		}
	}

	if (es1 && ext4_es_is_delayed(es1)) {
		tree->cache_es = es1;
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_find_delayed_extent_range_exit(inode, es);
}
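/*
 * Illustrative sketch (not code from this file): a SEEK_DATA-style caller
 * might use the helper above roughly as follows, where 'start' and 'last'
 * are hypothetical logical block bounds:
 *
 *	struct extent_status es;
 *
 *	ext4_es_find_delayed_extent_range(inode, start, last, &es);
 *	if (es.es_len != 0) {
 *		// a delayed extent starting at es.es_lblk was found
 *		// within [start, last]
 *	} else {
 *		// no delayed extent in the range; es was left zeroed
 *	}
 */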
static struct extent_status *
ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
		     ext4_fsblk_t pblk)
{
	struct extent_status *es;

	es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
	if (es == NULL)
		return NULL;
	es->es_lblk = lblk;
	es->es_len = len;
	es->es_pblk = pblk;

	/*
	 * We don't count delayed extents because we never try to reclaim them.
	 */
	if (!ext4_es_is_delayed(es)) {
		EXT4_I(inode)->i_es_lru_nr++;
		percpu_counter_inc(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_lru_cnt);
	}

	EXT4_I(inode)->i_es_all_nr++;
	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	return es;
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
	EXT4_I(inode)->i_es_all_nr--;
	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

	/* Decrease the lru counter when this es is not delayed */
	if (!ext4_es_is_delayed(es)) {
		BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
		EXT4_I(inode)->i_es_lru_nr--;
		percpu_counter_dec(&EXT4_SB(inode->i_sb)->
					s_es_stats.es_stats_lru_cnt);
	}

	kmem_cache_free(ext4_es_cachep, es);
}
/*
 * Check whether or not two extents can be merged.
 * Conditions:
 *  - logical block numbers are contiguous
 *  - physical block numbers are contiguous
 *  - status is equal
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
				 struct extent_status *es2)
{
	if (ext4_es_status(es1) != ext4_es_status(es2))
		return 0;

	if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
		pr_warn("ES assertion failed when merging extents. "
			"The sum of lengths of es1 (%d) and es2 (%d) "
			"is bigger than allowed file size (%d)\n",
			es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
		WARN_ON(1);
		return 0;
	}

	if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
		return 0;

	if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
	    (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
		return 1;

	if (ext4_es_is_hole(es1))
		return 1;

	/* a delayed extent is mergeable only if it does not also carry
	 * the unwritten status */
	if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
		return 1;

	return 0;
}
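/*
 * For example (illustrative values only): a written extent [0/8) mapped at
 * physical block 1000 and a written extent [8/4) mapped at physical block
 * 1008 satisfy all three conditions above and can be merged into a single
 * written extent [0/12) at physical block 1000.  If the second extent were
 * instead mapped at physical block 2000, the physical-contiguity check
 * would fail and the two extents would stay separate.
 */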
static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_prev(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es1, es)) {
		es1->es_len += es->es_len;
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		es = es1;
	}

	return es;
}

static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct extent_status *es1;
	struct rb_node *node;

	node = rb_next(&es->rb_node);
	if (!node)
		return es;

	es1 = rb_entry(node, struct extent_status, rb_node);
	if (ext4_es_can_be_merged(es, es1)) {
		es->es_len += es1->es_len;
		rb_erase(node, &tree->root);
		ext4_es_free_extent(inode, es1);
	}

	return es;
}
#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"	/* Needed when ES_AGGRESSIVE_TEST is defined */

static void ext4_es_insert_extent_ext_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ex;
	ext4_lblk_t ee_block;
	ext4_fsblk_t ee_start;
	unsigned short ee_len;
	int depth, ee_status, es_status;

	path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
	if (IS_ERR(path))
		return;

	depth = ext_depth(inode);
	ex = path[depth].p_ext;

	if (ex) {
		ee_block = le32_to_cpu(ex->ee_block);
		ee_start = ext4_ext_pblock(ex);
		ee_len = ext4_ext_get_actual_len(ex);

		ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
		es_status = ext4_es_is_unwritten(es) ? 1 : 0;

		/*
		 * Make sure ex and es do not overlap when we try to insert
		 * a delayed/hole extent.
		 */
		if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
			if (in_range(es->es_lblk, ee_block, ee_len)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu we can find an extent "
					"at block [%d/%d/%llu/%c], but we "
					"want to add a delayed/hole extent "
					"[%d/%d/%llu/%x]\n",
					inode->i_ino, ee_block, ee_len,
					ee_start, ee_status ? 'u' : 'w',
					es->es_lblk, es->es_len,
					ext4_es_pblock(es), ext4_es_status(es));
			}
			goto out;
		}

		/*
		 * We don't check ee_block == es->es_lblk, etc. because es
		 * might be a part of a whole extent, and vice versa.
		 */
		if (es->es_lblk < ee_block ||
		    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
			goto out;
		}

		if (ee_status ^ es_status) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"ex_status [%d/%d/%llu/%c] != "
				"es_status [%d/%d/%llu/%c]\n", inode->i_ino,
				ee_block, ee_len, ee_start,
				ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
				ext4_es_pblock(es), es_status ? 'u' : 'w');
		}
	} else {
		/*
		 * We can't find an extent on disk.  So we need to make sure
		 * that we don't want to add a written/unwritten extent.
		 */
		if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"can't find an extent at block %d but we want "
				"to add a written/unwritten extent "
				"[%d/%d/%llu/%x]\n", inode->i_ino,
				es->es_lblk, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
		}
	}
out:
	ext4_ext_drop_refs(path);
	kfree(path);
}

static void ext4_es_insert_extent_ind_check(struct inode *inode,
					    struct extent_status *es)
{
	struct ext4_map_blocks map;
	int retval;

	/*
	 * Here we call ext4_ind_map_blocks to look up a block mapping
	 * because the 'Indirect' structure is defined in indirect.c, so we
	 * can't access the direct/indirect tree from outside it, and it
	 * would be too ugly to define this function in indirect.c itself.
	 */
	map.m_lblk = es->es_lblk;
	map.m_len = es->es_len;

	retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
	if (retval > 0) {
		if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
			/*
			 * We want to add a delayed/hole extent but this
			 * block has been allocated.
			 */
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can find blocks but we want to add a "
				"delayed/hole extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		} else if (ext4_es_is_written(es)) {
			if (retval != es->es_len) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu retval %d != es_len %d\n",
					inode->i_ino, retval, es->es_len);
				return;
			}
			if (map.m_pblk != ext4_es_pblock(es)) {
				pr_warn("ES insert assertion failed for "
					"inode: %lu m_pblk %llu != "
					"es_pblk %llu\n",
					inode->i_ino, map.m_pblk,
					ext4_es_pblock(es));
				return;
			}
		} else {
			/*
			 * We don't need to check unwritten extents because
			 * indirect-based files don't have them.
			 */
			BUG_ON(1);
		}
	} else if (retval == 0) {
		if (ext4_es_is_written(es)) {
			pr_warn("ES insert assertion failed for inode: %lu "
				"We can't find the block but we want to add "
				"a written extent [%d/%d/%llu/%x]\n",
				inode->i_ino, es->es_lblk, es->es_len,
				ext4_es_pblock(es), ext4_es_status(es));
			return;
		}
	}
}

static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
	/*
	 * We don't need to worry about the race condition because the
	 * caller takes the i_data_sem lock.
	 */
	BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
		ext4_es_insert_extent_ext_check(inode, es);
	else
		ext4_es_insert_extent_ind_check(inode, es);
}
#else
static inline void ext4_es_insert_extent_check(struct inode *inode,
					       struct extent_status *es)
{
}
#endif
static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node **p = &tree->root.rb_node;
	struct rb_node *parent = NULL;
	struct extent_status *es;

	while (*p) {
		parent = *p;
		es = rb_entry(parent, struct extent_status, rb_node);

		if (newes->es_lblk < es->es_lblk) {
			if (ext4_es_can_be_merged(newes, es)) {
				/*
				 * Here we can modify es_lblk directly
				 * because it isn't overlapped.
				 */
				es->es_lblk = newes->es_lblk;
				es->es_len += newes->es_len;
				if (ext4_es_is_written(es) ||
				    ext4_es_is_unwritten(es))
					ext4_es_store_pblock(es,
							     newes->es_pblk);
				es = ext4_es_try_to_merge_left(inode, es);
				goto out;
			}
			p = &(*p)->rb_left;
		} else if (newes->es_lblk > ext4_es_end(es)) {
			if (ext4_es_can_be_merged(es, newes)) {
				es->es_len += newes->es_len;
				es = ext4_es_try_to_merge_right(inode, es);
				goto out;
			}
			p = &(*p)->rb_right;
		} else {
			BUG_ON(1);
			return -EINVAL;
		}
	}

	es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
				  newes->es_pblk);
	if (!es)
		return -ENOMEM;
	rb_link_node(&es->rb_node, parent, p);
	rb_insert_color(&es->rb_node, &tree->root);

out:
	tree->cache_es = es;
	return 0;
}
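/*
 * A note on the merge path above: callers arrange for @newes never to
 * overlap an existing extent - ext4_es_insert_extent() removes any
 * overlapping range first, ext4_es_cache_extent() only inserts when
 * nothing covers the range, and __es_remove_extent() inserts only the
 * non-overlapping tail of a split extent.  The new extent therefore
 * either merges with an adjacent node or is linked in as a fresh rb-tree
 * node, which is why reaching the final 'else' branch is treated as a bug.
 */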
/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;
	int err = 0;

	es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
		 lblk, len, pblk, status, inode->i_ino);

	if (!len)
		return 0;

	BUG_ON(end < lblk);

	if ((status & EXTENT_STATUS_DELAYED) &&
	    (status & EXTENT_STATUS_WRITTEN)) {
		ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
				" delayed and written which can potentially "
				" cause data loss.\n", lblk, len);
		WARN_ON(1);
	}

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_insert_extent(inode, &newes);

	ext4_es_insert_extent_check(inode, &newes);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	if (err != 0)
		goto error;
retry:
	err = __es_insert_extent(inode, &newes);
	if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
					       EXT4_I(inode)))
		goto retry;
	if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
		err = 0;

error:
	write_unlock(&EXT4_I(inode)->i_es_lock);

	ext4_es_print_tree(inode);

	return err;
}
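/*
 * Illustrative sketch (not code from this file): after mapping or
 * allocating blocks, a caller records the result roughly like this, where
 * 'map' is a hypothetical struct ext4_map_blocks that was just filled in
 * and the status flag matches how the blocks were allocated:
 *
 *	int ret;
 *
 *	ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 *				    map->m_pblk, EXTENT_STATUS_WRITTEN);
 *	if (ret < 0)
 *		return ret;
 */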
/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len, ext4_fsblk_t pblk,
			  unsigned int status)
{
	struct extent_status *es;
	struct extent_status newes;
	ext4_lblk_t end = lblk + len - 1;

	newes.es_lblk = lblk;
	newes.es_len = len;
	ext4_es_store_pblock_status(&newes, pblk, status);
	trace_ext4_es_cache_extent(inode, &newes);

	if (!len)
		return;

	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);

	es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
	if (!es || es->es_lblk > end)
		__es_insert_extent(inode, &newes);
	write_unlock(&EXT4_I(inode)->i_es_lock);
}
/*
 * ext4_es_lookup_extent() looks up an extent in the extent status tree.
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 *
 * Return: 1 if found, 0 if not.
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
			  struct extent_status *es)
{
	struct ext4_es_tree *tree;
	struct ext4_es_stats *stats;
	struct extent_status *es1 = NULL;
	struct rb_node *node;
	int found = 0;

	trace_ext4_es_lookup_extent_enter(inode, lblk);
	es_debug("lookup extent in block %u\n", lblk);

	tree = &EXT4_I(inode)->i_es_tree;
	read_lock(&EXT4_I(inode)->i_es_lock);

	/* check the cached extent first */
	es->es_lblk = es->es_len = es->es_pblk = 0;
	if (tree->cache_es) {
		es1 = tree->cache_es;
		if (in_range(lblk, es1->es_lblk, es1->es_len)) {
			es_debug("%u cached by [%u/%u)\n",
				 lblk, es1->es_lblk, es1->es_len);
			found = 1;
			goto out;
		}
	}

	node = tree->root.rb_node;
	while (node) {
		es1 = rb_entry(node, struct extent_status, rb_node);
		if (lblk < es1->es_lblk)
			node = node->rb_left;
		else if (lblk > ext4_es_end(es1))
			node = node->rb_right;
		else {
			found = 1;
			break;
		}
	}

out:
	stats = &EXT4_SB(inode->i_sb)->s_es_stats;
	if (found) {
		BUG_ON(!es1);
		es->es_lblk = es1->es_lblk;
		es->es_len = es1->es_len;
		es->es_pblk = es1->es_pblk;
		stats->es_stats_cache_hits++;
	} else {
		stats->es_stats_cache_misses++;
	}

	read_unlock(&EXT4_I(inode)->i_es_lock);

	trace_ext4_es_lookup_extent_exit(inode, es, found);
	return found;
}
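/*
 * Illustrative sketch (not code from this file): a block-mapping path can
 * consult the status tree before touching the on-disk extent tree, roughly
 * along these lines (variable names are hypothetical):
 *
 *	struct extent_status es;
 *
 *	if (ext4_es_lookup_extent(inode, lblk, &es)) {
 *		if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
 *			// translate using es.es_pblk + (lblk - es.es_lblk)
 *		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
 *			// no physical block yet; skip the extent tree lookup
 *		}
 *	}
 */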
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			      ext4_lblk_t end)
{
	struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	struct extent_status orig_es;
	ext4_lblk_t len1, len2;
	ext4_fsblk_t block;
	int err;

retry:
	err = 0;
	es = __es_tree_search(&tree->root, lblk);
	if (!es)
		goto out;
	if (es->es_lblk > end)
		goto out;

	/* Simply invalidate cache_es. */
	tree->cache_es = NULL;

	orig_es.es_lblk = es->es_lblk;
	orig_es.es_len = es->es_len;
	orig_es.es_pblk = es->es_pblk;

	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
	if (len1 > 0)
		es->es_len = len1;
	if (len2 > 0) {
		if (len1 > 0) {
			struct extent_status newes;

			newes.es_lblk = end + 1;
			newes.es_len = len2;
			block = 0x7FDEADBEEFULL;
			if (ext4_es_is_written(&orig_es) ||
			    ext4_es_is_unwritten(&orig_es))
				block = ext4_es_pblock(&orig_es) +
					orig_es.es_len - len2;
			ext4_es_store_pblock_status(&newes, block,
						    ext4_es_status(&orig_es));
			err = __es_insert_extent(inode, &newes);
			if (err) {
				es->es_lblk = orig_es.es_lblk;
				es->es_len = orig_es.es_len;
				if ((err == -ENOMEM) &&
				    __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
						     EXT4_I(inode)))
					goto retry;
				goto out;
			}
		} else {
			es->es_lblk = end + 1;
			es->es_len = len2;
			if (ext4_es_is_written(es) ||
			    ext4_es_is_unwritten(es)) {
				block = orig_es.es_pblk + orig_es.es_len - len2;
				ext4_es_store_pblock(es, block);
			}
		}
		goto out;
	}

	if (len1 > 0) {
		node = rb_next(&es->rb_node);
		if (node)
			es = rb_entry(node, struct extent_status, rb_node);
		else
			es = NULL;
	}

	while (es && ext4_es_end(es) <= end) {
		node = rb_next(&es->rb_node);
		rb_erase(&es->rb_node, &tree->root);
		ext4_es_free_extent(inode, es);
		if (!node) {
			es = NULL;
			break;
		}
		es = rb_entry(node, struct extent_status, rb_node);
	}

	if (es && es->es_lblk < end + 1) {
		ext4_lblk_t orig_len = es->es_len;

		len1 = ext4_es_end(es) - end;
		es->es_lblk = end + 1;
		es->es_len = len1;
		if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
			block = es->es_pblk + orig_len - len1;
			ext4_es_store_pblock(es, block);
		}
	}

out:
	return err;
}
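/*
 * For example (illustrative values only): removing blocks [10, 19] from a
 * written extent [0/30) mapped at physical block 500 splits it in two.
 * len1 = 10 shrinks the original node to [0/10), still at physical block
 * 500, and len2 = 10 becomes a new extent [20/10) mapped at physical
 * block 520, inserted via __es_insert_extent() above.
 */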
/*
 * ext4_es_remove_extent() removes a range of blocks from an extent
 * status tree.
 *
 * Return 0 on success, error code on failure.
 */
int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
			  ext4_lblk_t len)
{
	ext4_lblk_t end;
	int err = 0;

	trace_ext4_es_remove_extent(inode, lblk, len);
	es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
		 lblk, len, inode->i_ino);

	if (!len)
		return err;

	end = lblk + len - 1;
	BUG_ON(end < lblk);

	write_lock(&EXT4_I(inode)->i_es_lock);
	err = __es_remove_extent(inode, lblk, end);
	write_unlock(&EXT4_I(inode)->i_es_lock);
	ext4_es_print_tree(inode);
	return err;
}
static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
				     struct list_head *b)
{
	struct ext4_inode_info *eia, *eib;

	eia = list_entry(a, struct ext4_inode_info, i_es_lru);
	eib = list_entry(b, struct ext4_inode_info, i_es_lru);

	if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return 1;
	if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
	    ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
		return -1;

	if (eia->i_touch_when == eib->i_touch_when)
		return 0;
	if (time_after(eia->i_touch_when, eib->i_touch_when))
		return 1;
	else
		return -1;
}
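/*
 * The comparator above sorts the per-superblock LRU list so that
 * non-precached inodes come first (they are the preferred reclaim
 * victims) and, within each group, inodes are ordered from least to most
 * recently touched.  After list_sort() the head of s_es_lru is therefore
 * the coldest non-precached inode, which is what __ext4_es_shrink()
 * wants to scan first.
 */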
static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
			    struct ext4_inode_info *locked_ei)
{
	struct ext4_inode_info *ei;
	struct ext4_es_stats *es_stats;
	struct list_head *cur, *tmp;
	LIST_HEAD(skipped);
	ktime_t start_time;
	u64 scan_time;
	int nr_shrunk = 0;
	int retried = 0, skip_precached = 1, nr_skipped = 0;

	es_stats = &sbi->s_es_stats;
	start_time = ktime_get();
	spin_lock(&sbi->s_es_lru_lock);

retry:
	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
		int shrunk;

		/*
		 * If we have already reclaimed all extents from the extent
		 * status tree, just stop the loop immediately.
		 */
		if (percpu_counter_read_positive(
				&es_stats->es_stats_lru_cnt) == 0)
			break;

		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);

		/*
		 * Skip any inode that is newer than the last_sorted
		 * time.  Normally we try hard to avoid shrinking
		 * precached inodes, but we will do so as a last resort.
		 */
		if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
		    (skip_precached && ext4_test_inode_state(&ei->vfs_inode,
						EXT4_STATE_EXT_PRECACHED))) {
			nr_skipped++;
			list_move_tail(cur, &skipped);
			continue;
		}

		if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
		    !write_trylock(&ei->i_es_lock))
			continue;

		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
		if (ei->i_es_lru_nr == 0)
			list_del_init(&ei->i_es_lru);
		write_unlock(&ei->i_es_lock);

		nr_shrunk += shrunk;
		nr_to_scan -= shrunk;
		if (nr_to_scan == 0)
			break;
	}

	/* Move the newer inodes into the tail of the LRU list. */
	list_splice_tail(&skipped, &sbi->s_es_lru);
	INIT_LIST_HEAD(&skipped);

	/*
	 * If we skipped any inodes, and we weren't able to make any
	 * forward progress, sort the list and try again.
	 */
	if ((nr_shrunk == 0) && nr_skipped && !retried) {
		retried++;
		list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
		es_stats->es_stats_last_sorted = jiffies;
		ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
				      i_es_lru);
		/*
		 * If there are no non-precached inodes left on the
		 * list, start releasing precached extents.
		 */
		if (ext4_test_inode_state(&ei->vfs_inode,
					  EXT4_STATE_EXT_PRECACHED))
			skip_precached = 0;
		goto retry;
	}

	spin_unlock(&sbi->s_es_lru_lock);

	if (locked_ei && nr_shrunk == 0)
		nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);

	scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
	if (likely(es_stats->es_stats_scan_time))
		es_stats->es_stats_scan_time = (scan_time +
				es_stats->es_stats_scan_time*3) / 4;
	else
		es_stats->es_stats_scan_time = scan_time;
	if (scan_time > es_stats->es_stats_max_scan_time)
		es_stats->es_stats_max_scan_time = scan_time;
	if (likely(es_stats->es_stats_shrunk))
		es_stats->es_stats_shrunk = (nr_shrunk +
				es_stats->es_stats_shrunk*3) / 4;
	else
		es_stats->es_stats_shrunk = nr_shrunk;

	trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
			     nr_skipped, retried);
	return nr_shrunk;
}
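/*
 * The scan-time and shrunk-object statistics above are kept as simple
 * exponential moving averages: each update weights the previous average
 * at 3/4 and the new sample at 1/4, i.e.
 *
 *	avg_new = (sample + 3 * avg_old) / 4
 *
 * so a single slow scan nudges the reported average rather than
 * replacing it outright.
 */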
static unsigned long ext4_es_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	unsigned long nr;
	struct ext4_sb_info *sbi;

	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
	nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
	return nr;
}

static unsigned long ext4_es_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct ext4_sb_info *sbi = container_of(shrink,
					struct ext4_sb_info, s_es_shrinker);
	int nr_to_scan = sc->nr_to_scan;
	int ret, nr_shrunk;

	ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
	trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);

	if (!nr_to_scan)
		return ret;

	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);

	trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret);
	return nr_shrunk;
}
static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;
}

static void *
ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return NULL;
}

static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = seq->private;
	struct ext4_es_stats *es_stats = &sbi->s_es_stats;
	struct ext4_inode_info *ei, *max = NULL;
	unsigned int inode_cnt = 0;

	if (v != SEQ_START_TOKEN)
		return 0;

	/* here we just find an inode that has the max nr. of objects */
	spin_lock(&sbi->s_es_lru_lock);
	list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
		inode_cnt++;
		if (max && max->i_es_all_nr < ei->i_es_all_nr)
			max = ei;
		else if (!max)
			max = ei;
	}
	spin_unlock(&sbi->s_es_lru_lock);

	seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
		   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
		   percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
	seq_printf(seq, " %lu/%lu cache hits/misses\n",
		   es_stats->es_stats_cache_hits,
		   es_stats->es_stats_cache_misses);
	if (es_stats->es_stats_last_sorted != 0)
		seq_printf(seq, " %u ms last sorted interval\n",
			   jiffies_to_msecs(jiffies -
					    es_stats->es_stats_last_sorted));
	if (inode_cnt)
		seq_printf(seq, " %d inodes on lru list\n", inode_cnt);

	seq_printf(seq, "average:\n %llu us scan time\n",
		   div_u64(es_stats->es_stats_scan_time, 1000));
	seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);

	if (inode_cnt)
		seq_printf(seq,
			   "maximum:\n %lu inode (%u objects, %u reclaimable)\n"
			   " %llu us max scan time\n",
			   max->vfs_inode.i_ino, max->i_es_all_nr,
			   max->i_es_lru_nr,
			   div_u64(es_stats->es_stats_max_scan_time, 1000));

	return 0;
}

static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
{
}

static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
	.start = ext4_es_seq_shrinker_info_start,
	.next  = ext4_es_seq_shrinker_info_next,
	.stop  = ext4_es_seq_shrinker_info_stop,
	.show  = ext4_es_seq_shrinker_info_show,
};

static int
ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
	if (!ret) {
		struct seq_file *m = file->private_data;
		m->private = PDE_DATA(inode);
	}

	return ret;
}

static int
ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
{
	return seq_release(inode, file);
}

static const struct file_operations ext4_es_seq_shrinker_info_fops = {
	.owner		= THIS_MODULE,
	.open		= ext4_es_seq_shrinker_info_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= ext4_es_seq_shrinker_info_release,
};
int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
	int err;

	INIT_LIST_HEAD(&sbi->s_es_lru);
	spin_lock_init(&sbi->s_es_lru_lock);
	sbi->s_es_stats.es_stats_last_sorted = 0;
	sbi->s_es_stats.es_stats_shrunk = 0;
	sbi->s_es_stats.es_stats_cache_hits = 0;
	sbi->s_es_stats.es_stats_cache_misses = 0;
	sbi->s_es_stats.es_stats_scan_time = 0;
	sbi->s_es_stats.es_stats_max_scan_time = 0;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt,
				  0, GFP_KERNEL);
	if (err)
		return err;
	err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt,
				  0, GFP_KERNEL);
	if (err)
		goto err1;

	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
	sbi->s_es_shrinker.count_objects = ext4_es_count;
	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
	err = register_shrinker(&sbi->s_es_shrinker);
	if (err)
		goto err2;

	if (sbi->s_proc)
		proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
				 &ext4_es_seq_shrinker_info_fops, sbi);

	return 0;

err2:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
err1:
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	return err;
}

void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
	if (sbi->s_proc)
		remove_proc_entry("es_shrinker_info", sbi->s_proc);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
	percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
	unregister_shrinker(&sbi->s_es_shrinker);
}
void ext4_es_lru_add(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	ei->i_touch_when = jiffies;

	if (!list_empty(&ei->i_es_lru))
		return;

	spin_lock(&sbi->s_es_lru_lock);
	if (list_empty(&ei->i_es_lru))
		list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}

void ext4_es_lru_del(struct inode *inode)
{
	struct ext4_inode_info *ei = EXT4_I(inode);
	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

	spin_lock(&sbi->s_es_lru_lock);
	if (!list_empty(&ei->i_es_lru))
		list_del_init(&ei->i_es_lru);
	spin_unlock(&sbi->s_es_lru_lock);
}
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
				       int nr_to_scan)
{
	struct inode *inode = &ei->vfs_inode;
	struct ext4_es_tree *tree = &ei->i_es_tree;
	struct rb_node *node;
	struct extent_status *es;
	unsigned long nr_shrunk = 0;
	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
				      DEFAULT_RATELIMIT_BURST);

	if (ei->i_es_lru_nr == 0)
		return 0;

	if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
	    __ratelimit(&_rs))
		ext4_warning(inode->i_sb, "forced shrink of precached extents");

	node = rb_first(&tree->root);
	while (node != NULL) {
		es = rb_entry(node, struct extent_status, rb_node);
		node = rb_next(&es->rb_node);
		/*
		 * We can't reclaim delayed extents from the status tree
		 * because fiemap, bigalloc, and seek_data/hole need them.
		 */
		if (!ext4_es_is_delayed(es)) {
			rb_erase(&es->rb_node, &tree->root);
			ext4_es_free_extent(inode, es);
			nr_shrunk++;
			if (--nr_to_scan == 0)
				break;
		}
	}
	tree->cache_es = NULL;
	return nr_shrunk;
}