linux: fs/f2fs/node.c (commit d3322752426fae84372e38132ebb5243f7993775)
1 /*
2  * fs/f2fs/node.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/mpage.h>
14 #include <linux/backing-dev.h>
15 #include <linux/blkdev.h>
16 #include <linux/pagevec.h>
17 #include <linux/swap.h>
18
19 #include "f2fs.h"
20 #include "node.h"
21 #include "segment.h"
22 #include "xattr.h"
23 #include "trace.h"
24 #include <trace/events/f2fs.h>
25
26 #define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)
27
28 static struct kmem_cache *nat_entry_slab;
29 static struct kmem_cache *free_nid_slab;
30 static struct kmem_cache *nat_entry_set_slab;
31
32 bool available_free_memory(struct f2fs_sb_info *sbi, int type)
33 {
34         struct f2fs_nm_info *nm_i = NM_I(sbi);
35         struct sysinfo val;
36         unsigned long avail_ram;
37         unsigned long mem_size = 0;
38         bool res = false;
39
40         si_meminfo(&val);
41
42         /* only uses low memory */
43         avail_ram = val.totalram - val.totalhigh;
44
45         /*
46          * give 25%, 25%, 50%, 50%, 50% of memory to each component, respectively
47          */
48         if (type == FREE_NIDS) {
49                 mem_size = (nm_i->nid_cnt[FREE_NID] *
50                                 sizeof(struct free_nid)) >> PAGE_SHIFT;
51                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
52         } else if (type == NAT_ENTRIES) {
53                 mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
54                                                         PAGE_SHIFT;
55                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
56                 if (excess_cached_nats(sbi))
57                         res = false;
58         } else if (type == DIRTY_DENTS) {
59                 if (sbi->sb->s_bdi->wb.dirty_exceeded)
60                         return false;
61                 mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
62                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
63         } else if (type == INO_ENTRIES) {
64                 int i;
65
66                 for (i = 0; i < MAX_INO_ENTRY; i++)
67                         mem_size += sbi->im[i].ino_num *
68                                                 sizeof(struct ino_entry);
69                 mem_size >>= PAGE_SHIFT;
70                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
71         } else if (type == EXTENT_CACHE) {
72                 mem_size = (atomic_read(&sbi->total_ext_tree) *
73                                 sizeof(struct extent_tree) +
74                                 atomic_read(&sbi->total_ext_node) *
75                                 sizeof(struct extent_node)) >> PAGE_SHIFT;
76                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
77         } else if (type == INMEM_PAGES) {
78                 /* allow in-memory pages to use up to 20% of total RAM */
79                 mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
80                 res = mem_size < (val.totalram / 5);
81         } else {
82                 if (!sbi->sb->s_bdi->wb.dirty_exceeded)
83                         return true;
84         }
85         return res;
86 }
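
The 25%/25%/50%/50%/50% split described above is expressed by the ">> 2"
(quarter) and ">> 1" (half) shifts applied to "avail_ram * ram_thresh / 100".
Below is a minimal user-space sketch of the FREE_NIDS branch; the helper name
and the numbers are illustrative assumptions, not taken from node.c:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the FREE_NIDS check: cached free_nid structs may occupy at
 * most a quarter of (low-memory pages * ram_thresh / 100). */
static bool free_nids_within_budget(unsigned long avail_ram_pages,
				    unsigned long ram_thresh,
				    unsigned long cache_pages)
{
	unsigned long budget = (avail_ram_pages * ram_thresh / 100) >> 2;

	return cache_pages < budget;
}

int main(void)
{
	/* 1 GiB of low memory in 4 KiB pages with a 1% threshold gives a
	 * budget of (262144 * 1 / 100) >> 2 = 655 pages. */
	printf("%d\n", free_nids_within_budget(262144, 1, 100));
	return 0;
}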
87
88 static void clear_node_page_dirty(struct page *page)
89 {
90         struct address_space *mapping = page->mapping;
91         unsigned long flags;
92
93         if (PageDirty(page)) {
94                 spin_lock_irqsave(&mapping->tree_lock, flags);
95                 radix_tree_tag_clear(&mapping->page_tree,
96                                 page_index(page),
97                                 PAGECACHE_TAG_DIRTY);
98                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
99
100                 clear_page_dirty_for_io(page);
101                 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
102         }
103         ClearPageUptodate(page);
104 }
105
106 static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
107 {
108         pgoff_t index = current_nat_addr(sbi, nid);
109         return get_meta_page(sbi, index);
110 }
111
112 static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
113 {
114         struct page *src_page;
115         struct page *dst_page;
116         pgoff_t src_off;
117         pgoff_t dst_off;
118         void *src_addr;
119         void *dst_addr;
120         struct f2fs_nm_info *nm_i = NM_I(sbi);
121
122         src_off = current_nat_addr(sbi, nid);
123         dst_off = next_nat_addr(sbi, src_off);
124
125         /* get current nat block page with lock */
126         src_page = get_meta_page(sbi, src_off);
127         dst_page = grab_meta_page(sbi, dst_off);
128         f2fs_bug_on(sbi, PageDirty(src_page));
129
130         src_addr = page_address(src_page);
131         dst_addr = page_address(dst_page);
132         memcpy(dst_addr, src_addr, PAGE_SIZE);
133         set_page_dirty(dst_page);
134         f2fs_put_page(src_page, 1);
135
136         set_to_next_nat(nm_i, nid);
137
138         return dst_page;
139 }
140
141 static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
142 {
143         struct nat_entry *new;
144
145         if (no_fail)
146                 new = f2fs_kmem_cache_alloc(nat_entry_slab,
147                                                 GFP_NOFS | __GFP_ZERO);
148         else
149                 new = kmem_cache_alloc(nat_entry_slab,
150                                                 GFP_NOFS | __GFP_ZERO);
151         if (new) {
152                 nat_set_nid(new, nid);
153                 nat_reset_flag(new);
154         }
155         return new;
156 }
157
158 static void __free_nat_entry(struct nat_entry *e)
159 {
160         kmem_cache_free(nat_entry_slab, e);
161 }
162
163 /* must be locked by nat_tree_lock */
164 static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
165         struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
166 {
167         if (no_fail)
168                 f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
169         else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
170                 return NULL;
171
172         if (raw_ne)
173                 node_info_from_raw_nat(&ne->ni, raw_ne);
174         list_add_tail(&ne->list, &nm_i->nat_entries);
175         nm_i->nat_cnt++;
176         return ne;
177 }
178
179 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
180 {
181         return radix_tree_lookup(&nm_i->nat_root, n);
182 }
183
184 static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
185                 nid_t start, unsigned int nr, struct nat_entry **ep)
186 {
187         return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
188 }
189
190 static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
191 {
192         list_del(&e->list);
193         radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
194         nm_i->nat_cnt--;
195         __free_nat_entry(e);
196 }
197
198 static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
199                                                 struct nat_entry *ne)
200 {
201         nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
202         struct nat_entry_set *head;
203
204         head = radix_tree_lookup(&nm_i->nat_set_root, set);
205         if (!head) {
206                 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
207
208                 INIT_LIST_HEAD(&head->entry_list);
209                 INIT_LIST_HEAD(&head->set_list);
210                 head->set = set;
211                 head->entry_cnt = 0;
212                 f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
213         }
214
215         if (get_nat_flag(ne, IS_DIRTY))
216                 goto refresh_list;
217
218         nm_i->dirty_nat_cnt++;
219         head->entry_cnt++;
220         set_nat_flag(ne, IS_DIRTY, true);
221 refresh_list:
222         if (nat_get_blkaddr(ne) == NEW_ADDR)
223                 list_del_init(&ne->list);
224         else
225                 list_move_tail(&ne->list, &head->entry_list);
226 }
227
228 static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
229                 struct nat_entry_set *set, struct nat_entry *ne)
230 {
231         list_move_tail(&ne->list, &nm_i->nat_entries);
232         set_nat_flag(ne, IS_DIRTY, false);
233         set->entry_cnt--;
234         nm_i->dirty_nat_cnt--;
235 }
236
237 static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
238                 nid_t start, unsigned int nr, struct nat_entry_set **ep)
239 {
240         return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
241                                                         start, nr);
242 }
243
244 int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
245 {
246         struct f2fs_nm_info *nm_i = NM_I(sbi);
247         struct nat_entry *e;
248         bool need = false;
249
250         down_read(&nm_i->nat_tree_lock);
251         e = __lookup_nat_cache(nm_i, nid);
252         if (e) {
253                 if (!get_nat_flag(e, IS_CHECKPOINTED) &&
254                                 !get_nat_flag(e, HAS_FSYNCED_INODE))
255                         need = true;
256         }
257         up_read(&nm_i->nat_tree_lock);
258         return need;
259 }
260
261 bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
262 {
263         struct f2fs_nm_info *nm_i = NM_I(sbi);
264         struct nat_entry *e;
265         bool is_cp = true;
266
267         down_read(&nm_i->nat_tree_lock);
268         e = __lookup_nat_cache(nm_i, nid);
269         if (e && !get_nat_flag(e, IS_CHECKPOINTED))
270                 is_cp = false;
271         up_read(&nm_i->nat_tree_lock);
272         return is_cp;
273 }
274
275 bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
276 {
277         struct f2fs_nm_info *nm_i = NM_I(sbi);
278         struct nat_entry *e;
279         bool need_update = true;
280
281         down_read(&nm_i->nat_tree_lock);
282         e = __lookup_nat_cache(nm_i, ino);
283         if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
284                         (get_nat_flag(e, IS_CHECKPOINTED) ||
285                          get_nat_flag(e, HAS_FSYNCED_INODE)))
286                 need_update = false;
287         up_read(&nm_i->nat_tree_lock);
288         return need_update;
289 }
290
291 /* must be locked by nat_tree_lock */
292 static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
293                                                 struct f2fs_nat_entry *ne)
294 {
295         struct f2fs_nm_info *nm_i = NM_I(sbi);
296         struct nat_entry *new, *e;
297
298         new = __alloc_nat_entry(nid, false);
299         if (!new)
300                 return;
301
302         down_write(&nm_i->nat_tree_lock);
303         e = __lookup_nat_cache(nm_i, nid);
304         if (!e)
305                 e = __init_nat_entry(nm_i, new, ne, false);
306         else
307                 f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
308                                 nat_get_blkaddr(e) !=
309                                         le32_to_cpu(ne->block_addr) ||
310                                 nat_get_version(e) != ne->version);
311         up_write(&nm_i->nat_tree_lock);
312         if (e != new)
313                 __free_nat_entry(new);
314 }
315
316 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
317                         block_t new_blkaddr, bool fsync_done)
318 {
319         struct f2fs_nm_info *nm_i = NM_I(sbi);
320         struct nat_entry *e;
321         struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
322
323         down_write(&nm_i->nat_tree_lock);
324         e = __lookup_nat_cache(nm_i, ni->nid);
325         if (!e) {
326                 e = __init_nat_entry(nm_i, new, NULL, true);
327                 copy_node_info(&e->ni, ni);
328                 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
329         } else if (new_blkaddr == NEW_ADDR) {
330                 /*
331                  * when nid is reallocated,
332                  * the previous nat entry can remain in the nat cache.
333                  * So, reinitialize it with new information.
334                  */
335                 copy_node_info(&e->ni, ni);
336                 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
337         }
338         /* let's free early to reduce memory consumption */
339         if (e != new)
340                 __free_nat_entry(new);
341
342         /* sanity check */
343         f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
344         f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
345                         new_blkaddr == NULL_ADDR);
346         f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
347                         new_blkaddr == NEW_ADDR);
348         f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
349                         nat_get_blkaddr(e) != NULL_ADDR &&
350                         new_blkaddr == NEW_ADDR);
351
352         /* increment version no as node is removed */
353         if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
354                 unsigned char version = nat_get_version(e);
355                 nat_set_version(e, inc_node_version(version));
356         }
357
358         /* change address */
359         nat_set_blkaddr(e, new_blkaddr);
360         if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
361                 set_nat_flag(e, IS_CHECKPOINTED, false);
362         __set_nat_cache_dirty(nm_i, e);
363
364         /* update fsync_mark if its inode nat entry is still alive */
365         if (ni->nid != ni->ino)
366                 e = __lookup_nat_cache(nm_i, ni->ino);
367         if (e) {
368                 if (fsync_done && ni->nid == ni->ino)
369                         set_nat_flag(e, HAS_FSYNCED_INODE, true);
370                 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
371         }
372         up_write(&nm_i->nat_tree_lock);
373 }
374
375 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
376 {
377         struct f2fs_nm_info *nm_i = NM_I(sbi);
378         int nr = nr_shrink;
379
380         if (!down_write_trylock(&nm_i->nat_tree_lock))
381                 return 0;
382
383         while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
384                 struct nat_entry *ne;
385                 ne = list_first_entry(&nm_i->nat_entries,
386                                         struct nat_entry, list);
387                 __del_from_nat_cache(nm_i, ne);
388                 nr_shrink--;
389         }
390         up_write(&nm_i->nat_tree_lock);
391         return nr - nr_shrink;
392 }
393
394 /*
395  * This function always returns success
396  */
397 void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
398 {
399         struct f2fs_nm_info *nm_i = NM_I(sbi);
400         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
401         struct f2fs_journal *journal = curseg->journal;
402         nid_t start_nid = START_NID(nid);
403         struct f2fs_nat_block *nat_blk;
404         struct page *page = NULL;
405         struct f2fs_nat_entry ne;
406         struct nat_entry *e;
407         pgoff_t index;
408         int i;
409
410         ni->nid = nid;
411
412         /* Check nat cache */
413         down_read(&nm_i->nat_tree_lock);
414         e = __lookup_nat_cache(nm_i, nid);
415         if (e) {
416                 ni->ino = nat_get_ino(e);
417                 ni->blk_addr = nat_get_blkaddr(e);
418                 ni->version = nat_get_version(e);
419                 up_read(&nm_i->nat_tree_lock);
420                 return;
421         }
422
423         memset(&ne, 0, sizeof(struct f2fs_nat_entry));
424
425         /* Check current segment summary */
426         down_read(&curseg->journal_rwsem);
427         i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
428         if (i >= 0) {
429                 ne = nat_in_journal(journal, i);
430                 node_info_from_raw_nat(ni, &ne);
431         }
432         up_read(&curseg->journal_rwsem);
433         if (i >= 0) {
434                 up_read(&nm_i->nat_tree_lock);
435                 goto cache;
436         }
437
438         /* Fill node_info from nat page */
439         index = current_nat_addr(sbi, nid);
440         up_read(&nm_i->nat_tree_lock);
441
442         page = get_meta_page(sbi, index);
443         nat_blk = (struct f2fs_nat_block *)page_address(page);
444         ne = nat_blk->entries[nid - start_nid];
445         node_info_from_raw_nat(ni, &ne);
446         f2fs_put_page(page, 1);
447 cache:
448         /* cache nat entry */
449         cache_nat_entry(sbi, nid, &ne);
450 }
451
452 /*
453  * readahead MAX_RA_NODE number of node pages.
454  */
455 static void ra_node_pages(struct page *parent, int start, int n)
456 {
457         struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
458         struct blk_plug plug;
459         int i, end;
460         nid_t nid;
461
462         blk_start_plug(&plug);
463
464         /* try readahead for siblings of the desired node */
465         end = start + n;
466         end = min(end, NIDS_PER_BLOCK);
467         for (i = start; i < end; i++) {
468                 nid = get_nid(parent, i, false);
469                 ra_node_page(sbi, nid);
470         }
471
472         blk_finish_plug(&plug);
473 }
474
475 pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
476 {
477         const long direct_index = ADDRS_PER_INODE(dn->inode);
478         const long direct_blks = ADDRS_PER_BLOCK;
479         const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
480         unsigned int skipped_unit = ADDRS_PER_BLOCK;
481         int cur_level = dn->cur_level;
482         int max_level = dn->max_level;
483         pgoff_t base = 0;
484
485         if (!dn->max_level)
486                 return pgofs + 1;
487
488         while (max_level-- > cur_level)
489                 skipped_unit *= NIDS_PER_BLOCK;
490
491         switch (dn->max_level) {
492         case 3:
493                 base += 2 * indirect_blks;
494         case 2:
495                 base += 2 * direct_blks;
496         case 1:
497                 base += direct_index;
498                 break;
499         default:
500                 f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
501         }
502
503         return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
504 }
505
506 /*
507  * The maximum depth is four.
508  * Offset[0] will have raw inode offset.
509  */
510 static int get_node_path(struct inode *inode, long block,
511                                 int offset[4], unsigned int noffset[4])
512 {
513         const long direct_index = ADDRS_PER_INODE(inode);
514         const long direct_blks = ADDRS_PER_BLOCK;
515         const long dptrs_per_blk = NIDS_PER_BLOCK;
516         const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
517         const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
518         int n = 0;
519         int level = 0;
520
521         noffset[0] = 0;
522
523         if (block < direct_index) {
524                 offset[n] = block;
525                 goto got;
526         }
527         block -= direct_index;
528         if (block < direct_blks) {
529                 offset[n++] = NODE_DIR1_BLOCK;
530                 noffset[n] = 1;
531                 offset[n] = block;
532                 level = 1;
533                 goto got;
534         }
535         block -= direct_blks;
536         if (block < direct_blks) {
537                 offset[n++] = NODE_DIR2_BLOCK;
538                 noffset[n] = 2;
539                 offset[n] = block;
540                 level = 1;
541                 goto got;
542         }
543         block -= direct_blks;
544         if (block < indirect_blks) {
545                 offset[n++] = NODE_IND1_BLOCK;
546                 noffset[n] = 3;
547                 offset[n++] = block / direct_blks;
548                 noffset[n] = 4 + offset[n - 1];
549                 offset[n] = block % direct_blks;
550                 level = 2;
551                 goto got;
552         }
553         block -= indirect_blks;
554         if (block < indirect_blks) {
555                 offset[n++] = NODE_IND2_BLOCK;
556                 noffset[n] = 4 + dptrs_per_blk;
557                 offset[n++] = block / direct_blks;
558                 noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
559                 offset[n] = block % direct_blks;
560                 level = 2;
561                 goto got;
562         }
563         block -= indirect_blks;
564         if (block < dindirect_blks) {
565                 offset[n++] = NODE_DIND_BLOCK;
566                 noffset[n] = 5 + (dptrs_per_blk * 2);
567                 offset[n++] = block / indirect_blks;
568                 noffset[n] = 6 + (dptrs_per_blk * 2) +
569                               offset[n - 1] * (dptrs_per_blk + 1);
570                 offset[n++] = (block / direct_blks) % dptrs_per_blk;
571                 noffset[n] = 7 + (dptrs_per_blk * 2) +
572                               offset[n - 2] * (dptrs_per_blk + 1) +
573                               offset[n - 1];
574                 offset[n] = block % direct_blks;
575                 level = 3;
576                 goto got;
577         } else {
578                 return -E2BIG;
579         }
580 got:
581         return level;
582 }
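
As a worked example of the boundaries get_node_path() walks through, the
stripped-down, user-space helper below classifies only the level (0..3),
assuming the common geometry ADDRS_PER_INODE = 923 and ADDRS_PER_BLOCK =
NIDS_PER_BLOCK = 1018; both depend on the on-disk inode layout, so treat the
numbers as illustrative rather than authoritative:

#include <stdio.h>

/* Returns only the level that get_node_path() would report for a file
 * block index, under the assumed default geometry. */
static int node_level_for(long block)
{
	const long direct_index = 923;		/* addrs held in the inode */
	const long direct_blks = 1018;		/* addrs per direct node */
	const long indirect_blks = 1018L * 1018;
	const long dindirect_blks = indirect_blks * 1018;

	if (block < direct_index)
		return 0;			/* stored in the inode */
	block -= direct_index;
	if (block < 2 * direct_blks)
		return 1;			/* NODE_DIR1/2_BLOCK */
	block -= 2 * direct_blks;
	if (block < 2 * indirect_blks)
		return 2;			/* NODE_IND1/2_BLOCK */
	block -= 2 * indirect_blks;
	if (block < dindirect_blks)
		return 3;			/* NODE_DIND_BLOCK */
	return -1;				/* beyond the maximum file size */
}

int main(void)
{
	/* prints "0 1 2 3", one block index from each region */
	printf("%d %d %d %d\n", node_level_for(0), node_level_for(2000),
	       node_level_for(10000), node_level_for(3000000));
	return 0;
}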
583
584 /*
585  * Caller should call f2fs_put_dnode(dn).
586  * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
587  * f2fs_unlock_op() only if the mode is not RDONLY_NODE.
588  * In the case of RDONLY_NODE, we don't need to care about the mutex.
589  */
590 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
591 {
592         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
593         struct page *npage[4];
594         struct page *parent = NULL;
595         int offset[4];
596         unsigned int noffset[4];
597         nid_t nids[4];
598         int level, i = 0;
599         int err = 0;
600
601         level = get_node_path(dn->inode, index, offset, noffset);
602         if (level < 0)
603                 return level;
604
605         nids[0] = dn->inode->i_ino;
606         npage[0] = dn->inode_page;
607
608         if (!npage[0]) {
609                 npage[0] = get_node_page(sbi, nids[0]);
610                 if (IS_ERR(npage[0]))
611                         return PTR_ERR(npage[0]);
612         }
613
614         /* if inline_data is set, should not report any block indices */
615         if (f2fs_has_inline_data(dn->inode) && index) {
616                 err = -ENOENT;
617                 f2fs_put_page(npage[0], 1);
618                 goto release_out;
619         }
620
621         parent = npage[0];
622         if (level != 0)
623                 nids[1] = get_nid(parent, offset[0], true);
624         dn->inode_page = npage[0];
625         dn->inode_page_locked = true;
626
627         /* get indirect or direct nodes */
628         for (i = 1; i <= level; i++) {
629                 bool done = false;
630
631                 if (!nids[i] && mode == ALLOC_NODE) {
632                         /* alloc new node */
633                         if (!alloc_nid(sbi, &(nids[i]))) {
634                                 err = -ENOSPC;
635                                 goto release_pages;
636                         }
637
638                         dn->nid = nids[i];
639                         npage[i] = new_node_page(dn, noffset[i]);
640                         if (IS_ERR(npage[i])) {
641                                 alloc_nid_failed(sbi, nids[i]);
642                                 err = PTR_ERR(npage[i]);
643                                 goto release_pages;
644                         }
645
646                         set_nid(parent, offset[i - 1], nids[i], i == 1);
647                         alloc_nid_done(sbi, nids[i]);
648                         done = true;
649                 } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
650                         npage[i] = get_node_page_ra(parent, offset[i - 1]);
651                         if (IS_ERR(npage[i])) {
652                                 err = PTR_ERR(npage[i]);
653                                 goto release_pages;
654                         }
655                         done = true;
656                 }
657                 if (i == 1) {
658                         dn->inode_page_locked = false;
659                         unlock_page(parent);
660                 } else {
661                         f2fs_put_page(parent, 1);
662                 }
663
664                 if (!done) {
665                         npage[i] = get_node_page(sbi, nids[i]);
666                         if (IS_ERR(npage[i])) {
667                                 err = PTR_ERR(npage[i]);
668                                 f2fs_put_page(npage[0], 0);
669                                 goto release_out;
670                         }
671                 }
672                 if (i < level) {
673                         parent = npage[i];
674                         nids[i + 1] = get_nid(parent, offset[i], false);
675                 }
676         }
677         dn->nid = nids[level];
678         dn->ofs_in_node = offset[level];
679         dn->node_page = npage[level];
680         dn->data_blkaddr = datablock_addr(dn->inode,
681                                 dn->node_page, dn->ofs_in_node);
682         return 0;
683
684 release_pages:
685         f2fs_put_page(parent, 1);
686         if (i > 1)
687                 f2fs_put_page(npage[0], 0);
688 release_out:
689         dn->inode_page = NULL;
690         dn->node_page = NULL;
691         if (err == -ENOENT) {
692                 dn->cur_level = i;
693                 dn->max_level = level;
694                 dn->ofs_in_node = offset[level];
695         }
696         return err;
697 }
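
A hedged sketch of a typical caller, following the locking comment above:
the helper name example_map_block is hypothetical, but f2fs_lock_op(),
f2fs_unlock_op(), set_new_dnode() and f2fs_put_dnode() are the interfaces
that comment refers to.

/* Illustrative caller (not part of node.c): look up or allocate the dnode
 * for one file block under the op lock and read back its block address. */
static int example_map_block(struct inode *inode, pgoff_t index,
							block_t *blkaddr)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	struct dnode_of_data dn;
	int err;

	f2fs_lock_op(sbi);
	set_new_dnode(&dn, inode, NULL, NULL, 0);
	err = get_dnode_of_data(&dn, index, ALLOC_NODE);
	if (!err) {
		*blkaddr = dn.data_blkaddr;
		f2fs_put_dnode(&dn);
	}
	f2fs_unlock_op(sbi);
	return err;
}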
698
699 static void truncate_node(struct dnode_of_data *dn)
700 {
701         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
702         struct node_info ni;
703
704         get_node_info(sbi, dn->nid, &ni);
705         f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
706
707         /* Deallocate node address */
708         invalidate_blocks(sbi, ni.blk_addr);
709         dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
710         set_node_addr(sbi, &ni, NULL_ADDR, false);
711
712         if (dn->nid == dn->inode->i_ino) {
713                 remove_orphan_inode(sbi, dn->nid);
714                 dec_valid_inode_count(sbi);
715                 f2fs_inode_synced(dn->inode);
716         }
717
718         clear_node_page_dirty(dn->node_page);
719         set_sbi_flag(sbi, SBI_IS_DIRTY);
720
721         f2fs_put_page(dn->node_page, 1);
722
723         invalidate_mapping_pages(NODE_MAPPING(sbi),
724                         dn->node_page->index, dn->node_page->index);
725
726         dn->node_page = NULL;
727         trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
728 }
729
730 static int truncate_dnode(struct dnode_of_data *dn)
731 {
732         struct page *page;
733
734         if (dn->nid == 0)
735                 return 1;
736
737         /* get direct node */
738         page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
739         if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
740                 return 1;
741         else if (IS_ERR(page))
742                 return PTR_ERR(page);
743
744         /* Make dnode_of_data for parameter */
745         dn->node_page = page;
746         dn->ofs_in_node = 0;
747         truncate_data_blocks(dn);
748         truncate_node(dn);
749         return 1;
750 }
751
752 static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
753                                                 int ofs, int depth)
754 {
755         struct dnode_of_data rdn = *dn;
756         struct page *page;
757         struct f2fs_node *rn;
758         nid_t child_nid;
759         unsigned int child_nofs;
760         int freed = 0;
761         int i, ret;
762
763         if (dn->nid == 0)
764                 return NIDS_PER_BLOCK + 1;
765
766         trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
767
768         page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
769         if (IS_ERR(page)) {
770                 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
771                 return PTR_ERR(page);
772         }
773
774         ra_node_pages(page, ofs, NIDS_PER_BLOCK);
775
776         rn = F2FS_NODE(page);
777         if (depth < 3) {
778                 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
779                         child_nid = le32_to_cpu(rn->in.nid[i]);
780                         if (child_nid == 0)
781                                 continue;
782                         rdn.nid = child_nid;
783                         ret = truncate_dnode(&rdn);
784                         if (ret < 0)
785                                 goto out_err;
786                         if (set_nid(page, i, 0, false))
787                                 dn->node_changed = true;
788                 }
789         } else {
790                 child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
791                 for (i = ofs; i < NIDS_PER_BLOCK; i++) {
792                         child_nid = le32_to_cpu(rn->in.nid[i]);
793                         if (child_nid == 0) {
794                                 child_nofs += NIDS_PER_BLOCK + 1;
795                                 continue;
796                         }
797                         rdn.nid = child_nid;
798                         ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
799                         if (ret == (NIDS_PER_BLOCK + 1)) {
800                                 if (set_nid(page, i, 0, false))
801                                         dn->node_changed = true;
802                                 child_nofs += ret;
803                         } else if (ret < 0 && ret != -ENOENT) {
804                                 goto out_err;
805                         }
806                 }
807                 freed = child_nofs;
808         }
809
810         if (!ofs) {
811                 /* remove current indirect node */
812                 dn->node_page = page;
813                 truncate_node(dn);
814                 freed++;
815         } else {
816                 f2fs_put_page(page, 1);
817         }
818         trace_f2fs_truncate_nodes_exit(dn->inode, freed);
819         return freed;
820
821 out_err:
822         f2fs_put_page(page, 1);
823         trace_f2fs_truncate_nodes_exit(dn->inode, ret);
824         return ret;
825 }
826
827 static int truncate_partial_nodes(struct dnode_of_data *dn,
828                         struct f2fs_inode *ri, int *offset, int depth)
829 {
830         struct page *pages[2];
831         nid_t nid[3];
832         nid_t child_nid;
833         int err = 0;
834         int i;
835         int idx = depth - 2;
836
837         nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
838         if (!nid[0])
839                 return 0;
840
841         /* get indirect nodes in the path */
842         for (i = 0; i < idx + 1; i++) {
843                 /* the reference count will be increased */
844                 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
845                 if (IS_ERR(pages[i])) {
846                         err = PTR_ERR(pages[i]);
847                         idx = i - 1;
848                         goto fail;
849                 }
850                 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
851         }
852
853         ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
854
855         /* free direct nodes linked to a partial indirect node */
856         for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
857                 child_nid = get_nid(pages[idx], i, false);
858                 if (!child_nid)
859                         continue;
860                 dn->nid = child_nid;
861                 err = truncate_dnode(dn);
862                 if (err < 0)
863                         goto fail;
864                 if (set_nid(pages[idx], i, 0, false))
865                         dn->node_changed = true;
866         }
867
868         if (offset[idx + 1] == 0) {
869                 dn->node_page = pages[idx];
870                 dn->nid = nid[idx];
871                 truncate_node(dn);
872         } else {
873                 f2fs_put_page(pages[idx], 1);
874         }
875         offset[idx]++;
876         offset[idx + 1] = 0;
877         idx--;
878 fail:
879         for (i = idx; i >= 0; i--)
880                 f2fs_put_page(pages[i], 1);
881
882         trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
883
884         return err;
885 }
886
887 /*
888  * All the block addresses of data and nodes should be nullified.
889  */
890 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
891 {
892         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
893         int err = 0, cont = 1;
894         int level, offset[4], noffset[4];
895         unsigned int nofs = 0;
896         struct f2fs_inode *ri;
897         struct dnode_of_data dn;
898         struct page *page;
899
900         trace_f2fs_truncate_inode_blocks_enter(inode, from);
901
902         level = get_node_path(inode, from, offset, noffset);
903         if (level < 0)
904                 return level;
905
906         page = get_node_page(sbi, inode->i_ino);
907         if (IS_ERR(page)) {
908                 trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
909                 return PTR_ERR(page);
910         }
911
912         set_new_dnode(&dn, inode, page, NULL, 0);
913         unlock_page(page);
914
915         ri = F2FS_INODE(page);
916         switch (level) {
917         case 0:
918         case 1:
919                 nofs = noffset[1];
920                 break;
921         case 2:
922                 nofs = noffset[1];
923                 if (!offset[level - 1])
924                         goto skip_partial;
925                 err = truncate_partial_nodes(&dn, ri, offset, level);
926                 if (err < 0 && err != -ENOENT)
927                         goto fail;
928                 nofs += 1 + NIDS_PER_BLOCK;
929                 break;
930         case 3:
931                 nofs = 5 + 2 * NIDS_PER_BLOCK;
932                 if (!offset[level - 1])
933                         goto skip_partial;
934                 err = truncate_partial_nodes(&dn, ri, offset, level);
935                 if (err < 0 && err != -ENOENT)
936                         goto fail;
937                 break;
938         default:
939                 BUG();
940         }
941
942 skip_partial:
943         while (cont) {
944                 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
945                 switch (offset[0]) {
946                 case NODE_DIR1_BLOCK:
947                 case NODE_DIR2_BLOCK:
948                         err = truncate_dnode(&dn);
949                         break;
950
951                 case NODE_IND1_BLOCK:
952                 case NODE_IND2_BLOCK:
953                         err = truncate_nodes(&dn, nofs, offset[1], 2);
954                         break;
955
956                 case NODE_DIND_BLOCK:
957                         err = truncate_nodes(&dn, nofs, offset[1], 3);
958                         cont = 0;
959                         break;
960
961                 default:
962                         BUG();
963                 }
964                 if (err < 0 && err != -ENOENT)
965                         goto fail;
966                 if (offset[1] == 0 &&
967                                 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
968                         lock_page(page);
969                         BUG_ON(page->mapping != NODE_MAPPING(sbi));
970                         f2fs_wait_on_page_writeback(page, NODE, true);
971                         ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
972                         set_page_dirty(page);
973                         unlock_page(page);
974                 }
975                 offset[1] = 0;
976                 offset[0]++;
977                 nofs += err;
978         }
979 fail:
980         f2fs_put_page(page, 0);
981         trace_f2fs_truncate_inode_blocks_exit(inode, err);
982         return err > 0 ? 0 : err;
983 }
984
985 /* caller must lock inode page */
986 int truncate_xattr_node(struct inode *inode)
987 {
988         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
989         nid_t nid = F2FS_I(inode)->i_xattr_nid;
990         struct dnode_of_data dn;
991         struct page *npage;
992
993         if (!nid)
994                 return 0;
995
996         npage = get_node_page(sbi, nid);
997         if (IS_ERR(npage))
998                 return PTR_ERR(npage);
999
1000         f2fs_i_xnid_write(inode, 0);
1001
1002         set_new_dnode(&dn, inode, NULL, npage, nid);
1003         truncate_node(&dn);
1004         return 0;
1005 }
1006
1007 /*
1008  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
1009  * f2fs_unlock_op().
1010  */
1011 int remove_inode_page(struct inode *inode)
1012 {
1013         struct dnode_of_data dn;
1014         int err;
1015
1016         set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1017         err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
1018         if (err)
1019                 return err;
1020
1021         err = truncate_xattr_node(inode);
1022         if (err) {
1023                 f2fs_put_dnode(&dn);
1024                 return err;
1025         }
1026
1027         /* remove potential inline_data blocks */
1028         if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1029                                 S_ISLNK(inode->i_mode))
1030                 truncate_data_blocks_range(&dn, 1);
1031
1032         /* 0 is possible, after f2fs_new_inode() has failed */
1033         f2fs_bug_on(F2FS_I_SB(inode),
1034                         inode->i_blocks != 0 && inode->i_blocks != 8);
1035
1036         /* will put inode & node pages */
1037         truncate_node(&dn);
1038         return 0;
1039 }
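
The same contract can be sketched for this function; example_drop_inode_page
below is a hypothetical helper, shown only to make the f2fs_lock_op() /
f2fs_unlock_op() pairing required by the comment above concrete.

/* Illustrative caller (not part of node.c): remove_inode_page() must run
 * between f2fs_lock_op() and f2fs_unlock_op(). */
static int example_drop_inode_page(struct inode *inode)
{
	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
	int err;

	f2fs_lock_op(sbi);
	err = remove_inode_page(inode);
	f2fs_unlock_op(sbi);
	return err;
}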
1040
1041 struct page *new_inode_page(struct inode *inode)
1042 {
1043         struct dnode_of_data dn;
1044
1045         /* allocate inode page for new inode */
1046         set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1047
1048         /* caller should f2fs_put_page(page, 1); */
1049         return new_node_page(&dn, 0);
1050 }
1051
1052 struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
1053 {
1054         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1055         struct node_info new_ni;
1056         struct page *page;
1057         int err;
1058
1059         if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
1060                 return ERR_PTR(-EPERM);
1061
1062         page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
1063         if (!page)
1064                 return ERR_PTR(-ENOMEM);
1065
1066         if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
1067                 goto fail;
1068
1069 #ifdef CONFIG_F2FS_CHECK_FS
1070         get_node_info(sbi, dn->nid, &new_ni);
1071         f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
1072 #endif
1073         new_ni.nid = dn->nid;
1074         new_ni.ino = dn->inode->i_ino;
1075         new_ni.blk_addr = NULL_ADDR;
1076         new_ni.flag = 0;
1077         new_ni.version = 0;
1078         set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1079
1080         f2fs_wait_on_page_writeback(page, NODE, true);
1081         fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
1082         set_cold_node(dn->inode, page);
1083         if (!PageUptodate(page))
1084                 SetPageUptodate(page);
1085         if (set_page_dirty(page))
1086                 dn->node_changed = true;
1087
1088         if (f2fs_has_xattr_block(ofs))
1089                 f2fs_i_xnid_write(dn->inode, dn->nid);
1090
1091         if (ofs == 0)
1092                 inc_valid_inode_count(sbi);
1093         return page;
1094
1095 fail:
1096         clear_node_page_dirty(page);
1097         f2fs_put_page(page, 1);
1098         return ERR_PTR(err);
1099 }
1100
1101 /*
1102  * Caller should act as follows depending on the return value:
1103  * 0: f2fs_put_page(page, 0)
1104  * LOCKED_PAGE or error: f2fs_put_page(page, 1)
1105  */
1106 static int read_node_page(struct page *page, int op_flags)
1107 {
1108         struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1109         struct node_info ni;
1110         struct f2fs_io_info fio = {
1111                 .sbi = sbi,
1112                 .type = NODE,
1113                 .op = REQ_OP_READ,
1114                 .op_flags = op_flags,
1115                 .page = page,
1116                 .encrypted_page = NULL,
1117         };
1118
1119         if (PageUptodate(page))
1120                 return LOCKED_PAGE;
1121
1122         get_node_info(sbi, page->index, &ni);
1123
1124         if (unlikely(ni.blk_addr == NULL_ADDR)) {
1125                 ClearPageUptodate(page);
1126                 return -ENOENT;
1127         }
1128
1129         fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1130         return f2fs_submit_page_bio(&fio);
1131 }
1132
1133 /*
1134  * Readahead a node page
1135  */
1136 void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1137 {
1138         struct page *apage;
1139         int err;
1140
1141         if (!nid)
1142                 return;
1143         f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1144
1145         rcu_read_lock();
1146         apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
1147         rcu_read_unlock();
1148         if (apage)
1149                 return;
1150
1151         apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1152         if (!apage)
1153                 return;
1154
1155         err = read_node_page(apage, REQ_RAHEAD);
1156         f2fs_put_page(apage, err ? 1 : 0);
1157 }
1158
1159 static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1160                                         struct page *parent, int start)
1161 {
1162         struct page *page;
1163         int err;
1164
1165         if (!nid)
1166                 return ERR_PTR(-ENOENT);
1167         f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1168 repeat:
1169         page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1170         if (!page)
1171                 return ERR_PTR(-ENOMEM);
1172
1173         err = read_node_page(page, 0);
1174         if (err < 0) {
1175                 f2fs_put_page(page, 1);
1176                 return ERR_PTR(err);
1177         } else if (err == LOCKED_PAGE) {
1178                 err = 0;
1179                 goto page_hit;
1180         }
1181
1182         if (parent)
1183                 ra_node_pages(parent, start + 1, MAX_RA_NODE);
1184
1185         lock_page(page);
1186
1187         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1188                 f2fs_put_page(page, 1);
1189                 goto repeat;
1190         }
1191
1192         if (unlikely(!PageUptodate(page))) {
1193                 err = -EIO;
1194                 goto out_err;
1195         }
1196
1197         if (!f2fs_inode_chksum_verify(sbi, page)) {
1198                 err = -EBADMSG;
1199                 goto out_err;
1200         }
1201 page_hit:
1202         if (unlikely(nid != nid_of_node(page))) {
1203                 f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
1204                         "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
1205                         nid, nid_of_node(page), ino_of_node(page),
1206                         ofs_of_node(page), cpver_of_node(page),
1207                         next_blkaddr_of_node(page));
1208                 err = -EINVAL;
1209 out_err:
1210                 ClearPageUptodate(page);
1211                 f2fs_put_page(page, 1);
1212                 return ERR_PTR(err);
1213         }
1214         return page;
1215 }
1216
1217 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
1218 {
1219         return __get_node_page(sbi, nid, NULL, 0);
1220 }
1221
1222 struct page *get_node_page_ra(struct page *parent, int start)
1223 {
1224         struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1225         nid_t nid = get_nid(parent, start, false);
1226
1227         return __get_node_page(sbi, nid, parent, start);
1228 }
1229
1230 static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1231 {
1232         struct inode *inode;
1233         struct page *page;
1234         int ret;
1235
1236         /* should flush inline_data before evict_inode */
1237         inode = ilookup(sbi->sb, ino);
1238         if (!inode)
1239                 return;
1240
1241         page = f2fs_pagecache_get_page(inode->i_mapping, 0,
1242                                         FGP_LOCK|FGP_NOWAIT, 0);
1243         if (!page)
1244                 goto iput_out;
1245
1246         if (!PageUptodate(page))
1247                 goto page_out;
1248
1249         if (!PageDirty(page))
1250                 goto page_out;
1251
1252         if (!clear_page_dirty_for_io(page))
1253                 goto page_out;
1254
1255         ret = f2fs_write_inline_data(inode, page);
1256         inode_dec_dirty_pages(inode);
1257         remove_dirty_inode(inode);
1258         if (ret)
1259                 set_page_dirty(page);
1260 page_out:
1261         f2fs_put_page(page, 1);
1262 iput_out:
1263         iput(inode);
1264 }
1265
1266 static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1267 {
1268         pgoff_t index;
1269         struct pagevec pvec;
1270         struct page *last_page = NULL;
1271         int nr_pages;
1272
1273         pagevec_init(&pvec);
1274         index = 0;
1275
1276         while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1277                                 PAGECACHE_TAG_DIRTY))) {
1278                 int i;
1279
1280                 for (i = 0; i < nr_pages; i++) {
1281                         struct page *page = pvec.pages[i];
1282
1283                         if (unlikely(f2fs_cp_error(sbi))) {
1284                                 f2fs_put_page(last_page, 0);
1285                                 pagevec_release(&pvec);
1286                                 return ERR_PTR(-EIO);
1287                         }
1288
1289                         if (!IS_DNODE(page) || !is_cold_node(page))
1290                                 continue;
1291                         if (ino_of_node(page) != ino)
1292                                 continue;
1293
1294                         lock_page(page);
1295
1296                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1297 continue_unlock:
1298                                 unlock_page(page);
1299                                 continue;
1300                         }
1301                         if (ino_of_node(page) != ino)
1302                                 goto continue_unlock;
1303
1304                         if (!PageDirty(page)) {
1305                                 /* someone wrote it for us */
1306                                 goto continue_unlock;
1307                         }
1308
1309                         if (last_page)
1310                                 f2fs_put_page(last_page, 0);
1311
1312                         get_page(page);
1313                         last_page = page;
1314                         unlock_page(page);
1315                 }
1316                 pagevec_release(&pvec);
1317                 cond_resched();
1318         }
1319         return last_page;
1320 }
1321
1322 static int __write_node_page(struct page *page, bool atomic, bool *submitted,
1323                                 struct writeback_control *wbc, bool do_balance,
1324                                 enum iostat_type io_type)
1325 {
1326         struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1327         nid_t nid;
1328         struct node_info ni;
1329         struct f2fs_io_info fio = {
1330                 .sbi = sbi,
1331                 .ino = ino_of_node(page),
1332                 .type = NODE,
1333                 .op = REQ_OP_WRITE,
1334                 .op_flags = wbc_to_write_flags(wbc),
1335                 .page = page,
1336                 .encrypted_page = NULL,
1337                 .submitted = false,
1338                 .io_type = io_type,
1339         };
1340
1341         trace_f2fs_writepage(page, NODE);
1342
1343         if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1344                 goto redirty_out;
1345         if (unlikely(f2fs_cp_error(sbi)))
1346                 goto redirty_out;
1347
1348         /* get old block addr of this node page */
1349         nid = nid_of_node(page);
1350         f2fs_bug_on(sbi, page->index != nid);
1351
1352         if (wbc->for_reclaim) {
1353                 if (!down_read_trylock(&sbi->node_write))
1354                         goto redirty_out;
1355         } else {
1356                 down_read(&sbi->node_write);
1357         }
1358
1359         get_node_info(sbi, nid, &ni);
1360
1361         /* This page is already truncated */
1362         if (unlikely(ni.blk_addr == NULL_ADDR)) {
1363                 ClearPageUptodate(page);
1364                 dec_page_count(sbi, F2FS_DIRTY_NODES);
1365                 up_read(&sbi->node_write);
1366                 unlock_page(page);
1367                 return 0;
1368         }
1369
1370         if (atomic && !test_opt(sbi, NOBARRIER))
1371                 fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
1372
1373         set_page_writeback(page);
1374         fio.old_blkaddr = ni.blk_addr;
1375         write_node_page(nid, &fio);
1376         set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1377         dec_page_count(sbi, F2FS_DIRTY_NODES);
1378         up_read(&sbi->node_write);
1379
1380         if (wbc->for_reclaim) {
1381                 f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0,
1382                                                 page->index, NODE);
1383                 submitted = NULL;
1384         }
1385
1386         unlock_page(page);
1387
1388         if (unlikely(f2fs_cp_error(sbi))) {
1389                 f2fs_submit_merged_write(sbi, NODE);
1390                 submitted = NULL;
1391         }
1392         if (submitted)
1393                 *submitted = fio.submitted;
1394
1395         if (do_balance)
1396                 f2fs_balance_fs(sbi, false);
1397         return 0;
1398
1399 redirty_out:
1400         redirty_page_for_writepage(wbc, page);
1401         return AOP_WRITEPAGE_ACTIVATE;
1402 }
1403
1404 void move_node_page(struct page *node_page, int gc_type)
1405 {
1406         if (gc_type == FG_GC) {
1407                 struct writeback_control wbc = {
1408                         .sync_mode = WB_SYNC_ALL,
1409                         .nr_to_write = 1,
1410                         .for_reclaim = 0,
1411                 };
1412
1413                 set_page_dirty(node_page);
1414                 f2fs_wait_on_page_writeback(node_page, NODE, true);
1415
1416                 f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
1417                 if (!clear_page_dirty_for_io(node_page))
1418                         goto out_page;
1419
1420                 if (__write_node_page(node_page, false, NULL,
1421                                         &wbc, false, FS_GC_NODE_IO))
1422                         unlock_page(node_page);
1423                 goto release_page;
1424         } else {
1425                 /* set page dirty and write it */
1426                 if (!PageWriteback(node_page))
1427                         set_page_dirty(node_page);
1428         }
1429 out_page:
1430         unlock_page(node_page);
1431 release_page:
1432         f2fs_put_page(node_page, 0);
1433 }
1434
1435 static int f2fs_write_node_page(struct page *page,
1436                                 struct writeback_control *wbc)
1437 {
1438         return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
1439 }
1440
1441 int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1442                         struct writeback_control *wbc, bool atomic)
1443 {
1444         pgoff_t index;
1445         pgoff_t last_idx = ULONG_MAX;
1446         struct pagevec pvec;
1447         int ret = 0;
1448         struct page *last_page = NULL;
1449         bool marked = false;
1450         nid_t ino = inode->i_ino;
1451         int nr_pages;
1452
1453         if (atomic) {
1454                 last_page = last_fsync_dnode(sbi, ino);
1455                 if (IS_ERR_OR_NULL(last_page))
1456                         return PTR_ERR_OR_ZERO(last_page);
1457         }
1458 retry:
1459         pagevec_init(&pvec);
1460         index = 0;
1461
1462         while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1463                                 PAGECACHE_TAG_DIRTY))) {
1464                 int i;
1465
1466                 for (i = 0; i < nr_pages; i++) {
1467                         struct page *page = pvec.pages[i];
1468                         bool submitted = false;
1469
1470                         if (unlikely(f2fs_cp_error(sbi))) {
1471                                 f2fs_put_page(last_page, 0);
1472                                 pagevec_release(&pvec);
1473                                 ret = -EIO;
1474                                 goto out;
1475                         }
1476
1477                         if (!IS_DNODE(page) || !is_cold_node(page))
1478                                 continue;
1479                         if (ino_of_node(page) != ino)
1480                                 continue;
1481
1482                         lock_page(page);
1483
1484                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1485 continue_unlock:
1486                                 unlock_page(page);
1487                                 continue;
1488                         }
1489                         if (ino_of_node(page) != ino)
1490                                 goto continue_unlock;
1491
1492                         if (!PageDirty(page) && page != last_page) {
1493                                 /* someone wrote it for us */
1494                                 goto continue_unlock;
1495                         }
1496
1497                         f2fs_wait_on_page_writeback(page, NODE, true);
1498                         BUG_ON(PageWriteback(page));
1499
1500                         set_fsync_mark(page, 0);
1501                         set_dentry_mark(page, 0);
1502
1503                         if (!atomic || page == last_page) {
1504                                 set_fsync_mark(page, 1);
1505                                 if (IS_INODE(page)) {
1506                                         if (is_inode_flag_set(inode,
1507                                                                 FI_DIRTY_INODE))
1508                                                 update_inode(inode, page);
1509                                         set_dentry_mark(page,
1510                                                 need_dentry_mark(sbi, ino));
1511                                 }
1512                                 /* may be written by another thread */
1513                                 if (!PageDirty(page))
1514                                         set_page_dirty(page);
1515                         }
1516
1517                         if (!clear_page_dirty_for_io(page))
1518                                 goto continue_unlock;
1519
1520                         ret = __write_node_page(page, atomic &&
1521                                                 page == last_page,
1522                                                 &submitted, wbc, true,
1523                                                 FS_NODE_IO);
1524                         if (ret) {
1525                                 unlock_page(page);
1526                                 f2fs_put_page(last_page, 0);
1527                                 break;
1528                         } else if (submitted) {
1529                                 last_idx = page->index;
1530                         }
1531
1532                         if (page == last_page) {
1533                                 f2fs_put_page(page, 0);
1534                                 marked = true;
1535                                 break;
1536                         }
1537                 }
1538                 pagevec_release(&pvec);
1539                 cond_resched();
1540
1541                 if (ret || marked)
1542                         break;
1543         }
1544         if (!ret && atomic && !marked) {
1545                 f2fs_msg(sbi->sb, KERN_DEBUG,
1546                         "Retry to write fsync mark: ino=%u, idx=%lx",
1547                                         ino, last_page->index);
1548                 lock_page(last_page);
1549                 f2fs_wait_on_page_writeback(last_page, NODE, true);
1550                 set_page_dirty(last_page);
1551                 unlock_page(last_page);
1552                 goto retry;
1553         }
1554 out:
1555         if (last_idx != ULONG_MAX)
1556                 f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE);
1557         return ret ? -EIO : 0;
1558 }
1559
1560 int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
1561                                 bool do_balance, enum iostat_type io_type)
1562 {
1563         pgoff_t index;
1564         struct pagevec pvec;
1565         int step = 0;
1566         int nwritten = 0;
1567         int ret = 0;
1568         int nr_pages;
1569
1570         pagevec_init(&pvec);
1571
1572 next_step:
1573         index = 0;
1574
1575         while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1576                                 PAGECACHE_TAG_DIRTY))) {
1577                 int i;
1578
1579                 for (i = 0; i < nr_pages; i++) {
1580                         struct page *page = pvec.pages[i];
1581                         bool submitted = false;
1582
1583                         if (unlikely(f2fs_cp_error(sbi))) {
1584                                 pagevec_release(&pvec);
1585                                 ret = -EIO;
1586                                 goto out;
1587                         }
1588
1589                         /*
1590                          * flushing sequence with step:
1591                          * 0. indirect nodes
1592                          * 1. dentry dnodes
1593                          * 2. file dnodes
1594                          */
1595                         if (step == 0 && IS_DNODE(page))
1596                                 continue;
1597                         if (step == 1 && (!IS_DNODE(page) ||
1598                                                 is_cold_node(page)))
1599                                 continue;
1600                         if (step == 2 && (!IS_DNODE(page) ||
1601                                                 !is_cold_node(page)))
1602                                 continue;
1603 lock_node:
1604                         if (!trylock_page(page))
1605                                 continue;
1606
1607                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1608 continue_unlock:
1609                                 unlock_page(page);
1610                                 continue;
1611                         }
1612
1613                         if (!PageDirty(page)) {
1614                                 /* someone wrote it for us */
1615                                 goto continue_unlock;
1616                         }
1617
1618                         /* flush inline_data */
1619                         if (is_inline_node(page)) {
1620                                 clear_inline_node(page);
1621                                 unlock_page(page);
1622                                 flush_inline_data(sbi, ino_of_node(page));
1623                                 goto lock_node;
1624                         }
1625
1626                         f2fs_wait_on_page_writeback(page, NODE, true);
1627
1628                         BUG_ON(PageWriteback(page));
1629                         if (!clear_page_dirty_for_io(page))
1630                                 goto continue_unlock;
1631
1632                         set_fsync_mark(page, 0);
1633                         set_dentry_mark(page, 0);
1634
1635                         ret = __write_node_page(page, false, &submitted,
1636                                                 wbc, do_balance, io_type);
1637                         if (ret)
1638                                 unlock_page(page);
1639                         else if (submitted)
1640                                 nwritten++;
1641
1642                         if (--wbc->nr_to_write == 0)
1643                                 break;
1644                 }
1645                 pagevec_release(&pvec);
1646                 cond_resched();
1647
1648                 if (wbc->nr_to_write == 0) {
1649                         step = 2;
1650                         break;
1651                 }
1652         }
1653
1654         if (step < 2) {
1655                 step++;
1656                 goto next_step;
1657         }
1658 out:
1659         if (nwritten)
1660                 f2fs_submit_merged_write(sbi, NODE);
1661         return ret;
1662 }
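/*
 * Illustrative sketch (not part of the original source): the three step
 * checks in sync_node_pages() boil down to one predicate that decides
 * whether a page belongs to the current flushing step.  A hypothetical
 * helper expressing the same ordering could look like:
 *
 *	static bool page_in_flush_step(struct page *page, int step)
 *	{
 *		if (step == 0)			// indirect nodes first
 *			return !IS_DNODE(page);
 *		if (step == 1)			// then dentry dnodes
 *			return IS_DNODE(page) && !is_cold_node(page);
 *		return IS_DNODE(page) && is_cold_node(page);	// file dnodes last
 *	}
 */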
1663
1664 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1665 {
1666         pgoff_t index = 0;
1667         struct pagevec pvec;
1668         int ret2, ret = 0;
1669         int nr_pages;
1670
1671         pagevec_init(&pvec);
1672
1673         while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1674                                 PAGECACHE_TAG_WRITEBACK))) {
1675                 int i;
1676
1677                 for (i = 0; i < nr_pages; i++) {
1678                         struct page *page = pvec.pages[i];
1679
1680                         if (ino && ino_of_node(page) == ino) {
1681                                 f2fs_wait_on_page_writeback(page, NODE, true);
1682                                 if (TestClearPageError(page))
1683                                         ret = -EIO;
1684                         }
1685                 }
1686                 pagevec_release(&pvec);
1687                 cond_resched();
1688         }
1689
1690         ret2 = filemap_check_errors(NODE_MAPPING(sbi));
1691         if (!ret)
1692                 ret = ret2;
1693         return ret;
1694 }
1695
1696 static int f2fs_write_node_pages(struct address_space *mapping,
1697                             struct writeback_control *wbc)
1698 {
1699         struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1700         struct blk_plug plug;
1701         long diff;
1702
1703         if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1704                 goto skip_write;
1705
1706         /* balance f2fs's metadata in the background */
1707         f2fs_balance_fs_bg(sbi);
1708
1709         /* collect a number of dirty node pages and write them together */
1710         if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1711                 goto skip_write;
1712
1713         trace_f2fs_writepages(mapping->host, wbc, NODE);
1714
1715         diff = nr_pages_to_write(sbi, NODE, wbc);
1716         wbc->sync_mode = WB_SYNC_NONE;
1717         blk_start_plug(&plug);
1718         sync_node_pages(sbi, wbc, true, FS_NODE_IO);
1719         blk_finish_plug(&plug);
1720         wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1721         return 0;
1722
1723 skip_write:
1724         wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1725         trace_f2fs_writepages(mapping->host, wbc, NODE);
1726         return 0;
1727 }
1728
1729 static int f2fs_set_node_page_dirty(struct page *page)
1730 {
1731         trace_f2fs_set_page_dirty(page, NODE);
1732
1733         if (!PageUptodate(page))
1734                 SetPageUptodate(page);
1735         if (!PageDirty(page)) {
1736                 f2fs_set_page_dirty_nobuffers(page);
1737                 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1738                 SetPagePrivate(page);
1739                 f2fs_trace_pid(page);
1740                 return 1;
1741         }
1742         return 0;
1743 }
1744
1745 /*
1746  * Structure of the f2fs node operations
1747  */
1748 const struct address_space_operations f2fs_node_aops = {
1749         .writepage      = f2fs_write_node_page,
1750         .writepages     = f2fs_write_node_pages,
1751         .set_page_dirty = f2fs_set_node_page_dirty,
1752         .invalidatepage = f2fs_invalidate_page,
1753         .releasepage    = f2fs_release_page,
1754 #ifdef CONFIG_MIGRATION
1755         .migratepage    = f2fs_migrate_page,
1756 #endif
1757 };
1758
1759 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1760                                                 nid_t n)
1761 {
1762         return radix_tree_lookup(&nm_i->free_nid_root, n);
1763 }
1764
1765 static int __insert_free_nid(struct f2fs_sb_info *sbi,
1766                         struct free_nid *i, enum nid_state state)
1767 {
1768         struct f2fs_nm_info *nm_i = NM_I(sbi);
1769
1770         int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
1771         if (err)
1772                 return err;
1773
1774         f2fs_bug_on(sbi, state != i->state);
1775         nm_i->nid_cnt[state]++;
1776         if (state == FREE_NID)
1777                 list_add_tail(&i->list, &nm_i->free_nid_list);
1778         return 0;
1779 }
1780
1781 static void __remove_free_nid(struct f2fs_sb_info *sbi,
1782                         struct free_nid *i, enum nid_state state)
1783 {
1784         struct f2fs_nm_info *nm_i = NM_I(sbi);
1785
1786         f2fs_bug_on(sbi, state != i->state);
1787         nm_i->nid_cnt[state]--;
1788         if (state == FREE_NID)
1789                 list_del(&i->list);
1790         radix_tree_delete(&nm_i->free_nid_root, i->nid);
1791 }
1792
1793 static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
1794                         enum nid_state org_state, enum nid_state dst_state)
1795 {
1796         struct f2fs_nm_info *nm_i = NM_I(sbi);
1797
1798         f2fs_bug_on(sbi, org_state != i->state);
1799         i->state = dst_state;
1800         nm_i->nid_cnt[org_state]--;
1801         nm_i->nid_cnt[dst_state]++;
1802
1803         switch (dst_state) {
1804         case PREALLOC_NID:
1805                 list_del(&i->list);
1806                 break;
1807         case FREE_NID:
1808                 list_add_tail(&i->list, &nm_i->free_nid_list);
1809                 break;
1810         default:
1811                 BUG_ON(1);
1812         }
1813 }
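/*
 * Summary note (added for clarity, not in the original source): free nid
 * entries follow a small state machine.  add_free_nid() inserts entries as
 * FREE_NID, alloc_nid() moves them to PREALLOC_NID, and alloc_nid_done() /
 * alloc_nid_failed() either drop the entry or move it back to FREE_NID.
 * Only FREE_NID entries are kept on nm_i->free_nid_list.
 */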
1814
1815 /* return whether the nid is recognized as free */
1816 static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1817 {
1818         struct f2fs_nm_info *nm_i = NM_I(sbi);
1819         struct free_nid *i, *e;
1820         struct nat_entry *ne;
1821         int err = -EINVAL;
1822         bool ret = false;
1823
1824         /* nid 0 should not be used */
1825         if (unlikely(nid == 0))
1826                 return false;
1827
1828         i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1829         i->nid = nid;
1830         i->state = FREE_NID;
1831
1832         if (radix_tree_preload(GFP_NOFS))
1833                 goto err;
1834
1835         spin_lock(&nm_i->nid_list_lock);
1836
1837         if (build) {
1838                 /*
1839                  *   Thread A             Thread B
1840                  *  - f2fs_create
1841                  *   - f2fs_new_inode
1842                  *    - alloc_nid
1843                  *     - __insert_nid_to_list(PREALLOC_NID)
1844                  *                     - f2fs_balance_fs_bg
1845                  *                      - build_free_nids
1846                  *                       - __build_free_nids
1847                  *                        - scan_nat_page
1848                  *                         - add_free_nid
1849                  *                          - __lookup_nat_cache
1850                  *  - f2fs_add_link
1851                  *   - init_inode_metadata
1852                  *    - new_inode_page
1853                  *     - new_node_page
1854                  *      - set_node_addr
1855                  *  - alloc_nid_done
1856                  *   - __remove_nid_from_list(PREALLOC_NID)
1857                  *                         - __insert_nid_to_list(FREE_NID)
1858                  */
1859                 ne = __lookup_nat_cache(nm_i, nid);
1860                 if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1861                                 nat_get_blkaddr(ne) != NULL_ADDR))
1862                         goto err_out;
1863
1864                 e = __lookup_free_nid_list(nm_i, nid);
1865                 if (e) {
1866                         if (e->state == FREE_NID)
1867                                 ret = true;
1868                         goto err_out;
1869                 }
1870         }
1871         ret = true;
1872         err = __insert_free_nid(sbi, i, FREE_NID);
1873 err_out:
1874         spin_unlock(&nm_i->nid_list_lock);
1875         radix_tree_preload_end();
1876 err:
1877         if (err)
1878                 kmem_cache_free(free_nid_slab, i);
1879         return ret;
1880 }
1881
1882 static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
1883 {
1884         struct f2fs_nm_info *nm_i = NM_I(sbi);
1885         struct free_nid *i;
1886         bool need_free = false;
1887
1888         spin_lock(&nm_i->nid_list_lock);
1889         i = __lookup_free_nid_list(nm_i, nid);
1890         if (i && i->state == FREE_NID) {
1891                 __remove_free_nid(sbi, i, FREE_NID);
1892                 need_free = true;
1893         }
1894         spin_unlock(&nm_i->nid_list_lock);
1895
1896         if (need_free)
1897                 kmem_cache_free(free_nid_slab, i);
1898 }
1899
1900 static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
1901                                                         bool set, bool build)
1902 {
1903         struct f2fs_nm_info *nm_i = NM_I(sbi);
1904         unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
1905         unsigned int nid_ofs = nid - START_NID(nid);
1906
1907         if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
1908                 return;
1909
1910         if (set) {
1911                 if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1912                         return;
1913                 __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1914                 nm_i->free_nid_count[nat_ofs]++;
1915         } else {
1916                 if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1917                         return;
1918                 __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1919                 if (!build)
1920                         nm_i->free_nid_count[nat_ofs]--;
1921         }
1922 }
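/*
 * Worked example (illustrative; assumes the usual 4KB block with 9-byte
 * NAT entries, i.e. NAT_ENTRY_PER_BLOCK == 455): for nid 5000,
 * nat_ofs = NAT_BLOCK_OFFSET(5000) = 5000 / 455 = 10 and
 * nid_ofs = 5000 - START_NID(5000) = 5000 - 4550 = 450, so this nid is
 * tracked by bit 450 of free_nid_bitmap[10].
 */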
1923
1924 static void scan_nat_page(struct f2fs_sb_info *sbi,
1925                         struct page *nat_page, nid_t start_nid)
1926 {
1927         struct f2fs_nm_info *nm_i = NM_I(sbi);
1928         struct f2fs_nat_block *nat_blk = page_address(nat_page);
1929         block_t blk_addr;
1930         unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
1931         int i;
1932
1933         if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
1934                 return;
1935
1936         __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
1937
1938         i = start_nid % NAT_ENTRY_PER_BLOCK;
1939
1940         for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1941                 bool freed = false;
1942
1943                 if (unlikely(start_nid >= nm_i->max_nid))
1944                         break;
1945
1946                 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1947                 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1948                 if (blk_addr == NULL_ADDR)
1949                         freed = add_free_nid(sbi, start_nid, true);
1950                 spin_lock(&NM_I(sbi)->nid_list_lock);
1951                 update_free_nid_bitmap(sbi, start_nid, freed, true);
1952                 spin_unlock(&NM_I(sbi)->nid_list_lock);
1953         }
1954 }
1955
1956 static void scan_curseg_cache(struct f2fs_sb_info *sbi)
1957 {
1958         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1959         struct f2fs_journal *journal = curseg->journal;
1960         int i;
1961
1962         down_read(&curseg->journal_rwsem);
1963         for (i = 0; i < nats_in_cursum(journal); i++) {
1964                 block_t addr;
1965                 nid_t nid;
1966
1967                 addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1968                 nid = le32_to_cpu(nid_in_journal(journal, i));
1969                 if (addr == NULL_ADDR)
1970                         add_free_nid(sbi, nid, true);
1971                 else
1972                         remove_free_nid(sbi, nid);
1973         }
1974         up_read(&curseg->journal_rwsem);
1975 }
1976
1977 static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
1978 {
1979         struct f2fs_nm_info *nm_i = NM_I(sbi);
1980         unsigned int i, idx;
1981         nid_t nid;
1982
1983         down_read(&nm_i->nat_tree_lock);
1984
1985         for (i = 0; i < nm_i->nat_blocks; i++) {
1986                 if (!test_bit_le(i, nm_i->nat_block_bitmap))
1987                         continue;
1988                 if (!nm_i->free_nid_count[i])
1989                         continue;
1990                 for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
1991                         idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
1992                                                 NAT_ENTRY_PER_BLOCK, idx);
1993                         if (idx >= NAT_ENTRY_PER_BLOCK)
1994                                 break;
1995
1996                         nid = i * NAT_ENTRY_PER_BLOCK + idx;
1997                         add_free_nid(sbi, nid, true);
1998
1999                         if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
2000                                 goto out;
2001                 }
2002         }
2003 out:
2004         scan_curseg_cache(sbi);
2005
2006         up_read(&nm_i->nat_tree_lock);
2007 }
2008
2009 static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2010 {
2011         struct f2fs_nm_info *nm_i = NM_I(sbi);
2012         int i = 0;
2013         nid_t nid = nm_i->next_scan_nid;
2014
2015         if (unlikely(nid >= nm_i->max_nid))
2016                 nid = 0;
2017
2018         /* Enough entries */
2019         if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2020                 return;
2021
2022         if (!sync && !available_free_memory(sbi, FREE_NIDS))
2023                 return;
2024
2025         if (!mount) {
2026                 /* try to find free nids in free_nid_bitmap */
2027                 scan_free_nid_bits(sbi);
2028
2029                 if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2030                         return;
2031         }
2032
2033         /* readahead nat pages to be scanned */
2034         ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
2035                                                         META_NAT, true);
2036
2037         down_read(&nm_i->nat_tree_lock);
2038
2039         while (1) {
2040                 struct page *page = get_current_nat_page(sbi, nid);
2041
2042                 scan_nat_page(sbi, page, nid);
2043                 f2fs_put_page(page, 1);
2044
2045                 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
2046                 if (unlikely(nid >= nm_i->max_nid))
2047                         nid = 0;
2048
2049                 if (++i >= FREE_NID_PAGES)
2050                         break;
2051         }
2052
2053         /* go on to the next NAT pages to find more free nids */
2054         nm_i->next_scan_nid = nid;
2055
2056         /* find free nids from current sum_pages */
2057         scan_curseg_cache(sbi);
2058
2059         up_read(&nm_i->nat_tree_lock);
2060
2061         ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
2062                                         nm_i->ra_nid_pages, META_NAT, false);
2063 }
2064
2065 void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2066 {
2067         mutex_lock(&NM_I(sbi)->build_lock);
2068         __build_free_nids(sbi, sync, mount);
2069         mutex_unlock(&NM_I(sbi)->build_lock);
2070 }
2071
2072 /*
2073  * If this function returns success, the caller can obtain a new nid
2074  * from the second parameter of this function.
2075  * The returned nid can be used as an ino as well as a nid when an inode is created.
2076  */
2077 bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
2078 {
2079         struct f2fs_nm_info *nm_i = NM_I(sbi);
2080         struct free_nid *i = NULL;
2081 retry:
2082 #ifdef CONFIG_F2FS_FAULT_INJECTION
2083         if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
2084                 f2fs_show_injection_info(FAULT_ALLOC_NID);
2085                 return false;
2086         }
2087 #endif
2088         spin_lock(&nm_i->nid_list_lock);
2089
2090         if (unlikely(nm_i->available_nids == 0)) {
2091                 spin_unlock(&nm_i->nid_list_lock);
2092                 return false;
2093         }
2094
2095         /* We should not use stale free nids created by build_free_nids */
2096         if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) {
2097                 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
2098                 i = list_first_entry(&nm_i->free_nid_list,
2099                                         struct free_nid, list);
2100                 *nid = i->nid;
2101
2102                 __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
2103                 nm_i->available_nids--;
2104
2105                 update_free_nid_bitmap(sbi, *nid, false, false);
2106
2107                 spin_unlock(&nm_i->nid_list_lock);
2108                 return true;
2109         }
2110         spin_unlock(&nm_i->nid_list_lock);
2111
2112         /* Let's scan NAT pages and their caches to get free nids */
2113         build_free_nids(sbi, true, false);
2114         goto retry;
2115 }
2116
2117 /*
2118  * alloc_nid() should be called prior to this function.
2119  */
2120 void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
2121 {
2122         struct f2fs_nm_info *nm_i = NM_I(sbi);
2123         struct free_nid *i;
2124
2125         spin_lock(&nm_i->nid_list_lock);
2126         i = __lookup_free_nid_list(nm_i, nid);
2127         f2fs_bug_on(sbi, !i);
2128         __remove_free_nid(sbi, i, PREALLOC_NID);
2129         spin_unlock(&nm_i->nid_list_lock);
2130
2131         kmem_cache_free(free_nid_slab, i);
2132 }
2133
2134 /*
2135  * alloc_nid() should be called prior to this function.
2136  */
2137 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
2138 {
2139         struct f2fs_nm_info *nm_i = NM_I(sbi);
2140         struct free_nid *i;
2141         bool need_free = false;
2142
2143         if (!nid)
2144                 return;
2145
2146         spin_lock(&nm_i->nid_list_lock);
2147         i = __lookup_free_nid_list(nm_i, nid);
2148         f2fs_bug_on(sbi, !i);
2149
2150         if (!available_free_memory(sbi, FREE_NIDS)) {
2151                 __remove_free_nid(sbi, i, PREALLOC_NID);
2152                 need_free = true;
2153         } else {
2154                 __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
2155         }
2156
2157         nm_i->available_nids++;
2158
2159         update_free_nid_bitmap(sbi, nid, true, false);
2160
2161         spin_unlock(&nm_i->nid_list_lock);
2162
2163         if (need_free)
2164                 kmem_cache_free(free_nid_slab, i);
2165 }
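/*
 * Illustrative usage sketch (not part of the original source): a caller
 * that needs a fresh node typically follows the pattern below, much like
 * recover_xattr_data() later in this file; the dnode setup is abbreviated.
 *
 *	nid_t nid;
 *	struct page *page;
 *
 *	if (!alloc_nid(sbi, &nid))		// FREE_NID -> PREALLOC_NID
 *		return -ENOSPC;
 *	page = new_node_page(&dn, ofs);		// consume the reserved nid
 *	if (IS_ERR(page)) {
 *		alloc_nid_failed(sbi, nid);	// give the nid back
 *		return PTR_ERR(page);
 *	}
 *	alloc_nid_done(sbi, nid);		// nid is committed for good
 */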
2166
2167 int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
2168 {
2169         struct f2fs_nm_info *nm_i = NM_I(sbi);
2170         struct free_nid *i, *next;
2171         int nr = nr_shrink;
2172
2173         if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2174                 return 0;
2175
2176         if (!mutex_trylock(&nm_i->build_lock))
2177                 return 0;
2178
2179         spin_lock(&nm_i->nid_list_lock);
2180         list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
2181                 if (nr_shrink <= 0 ||
2182                                 nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2183                         break;
2184
2185                 __remove_free_nid(sbi, i, FREE_NID);
2186                 kmem_cache_free(free_nid_slab, i);
2187                 nr_shrink--;
2188         }
2189         spin_unlock(&nm_i->nid_list_lock);
2190         mutex_unlock(&nm_i->build_lock);
2191
2192         return nr - nr_shrink;
2193 }
2194
2195 void recover_inline_xattr(struct inode *inode, struct page *page)
2196 {
2197         void *src_addr, *dst_addr;
2198         size_t inline_size;
2199         struct page *ipage;
2200         struct f2fs_inode *ri;
2201
2202         ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
2203         f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
2204
2205         ri = F2FS_INODE(page);
2206         if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
2207                 clear_inode_flag(inode, FI_INLINE_XATTR);
2208                 goto update_inode;
2209         }
2210
2211         dst_addr = inline_xattr_addr(inode, ipage);
2212         src_addr = inline_xattr_addr(inode, page);
2213         inline_size = inline_xattr_size(inode);
2214
2215         f2fs_wait_on_page_writeback(ipage, NODE, true);
2216         memcpy(dst_addr, src_addr, inline_size);
2217 update_inode:
2218         update_inode(inode, ipage);
2219         f2fs_put_page(ipage, 1);
2220 }
2221
2222 int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
2223 {
2224         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
2225         nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
2226         nid_t new_xnid;
2227         struct dnode_of_data dn;
2228         struct node_info ni;
2229         struct page *xpage;
2230
2231         if (!prev_xnid)
2232                 goto recover_xnid;
2233
2234         /* 1: invalidate the previous xattr nid */
2235         get_node_info(sbi, prev_xnid, &ni);
2236         f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
2237         invalidate_blocks(sbi, ni.blk_addr);
2238         dec_valid_node_count(sbi, inode, false);
2239         set_node_addr(sbi, &ni, NULL_ADDR, false);
2240
2241 recover_xnid:
2242         /* 2: update xattr nid in inode */
2243         if (!alloc_nid(sbi, &new_xnid))
2244                 return -ENOSPC;
2245
2246         set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
2247         xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
2248         if (IS_ERR(xpage)) {
2249                 alloc_nid_failed(sbi, new_xnid);
2250                 return PTR_ERR(xpage);
2251         }
2252
2253         alloc_nid_done(sbi, new_xnid);
2254         update_inode_page(inode);
2255
2256         /* 3: update and set xattr node page dirty */
2257         memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
2258
2259         set_page_dirty(xpage);
2260         f2fs_put_page(xpage, 1);
2261
2262         return 0;
2263 }
2264
2265 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
2266 {
2267         struct f2fs_inode *src, *dst;
2268         nid_t ino = ino_of_node(page);
2269         struct node_info old_ni, new_ni;
2270         struct page *ipage;
2271
2272         get_node_info(sbi, ino, &old_ni);
2273
2274         if (unlikely(old_ni.blk_addr != NULL_ADDR))
2275                 return -EINVAL;
2276 retry:
2277         ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
2278         if (!ipage) {
2279                 congestion_wait(BLK_RW_ASYNC, HZ/50);
2280                 goto retry;
2281         }
2282
2283         /* This ino must not be reused from the free nid list */
2284         remove_free_nid(sbi, ino);
2285
2286         if (!PageUptodate(ipage))
2287                 SetPageUptodate(ipage);
2288         fill_node_footer(ipage, ino, ino, 0, true);
2289
2290         src = F2FS_INODE(page);
2291         dst = F2FS_INODE(ipage);
2292
2293         memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
2294         dst->i_size = 0;
2295         dst->i_blocks = cpu_to_le64(1);
2296         dst->i_links = cpu_to_le32(1);
2297         dst->i_xattr_nid = 0;
2298         dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
2299         if (dst->i_inline & F2FS_EXTRA_ATTR) {
2300                 dst->i_extra_isize = src->i_extra_isize;
2301
2302                 if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
2303                         F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2304                                                         i_inline_xattr_size))
2305                         dst->i_inline_xattr_size = src->i_inline_xattr_size;
2306
2307                 if (f2fs_sb_has_project_quota(sbi->sb) &&
2308                         F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2309                                                                 i_projid))
2310                         dst->i_projid = src->i_projid;
2311         }
2312
2313         new_ni = old_ni;
2314         new_ni.ino = ino;
2315
2316         if (unlikely(inc_valid_node_count(sbi, NULL, true)))
2317                 WARN_ON(1);
2318         set_node_addr(sbi, &new_ni, NEW_ADDR, false);
2319         inc_valid_inode_count(sbi);
2320         set_page_dirty(ipage);
2321         f2fs_put_page(ipage, 1);
2322         return 0;
2323 }
2324
2325 int restore_node_summary(struct f2fs_sb_info *sbi,
2326                         unsigned int segno, struct f2fs_summary_block *sum)
2327 {
2328         struct f2fs_node *rn;
2329         struct f2fs_summary *sum_entry;
2330         block_t addr;
2331         int i, idx, last_offset, nrpages;
2332
2333         /* scan the node segment */
2334         last_offset = sbi->blocks_per_seg;
2335         addr = START_BLOCK(sbi, segno);
2336         sum_entry = &sum->entries[0];
2337
2338         for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
2339                 nrpages = min(last_offset - i, BIO_MAX_PAGES);
2340
2341                 /* readahead node pages */
2342                 ra_meta_pages(sbi, addr, nrpages, META_POR, true);
2343
2344                 for (idx = addr; idx < addr + nrpages; idx++) {
2345                         struct page *page = get_tmp_page(sbi, idx);
2346
2347                         rn = F2FS_NODE(page);
2348                         sum_entry->nid = rn->footer.nid;
2349                         sum_entry->version = 0;
2350                         sum_entry->ofs_in_node = 0;
2351                         sum_entry++;
2352                         f2fs_put_page(page, 1);
2353                 }
2354
2355                 invalidate_mapping_pages(META_MAPPING(sbi), addr,
2356                                                         addr + nrpages);
2357         }
2358         return 0;
2359 }
2360
2361 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
2362 {
2363         struct f2fs_nm_info *nm_i = NM_I(sbi);
2364         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2365         struct f2fs_journal *journal = curseg->journal;
2366         int i;
2367
2368         down_write(&curseg->journal_rwsem);
2369         for (i = 0; i < nats_in_cursum(journal); i++) {
2370                 struct nat_entry *ne;
2371                 struct f2fs_nat_entry raw_ne;
2372                 nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
2373
2374                 raw_ne = nat_in_journal(journal, i);
2375
2376                 ne = __lookup_nat_cache(nm_i, nid);
2377                 if (!ne) {
2378                         ne = __alloc_nat_entry(nid, true);
2379                         __init_nat_entry(nm_i, ne, &raw_ne, true);
2380                 }
2381
2382                 /*
2383                  * if a free nat in the journal has not been used since the
2384                  * last checkpoint, we should remove it from the available
2385                  * nids, since we will add it again later.
2386                  */
2387                 if (!get_nat_flag(ne, IS_DIRTY) &&
2388                                 le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
2389                         spin_lock(&nm_i->nid_list_lock);
2390                         nm_i->available_nids--;
2391                         spin_unlock(&nm_i->nid_list_lock);
2392                 }
2393
2394                 __set_nat_cache_dirty(nm_i, ne);
2395         }
2396         update_nats_in_cursum(journal, -i);
2397         up_write(&curseg->journal_rwsem);
2398 }
2399
2400 static void __adjust_nat_entry_set(struct nat_entry_set *nes,
2401                                                 struct list_head *head, int max)
2402 {
2403         struct nat_entry_set *cur;
2404
2405         if (nes->entry_cnt >= max)
2406                 goto add_out;
2407
2408         list_for_each_entry(cur, head, set_list) {
2409                 if (cur->entry_cnt >= nes->entry_cnt) {
2410                         list_add(&nes->set_list, cur->set_list.prev);
2411                         return;
2412                 }
2413         }
2414 add_out:
2415         list_add_tail(&nes->set_list, head);
2416 }
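/*
 * Note (added for clarity): __adjust_nat_entry_set() keeps the set list
 * sorted by ascending entry_cnt, appending oversized sets at the tail, so
 * that __flush_nat_entry_set() can fit the small sets into the remaining
 * journal space before falling back to NAT pages.
 */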
2417
2418 static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
2419                                                 struct page *page)
2420 {
2421         struct f2fs_nm_info *nm_i = NM_I(sbi);
2422         unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
2423         struct f2fs_nat_block *nat_blk = page_address(page);
2424         int valid = 0;
2425         int i = 0;
2426
2427         if (!enabled_nat_bits(sbi, NULL))
2428                 return;
2429
2430         if (nat_index == 0) {
2431                 valid = 1;
2432                 i = 1;
2433         }
2434         for (; i < NAT_ENTRY_PER_BLOCK; i++) {
2435                 if (nat_blk->entries[i].block_addr != NULL_ADDR)
2436                         valid++;
2437         }
2438         if (valid == 0) {
2439                 __set_bit_le(nat_index, nm_i->empty_nat_bits);
2440                 __clear_bit_le(nat_index, nm_i->full_nat_bits);
2441                 return;
2442         }
2443
2444         __clear_bit_le(nat_index, nm_i->empty_nat_bits);
2445         if (valid == NAT_ENTRY_PER_BLOCK)
2446                 __set_bit_le(nat_index, nm_i->full_nat_bits);
2447         else
2448                 __clear_bit_le(nat_index, nm_i->full_nat_bits);
2449 }
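/*
 * Consumption note (added for clarity, see load_free_nid_bitmap() below):
 * an empty_nat_bits bit lets mount mark every nid of that NAT block as
 * free without reading the NAT page, while a full_nat_bits bit only marks
 * the block as already scanned, since none of its nids can be handed out.
 */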
2450
2451 static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
2452                 struct nat_entry_set *set, struct cp_control *cpc)
2453 {
2454         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2455         struct f2fs_journal *journal = curseg->journal;
2456         nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
2457         bool to_journal = true;
2458         struct f2fs_nat_block *nat_blk;
2459         struct nat_entry *ne, *cur;
2460         struct page *page = NULL;
2461
2462         /*
2463          * there are two steps to flush nat entries:
2464          * #1, flush nat entries to the journal in the current hot data summary block.
2465          * #2, flush nat entries to the nat page.
2466          */
2467         if (enabled_nat_bits(sbi, cpc) ||
2468                 !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
2469                 to_journal = false;
2470
2471         if (to_journal) {
2472                 down_write(&curseg->journal_rwsem);
2473         } else {
2474                 page = get_next_nat_page(sbi, start_nid);
2475                 nat_blk = page_address(page);
2476                 f2fs_bug_on(sbi, !nat_blk);
2477         }
2478
2479         /* flush dirty nats in nat entry set */
2480         list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
2481                 struct f2fs_nat_entry *raw_ne;
2482                 nid_t nid = nat_get_nid(ne);
2483                 int offset;
2484
2485                 f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
2486
2487                 if (to_journal) {
2488                         offset = lookup_journal_in_cursum(journal,
2489                                                         NAT_JOURNAL, nid, 1);
2490                         f2fs_bug_on(sbi, offset < 0);
2491                         raw_ne = &nat_in_journal(journal, offset);
2492                         nid_in_journal(journal, offset) = cpu_to_le32(nid);
2493                 } else {
2494                         raw_ne = &nat_blk->entries[nid - start_nid];
2495                 }
2496                 raw_nat_from_node_info(raw_ne, &ne->ni);
2497                 nat_reset_flag(ne);
2498                 __clear_nat_cache_dirty(NM_I(sbi), set, ne);
2499                 if (nat_get_blkaddr(ne) == NULL_ADDR) {
2500                         add_free_nid(sbi, nid, false);
2501                         spin_lock(&NM_I(sbi)->nid_list_lock);
2502                         NM_I(sbi)->available_nids++;
2503                         update_free_nid_bitmap(sbi, nid, true, false);
2504                         spin_unlock(&NM_I(sbi)->nid_list_lock);
2505                 } else {
2506                         spin_lock(&NM_I(sbi)->nid_list_lock);
2507                         update_free_nid_bitmap(sbi, nid, false, false);
2508                         spin_unlock(&NM_I(sbi)->nid_list_lock);
2509                 }
2510         }
2511
2512         if (to_journal) {
2513                 up_write(&curseg->journal_rwsem);
2514         } else {
2515                 __update_nat_bits(sbi, start_nid, page);
2516                 f2fs_put_page(page, 1);
2517         }
2518
2519         /* Allow dirty nats by node block allocation in write_begin */
2520         if (!set->entry_cnt) {
2521                 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
2522                 kmem_cache_free(nat_entry_set_slab, set);
2523         }
2524 }
2525
2526 /*
2527  * This function is called during the checkpointing process.
2528  */
2529 void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
2530 {
2531         struct f2fs_nm_info *nm_i = NM_I(sbi);
2532         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2533         struct f2fs_journal *journal = curseg->journal;
2534         struct nat_entry_set *setvec[SETVEC_SIZE];
2535         struct nat_entry_set *set, *tmp;
2536         unsigned int found;
2537         nid_t set_idx = 0;
2538         LIST_HEAD(sets);
2539
2540         if (!nm_i->dirty_nat_cnt)
2541                 return;
2542
2543         down_write(&nm_i->nat_tree_lock);
2544
2545         /*
2546          * if there is not enough space in the journal to store dirty nat
2547          * entries, remove all entries from the journal and merge them
2548          * into the nat entry set.
2549          */
2550         if (enabled_nat_bits(sbi, cpc) ||
2551                 !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
2552                 remove_nats_in_journal(sbi);
2553
2554         while ((found = __gang_lookup_nat_set(nm_i,
2555                                         set_idx, SETVEC_SIZE, setvec))) {
2556                 unsigned idx;
2557                 set_idx = setvec[found - 1]->set + 1;
2558                 for (idx = 0; idx < found; idx++)
2559                         __adjust_nat_entry_set(setvec[idx], &sets,
2560                                                 MAX_NAT_JENTRIES(journal));
2561         }
2562
2563         /* flush dirty nats in nat entry set */
2564         list_for_each_entry_safe(set, tmp, &sets, set_list)
2565                 __flush_nat_entry_set(sbi, set, cpc);
2566
2567         up_write(&nm_i->nat_tree_lock);
2568         /* Allow dirty nats by node block allocation in write_begin */
2569 }
2570
2571 static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
2572 {
2573         struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
2574         struct f2fs_nm_info *nm_i = NM_I(sbi);
2575         unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
2576         unsigned int i;
2577         __u64 cp_ver = cur_cp_version(ckpt);
2578         block_t nat_bits_addr;
2579
2580         if (!enabled_nat_bits(sbi, NULL))
2581                 return 0;
2582
2583         nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
2584                                                 F2FS_BLKSIZE - 1);
2585         nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
2586                                                 GFP_KERNEL);
2587         if (!nm_i->nat_bits)
2588                 return -ENOMEM;
2589
2590         nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
2591                                                 nm_i->nat_bits_blocks;
2592         for (i = 0; i < nm_i->nat_bits_blocks; i++) {
2593                 struct page *page = get_meta_page(sbi, nat_bits_addr++);
2594
2595                 memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
2596                                         page_address(page), F2FS_BLKSIZE);
2597                 f2fs_put_page(page, 1);
2598         }
2599
2600         cp_ver |= (cur_cp_crc(ckpt) << 32);
2601         if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
2602                 disable_nat_bits(sbi, true);
2603                 return 0;
2604         }
2605
2606         nm_i->full_nat_bits = nm_i->nat_bits + 8;
2607         nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
2608
2609         f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
2610         return 0;
2611 }
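/*
 * Layout recap (derived from the code above, added for clarity): the
 * nat_bits area read back here is
 *
 *	[ 8 bytes        : cp_ver | (crc << 32)                 ]
 *	[ nat_bits_bytes : full_nat_bits, one bit per NAT block  ]
 *	[ nat_bits_bytes : empty_nat_bits, one bit per NAT block ]
 *
 * rounded up to whole F2FS blocks, which is what nm_i->nat_bits_blocks
 * accounts for.
 */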
2612
2613 static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
2614 {
2615         struct f2fs_nm_info *nm_i = NM_I(sbi);
2616         unsigned int i = 0;
2617         nid_t nid, last_nid;
2618
2619         if (!enabled_nat_bits(sbi, NULL))
2620                 return;
2621
2622         for (i = 0; i < nm_i->nat_blocks; i++) {
2623                 i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
2624                 if (i >= nm_i->nat_blocks)
2625                         break;
2626
2627                 __set_bit_le(i, nm_i->nat_block_bitmap);
2628
2629                 nid = i * NAT_ENTRY_PER_BLOCK;
2630                 last_nid = nid + NAT_ENTRY_PER_BLOCK;
2631
2632                 spin_lock(&NM_I(sbi)->nid_list_lock);
2633                 for (; nid < last_nid; nid++)
2634                         update_free_nid_bitmap(sbi, nid, true, true);
2635                 spin_unlock(&NM_I(sbi)->nid_list_lock);
2636         }
2637
2638         for (i = 0; i < nm_i->nat_blocks; i++) {
2639                 i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
2640                 if (i >= nm_i->nat_blocks)
2641                         break;
2642
2643                 __set_bit_le(i, nm_i->nat_block_bitmap);
2644         }
2645 }
2646
2647 static int init_node_manager(struct f2fs_sb_info *sbi)
2648 {
2649         struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
2650         struct f2fs_nm_info *nm_i = NM_I(sbi);
2651         unsigned char *version_bitmap;
2652         unsigned int nat_segs;
2653         int err;
2654
2655         nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
2656
2657         /* segment_count_nat includes pair segment so divide by 2. */
2658         nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
2659         nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
2660         nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
2661
2662         /* unused nids: 0, node, meta (and root is counted as a valid node) */
2663         nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
2664                                                         F2FS_RESERVED_NODE_NUM;
2665         nm_i->nid_cnt[FREE_NID] = 0;
2666         nm_i->nid_cnt[PREALLOC_NID] = 0;
2667         nm_i->nat_cnt = 0;
2668         nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2669         nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2670         nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2671
2672         INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2673         INIT_LIST_HEAD(&nm_i->free_nid_list);
2674         INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
2675         INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
2676         INIT_LIST_HEAD(&nm_i->nat_entries);
2677
2678         mutex_init(&nm_i->build_lock);
2679         spin_lock_init(&nm_i->nid_list_lock);
2680         init_rwsem(&nm_i->nat_tree_lock);
2681
2682         nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
2683         nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
2684         version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
2685         if (!version_bitmap)
2686                 return -EFAULT;
2687
2688         nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
2689                                         GFP_KERNEL);
2690         if (!nm_i->nat_bitmap)
2691                 return -ENOMEM;
2692
2693         err = __get_nat_bitmaps(sbi);
2694         if (err)
2695                 return err;
2696
2697 #ifdef CONFIG_F2FS_CHECK_FS
2698         nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
2699                                         GFP_KERNEL);
2700         if (!nm_i->nat_bitmap_mir)
2701                 return -ENOMEM;
2702 #endif
2703
2704         return 0;
2705 }
2706
2707 static int init_free_nid_cache(struct f2fs_sb_info *sbi)
2708 {
2709         struct f2fs_nm_info *nm_i = NM_I(sbi);
2710
2711         nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks *
2712                                         NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
2713         if (!nm_i->free_nid_bitmap)
2714                 return -ENOMEM;
2715
2716         nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8,
2717                                                                 GFP_KERNEL);
2718         if (!nm_i->nat_block_bitmap)
2719                 return -ENOMEM;
2720
2721         nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks *
2722                                         sizeof(unsigned short), GFP_KERNEL);
2723         if (!nm_i->free_nid_count)
2724                 return -ENOMEM;
2725         return 0;
2726 }
2727
2728 int build_node_manager(struct f2fs_sb_info *sbi)
2729 {
2730         int err;
2731
2732         sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
2733         if (!sbi->nm_info)
2734                 return -ENOMEM;
2735
2736         err = init_node_manager(sbi);
2737         if (err)
2738                 return err;
2739
2740         err = init_free_nid_cache(sbi);
2741         if (err)
2742                 return err;
2743
2744         /* load free nid status from nat_bits table */
2745         load_free_nid_bitmap(sbi);
2746
2747         build_free_nids(sbi, true, true);
2748         return 0;
2749 }
2750
2751 void destroy_node_manager(struct f2fs_sb_info *sbi)
2752 {
2753         struct f2fs_nm_info *nm_i = NM_I(sbi);
2754         struct free_nid *i, *next_i;
2755         struct nat_entry *natvec[NATVEC_SIZE];
2756         struct nat_entry_set *setvec[SETVEC_SIZE];
2757         nid_t nid = 0;
2758         unsigned int found;
2759
2760         if (!nm_i)
2761                 return;
2762
2763         /* destroy free nid list */
2764         spin_lock(&nm_i->nid_list_lock);
2765         list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2766                 __remove_free_nid(sbi, i, FREE_NID);
2767                 spin_unlock(&nm_i->nid_list_lock);
2768                 kmem_cache_free(free_nid_slab, i);
2769                 spin_lock(&nm_i->nid_list_lock);
2770         }
2771         f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
2772         f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
2773         f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
2774         spin_unlock(&nm_i->nid_list_lock);
2775
2776         /* destroy nat cache */
2777         down_write(&nm_i->nat_tree_lock);
2778         while ((found = __gang_lookup_nat_cache(nm_i,
2779                                         nid, NATVEC_SIZE, natvec))) {
2780                 unsigned idx;
2781
2782                 nid = nat_get_nid(natvec[found - 1]) + 1;
2783                 for (idx = 0; idx < found; idx++)
2784                         __del_from_nat_cache(nm_i, natvec[idx]);
2785         }
2786         f2fs_bug_on(sbi, nm_i->nat_cnt);
2787
2788         /* destroy nat set cache */
2789         nid = 0;
2790         while ((found = __gang_lookup_nat_set(nm_i,
2791                                         nid, SETVEC_SIZE, setvec))) {
2792                 unsigned idx;
2793
2794                 nid = setvec[found - 1]->set + 1;
2795                 for (idx = 0; idx < found; idx++) {
2796                         /* entry_cnt is not zero when a cp_error has occurred */
2797                         f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2798                         radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2799                         kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2800                 }
2801         }
2802         up_write(&nm_i->nat_tree_lock);
2803
2804         kvfree(nm_i->nat_block_bitmap);
2805         kvfree(nm_i->free_nid_bitmap);
2806         kvfree(nm_i->free_nid_count);
2807
2808         kfree(nm_i->nat_bitmap);
2809         kfree(nm_i->nat_bits);
2810 #ifdef CONFIG_F2FS_CHECK_FS
2811         kfree(nm_i->nat_bitmap_mir);
2812 #endif
2813         sbi->nm_info = NULL;
2814         kfree(nm_i);
2815 }
2816
2817 int __init create_node_manager_caches(void)
2818 {
2819         nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
2820                         sizeof(struct nat_entry));
2821         if (!nat_entry_slab)
2822                 goto fail;
2823
2824         free_nid_slab = f2fs_kmem_cache_create("free_nid",
2825                         sizeof(struct free_nid));
2826         if (!free_nid_slab)
2827                 goto destroy_nat_entry;
2828
2829         nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2830                         sizeof(struct nat_entry_set));
2831         if (!nat_entry_set_slab)
2832                 goto destroy_free_nid;
2833         return 0;
2834
2835 destroy_free_nid:
2836         kmem_cache_destroy(free_nid_slab);
2837 destroy_nat_entry:
2838         kmem_cache_destroy(nat_entry_slab);
2839 fail:
2840         return -ENOMEM;
2841 }
2842
2843 void destroy_node_manager_caches(void)
2844 {
2845         kmem_cache_destroy(nat_entry_set_slab);
2846         kmem_cache_destroy(free_nid_slab);
2847         kmem_cache_destroy(nat_entry_slab);
2848 }