f2fs: separate nat entry mem alloc from nat_tree_lock
1 /*
2  * fs/f2fs/node.c
3  *
4  * Copyright (c) 2012 Samsung Electronics Co., Ltd.
5  *             http://www.samsung.com/
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 #include <linux/fs.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/mpage.h>
14 #include <linux/backing-dev.h>
15 #include <linux/blkdev.h>
16 #include <linux/pagevec.h>
17 #include <linux/swap.h>
18
19 #include "f2fs.h"
20 #include "node.h"
21 #include "segment.h"
22 #include "xattr.h"
23 #include "trace.h"
24 #include <trace/events/f2fs.h>
25
26 #define on_build_free_nids(nmi) mutex_is_locked(&(nmi)->build_lock)
27
28 static struct kmem_cache *nat_entry_slab;
29 static struct kmem_cache *free_nid_slab;
30 static struct kmem_cache *nat_entry_set_slab;
31
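/*
 * Check whether the cached objects of the given type still fit within
 * their share of available low memory (see the ratios noted below).
 */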
32 bool available_free_memory(struct f2fs_sb_info *sbi, int type)
33 {
34         struct f2fs_nm_info *nm_i = NM_I(sbi);
35         struct sysinfo val;
36         unsigned long avail_ram;
37         unsigned long mem_size = 0;
38         bool res = false;
39
40         si_meminfo(&val);
41
42         /* only uses low memory */
43         avail_ram = val.totalram - val.totalhigh;
44
45         /*
46          * give 25%, 25%, 50%, 50%, 50% of memory to each component, respectively
47          */
48         if (type == FREE_NIDS) {
49                 mem_size = (nm_i->nid_cnt[FREE_NID] *
50                                 sizeof(struct free_nid)) >> PAGE_SHIFT;
51                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
52         } else if (type == NAT_ENTRIES) {
53                 mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >>
54                                                         PAGE_SHIFT;
55                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
56                 if (excess_cached_nats(sbi))
57                         res = false;
58         } else if (type == DIRTY_DENTS) {
59                 if (sbi->sb->s_bdi->wb.dirty_exceeded)
60                         return false;
61                 mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
62                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
63         } else if (type == INO_ENTRIES) {
64                 int i;
65
66                 for (i = 0; i < MAX_INO_ENTRY; i++)
67                         mem_size += sbi->im[i].ino_num *
68                                                 sizeof(struct ino_entry);
69                 mem_size >>= PAGE_SHIFT;
70                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
71         } else if (type == EXTENT_CACHE) {
72                 mem_size = (atomic_read(&sbi->total_ext_tree) *
73                                 sizeof(struct extent_tree) +
74                                 atomic_read(&sbi->total_ext_node) *
75                                 sizeof(struct extent_node)) >> PAGE_SHIFT;
76                 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
77         } else if (type == INMEM_PAGES) {
78                 /* it allows 20% / total_ram for inmemory pages */
79                 mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
80                 res = mem_size < (val.totalram / 5);
81         } else {
82                 if (!sbi->sb->s_bdi->wb.dirty_exceeded)
83                         return true;
84         }
85         return res;
86 }
87
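/*
 * Clear the dirty tag of a node page in the page cache radix tree and
 * drop the F2FS_DIRTY_NODES count for it.
 */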
88 static void clear_node_page_dirty(struct page *page)
89 {
90         struct address_space *mapping = page->mapping;
91         unsigned long flags;
92
93         if (PageDirty(page)) {
94                 spin_lock_irqsave(&mapping->tree_lock, flags);
95                 radix_tree_tag_clear(&mapping->page_tree,
96                                 page_index(page),
97                                 PAGECACHE_TAG_DIRTY);
98                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
99
100                 clear_page_dirty_for_io(page);
101                 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
102         }
103         ClearPageUptodate(page);
104 }
105
106 static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
107 {
108         pgoff_t index = current_nat_addr(sbi, nid);
109         return get_meta_page(sbi, index);
110 }
111
112 static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
113 {
114         struct page *src_page;
115         struct page *dst_page;
116         pgoff_t src_off;
117         pgoff_t dst_off;
118         void *src_addr;
119         void *dst_addr;
120         struct f2fs_nm_info *nm_i = NM_I(sbi);
121
122         src_off = current_nat_addr(sbi, nid);
123         dst_off = next_nat_addr(sbi, src_off);
124
125         /* get current nat block page with lock */
126         src_page = get_meta_page(sbi, src_off);
127         dst_page = grab_meta_page(sbi, dst_off);
128         f2fs_bug_on(sbi, PageDirty(src_page));
129
130         src_addr = page_address(src_page);
131         dst_addr = page_address(dst_page);
132         memcpy(dst_addr, src_addr, PAGE_SIZE);
133         set_page_dirty(dst_page);
134         f2fs_put_page(src_page, 1);
135
136         set_to_next_nat(nm_i, nid);
137
138         return dst_page;
139 }
140
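/*
 * Allocate a nat entry without holding nat_tree_lock, so that the slab
 * allocation never happens under the rwsem.  With @no_fail the allocation
 * is retried until it succeeds; otherwise it may return NULL.
 */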
141 static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
142 {
143         struct nat_entry *new;
144
145         if (no_fail)
146                 new = f2fs_kmem_cache_alloc(nat_entry_slab,
147                                                 GFP_NOFS | __GFP_ZERO);
148         else
149                 new = kmem_cache_alloc(nat_entry_slab,
150                                                 GFP_NOFS | __GFP_ZERO);
151         if (new) {
152                 nat_set_nid(new, nid);
153                 nat_reset_flag(new);
154         }
155         return new;
156 }
157
158 static void __free_nat_entry(struct nat_entry *e)
159 {
160         kmem_cache_free(nat_entry_slab, e);
161 }
162
163 /* must be locked by nat_tree_lock */
164 static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
165         struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
166 {
167         if (no_fail)
168                 f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
169         else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
170                 return NULL;
171
172         if (raw_ne)
173                 node_info_from_raw_nat(&ne->ni, raw_ne);
174         list_add_tail(&ne->list, &nm_i->nat_entries);
175         nm_i->nat_cnt++;
176         return ne;
177 }
178
179 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
180 {
181         return radix_tree_lookup(&nm_i->nat_root, n);
182 }
183
184 static unsigned int __gang_lookup_nat_cache(struct f2fs_nm_info *nm_i,
185                 nid_t start, unsigned int nr, struct nat_entry **ep)
186 {
187         return radix_tree_gang_lookup(&nm_i->nat_root, (void **)ep, start, nr);
188 }
189
190 static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
191 {
192         list_del(&e->list);
193         radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
194         nm_i->nat_cnt--;
195         __free_nat_entry(e);
196 }
197
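/*
 * Mark a nat entry dirty and attach it to the set of its NAT block so it
 * is flushed at the next checkpoint.  Entries still pointing to NEW_ADDR
 * are taken off the list instead of being moved to the set's entry list.
 */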
198 static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
199                                                 struct nat_entry *ne)
200 {
201         nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
202         struct nat_entry_set *head;
203
204         head = radix_tree_lookup(&nm_i->nat_set_root, set);
205         if (!head) {
206                 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
207
208                 INIT_LIST_HEAD(&head->entry_list);
209                 INIT_LIST_HEAD(&head->set_list);
210                 head->set = set;
211                 head->entry_cnt = 0;
212                 f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
213         }
214
215         if (get_nat_flag(ne, IS_DIRTY))
216                 goto refresh_list;
217
218         nm_i->dirty_nat_cnt++;
219         head->entry_cnt++;
220         set_nat_flag(ne, IS_DIRTY, true);
221 refresh_list:
222         if (nat_get_blkaddr(ne) == NEW_ADDR)
223                 list_del_init(&ne->list);
224         else
225                 list_move_tail(&ne->list, &head->entry_list);
226 }
227
228 static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
229                 struct nat_entry_set *set, struct nat_entry *ne)
230 {
231         list_move_tail(&ne->list, &nm_i->nat_entries);
232         set_nat_flag(ne, IS_DIRTY, false);
233         set->entry_cnt--;
234         nm_i->dirty_nat_cnt--;
235 }
236
237 static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
238                 nid_t start, unsigned int nr, struct nat_entry_set **ep)
239 {
240         return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
241                                                         start, nr);
242 }
243
244 int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
245 {
246         struct f2fs_nm_info *nm_i = NM_I(sbi);
247         struct nat_entry *e;
248         bool need = false;
249
250         down_read(&nm_i->nat_tree_lock);
251         e = __lookup_nat_cache(nm_i, nid);
252         if (e) {
253                 if (!get_nat_flag(e, IS_CHECKPOINTED) &&
254                                 !get_nat_flag(e, HAS_FSYNCED_INODE))
255                         need = true;
256         }
257         up_read(&nm_i->nat_tree_lock);
258         return need;
259 }
260
261 bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
262 {
263         struct f2fs_nm_info *nm_i = NM_I(sbi);
264         struct nat_entry *e;
265         bool is_cp = true;
266
267         down_read(&nm_i->nat_tree_lock);
268         e = __lookup_nat_cache(nm_i, nid);
269         if (e && !get_nat_flag(e, IS_CHECKPOINTED))
270                 is_cp = false;
271         up_read(&nm_i->nat_tree_lock);
272         return is_cp;
273 }
274
275 bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
276 {
277         struct f2fs_nm_info *nm_i = NM_I(sbi);
278         struct nat_entry *e;
279         bool need_update = true;
280
281         down_read(&nm_i->nat_tree_lock);
282         e = __lookup_nat_cache(nm_i, ino);
283         if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
284                         (get_nat_flag(e, IS_CHECKPOINTED) ||
285                          get_nat_flag(e, HAS_FSYNCED_INODE)))
286                 need_update = false;
287         up_read(&nm_i->nat_tree_lock);
288         return need_update;
289 }
290
291 /* takes nat_tree_lock itself; the nat entry is preallocated outside the lock */
292 static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
293                                                 struct f2fs_nat_entry *ne)
294 {
295         struct f2fs_nm_info *nm_i = NM_I(sbi);
296         struct nat_entry *new, *e;
297
298         new = __alloc_nat_entry(nid, false);
299         if (!new)
300                 return;
301
302         down_write(&nm_i->nat_tree_lock);
303         e = __lookup_nat_cache(nm_i, nid);
304         if (!e)
305                 e = __init_nat_entry(nm_i, new, ne, false);
306         else
307                 f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
308                                 nat_get_blkaddr(e) !=
309                                         le32_to_cpu(ne->block_addr) ||
310                                 nat_get_version(e) != ne->version);
311         up_write(&nm_i->nat_tree_lock);
312         if (e != new)
313                 __free_nat_entry(new);
314 }
315
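/*
 * Update the cached NAT entry of @ni->nid to @new_blkaddr and mark it
 * dirty.  The spare entry allocated before taking the lock is freed if
 * the nid was already cached.
 */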
316 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
317                         block_t new_blkaddr, bool fsync_done)
318 {
319         struct f2fs_nm_info *nm_i = NM_I(sbi);
320         struct nat_entry *e;
321         struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
322
323         down_write(&nm_i->nat_tree_lock);
324         e = __lookup_nat_cache(nm_i, ni->nid);
325         if (!e) {
326                 e = __init_nat_entry(nm_i, new, NULL, true);
327                 copy_node_info(&e->ni, ni);
328                 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
329         } else if (new_blkaddr == NEW_ADDR) {
330                 /*
331                  * when a nid is reallocated, the previous nat entry
332                  * can remain in the nat cache, so reinitialize it
333                  * with the new information.
334                  */
335                 copy_node_info(&e->ni, ni);
336                 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
337         }
338         /* let's free early to reduce memory consumption */
339         if (e != new)
340                 __free_nat_entry(new);
341
342         /* sanity check */
343         f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
344         f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
345                         new_blkaddr == NULL_ADDR);
346         f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
347                         new_blkaddr == NEW_ADDR);
348         f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
349                         nat_get_blkaddr(e) != NULL_ADDR &&
350                         new_blkaddr == NEW_ADDR);
351
352         /* increment version no as node is removed */
353         if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
354                 unsigned char version = nat_get_version(e);
355                 nat_set_version(e, inc_node_version(version));
356         }
357
358         /* change address */
359         nat_set_blkaddr(e, new_blkaddr);
360         if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
361                 set_nat_flag(e, IS_CHECKPOINTED, false);
362         __set_nat_cache_dirty(nm_i, e);
363
364         /* update fsync_mark if its inode nat entry is still alive */
365         if (ni->nid != ni->ino)
366                 e = __lookup_nat_cache(nm_i, ni->ino);
367         if (e) {
368                 if (fsync_done && ni->nid == ni->ino)
369                         set_nat_flag(e, HAS_FSYNCED_INODE, true);
370                 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
371         }
372         up_write(&nm_i->nat_tree_lock);
373 }
374
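/*
 * Shrink the clean NAT cache by up to @nr_shrink entries and return the
 * number of entries actually freed.
 */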
375 int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
376 {
377         struct f2fs_nm_info *nm_i = NM_I(sbi);
378         int nr = nr_shrink;
379
380         if (!down_write_trylock(&nm_i->nat_tree_lock))
381                 return 0;
382
383         while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
384                 struct nat_entry *ne;
385                 ne = list_first_entry(&nm_i->nat_entries,
386                                         struct nat_entry, list);
387                 __del_from_nat_cache(nm_i, ne);
388                 nr_shrink--;
389         }
390         up_write(&nm_i->nat_tree_lock);
391         return nr - nr_shrink;
392 }
393
394 /*
395  * This function always returns success
396  */
397 void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
398 {
399         struct f2fs_nm_info *nm_i = NM_I(sbi);
400         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
401         struct f2fs_journal *journal = curseg->journal;
402         nid_t start_nid = START_NID(nid);
403         struct f2fs_nat_block *nat_blk;
404         struct page *page = NULL;
405         struct f2fs_nat_entry ne;
406         struct nat_entry *e;
407         pgoff_t index;
408         int i;
409
410         ni->nid = nid;
411
412         /* Check nat cache */
413         down_read(&nm_i->nat_tree_lock);
414         e = __lookup_nat_cache(nm_i, nid);
415         if (e) {
416                 ni->ino = nat_get_ino(e);
417                 ni->blk_addr = nat_get_blkaddr(e);
418                 ni->version = nat_get_version(e);
419                 up_read(&nm_i->nat_tree_lock);
420                 return;
421         }
422
423         memset(&ne, 0, sizeof(struct f2fs_nat_entry));
424
425         /* Check current segment summary */
426         down_read(&curseg->journal_rwsem);
427         i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0);
428         if (i >= 0) {
429                 ne = nat_in_journal(journal, i);
430                 node_info_from_raw_nat(ni, &ne);
431         }
432         up_read(&curseg->journal_rwsem);
433         if (i >= 0) {
434                 up_read(&nm_i->nat_tree_lock);
435                 goto cache;
436         }
437
438         /* Fill node_info from nat page */
439         index = current_nat_addr(sbi, nid);
440         up_read(&nm_i->nat_tree_lock);
441
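        /*
         * nat_tree_lock has been dropped above: the NAT block read below may
         * sleep, and cache_nat_entry() re-takes the lock when caching the
         * result.
         */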
442         page = get_meta_page(sbi, index);
443         nat_blk = (struct f2fs_nat_block *)page_address(page);
444         ne = nat_blk->entries[nid - start_nid];
445         node_info_from_raw_nat(ni, &ne);
446         f2fs_put_page(page, 1);
447 cache:
448         /* cache nat entry */
449         cache_nat_entry(sbi, nid, &ne);
450 }
451
452 /*
453  * readahead up to @n node pages referenced by @parent, bounded by NIDS_PER_BLOCK.
454  */
455 static void ra_node_pages(struct page *parent, int start, int n)
456 {
457         struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
458         struct blk_plug plug;
459         int i, end;
460         nid_t nid;
461
462         blk_start_plug(&plug);
463
464         /* try readahead for the siblings of the desired node */
465         end = start + n;
466         end = min(end, NIDS_PER_BLOCK);
467         for (i = start; i < end; i++) {
468                 nid = get_nid(parent, i, false);
469                 ra_node_page(sbi, nid);
470         }
471
472         blk_finish_plug(&plug);
473 }
474
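/*
 * Return the file offset right after the range covered by the node at
 * dn->cur_level, so callers can skip the hole left by a missing node.
 */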
475 pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs)
476 {
477         const long direct_index = ADDRS_PER_INODE(dn->inode);
478         const long direct_blks = ADDRS_PER_BLOCK;
479         const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
480         unsigned int skipped_unit = ADDRS_PER_BLOCK;
481         int cur_level = dn->cur_level;
482         int max_level = dn->max_level;
483         pgoff_t base = 0;
484
485         if (!dn->max_level)
486                 return pgofs + 1;
487
488         while (max_level-- > cur_level)
489                 skipped_unit *= NIDS_PER_BLOCK;
490
491         switch (dn->max_level) {
492         case 3:
493                 base += 2 * indirect_blks;
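                /* fall through */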
494         case 2:
495                 base += 2 * direct_blks;
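                /* fall through */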
496         case 1:
497                 base += direct_index;
498                 break;
499         default:
500                 f2fs_bug_on(F2FS_I_SB(dn->inode), 1);
501         }
502
503         return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base;
504 }
505
506 /*
507  * The maximum depth is four.
508  * Offset[0] will have raw inode offset.
509  */
510 static int get_node_path(struct inode *inode, long block,
511                                 int offset[4], unsigned int noffset[4])
512 {
513         const long direct_index = ADDRS_PER_INODE(inode);
514         const long direct_blks = ADDRS_PER_BLOCK;
515         const long dptrs_per_blk = NIDS_PER_BLOCK;
516         const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK;
517         const long dindirect_blks = indirect_blks * NIDS_PER_BLOCK;
518         int n = 0;
519         int level = 0;
520
521         noffset[0] = 0;
522
523         if (block < direct_index) {
524                 offset[n] = block;
525                 goto got;
526         }
527         block -= direct_index;
528         if (block < direct_blks) {
529                 offset[n++] = NODE_DIR1_BLOCK;
530                 noffset[n] = 1;
531                 offset[n] = block;
532                 level = 1;
533                 goto got;
534         }
535         block -= direct_blks;
536         if (block < direct_blks) {
537                 offset[n++] = NODE_DIR2_BLOCK;
538                 noffset[n] = 2;
539                 offset[n] = block;
540                 level = 1;
541                 goto got;
542         }
543         block -= direct_blks;
544         if (block < indirect_blks) {
545                 offset[n++] = NODE_IND1_BLOCK;
546                 noffset[n] = 3;
547                 offset[n++] = block / direct_blks;
548                 noffset[n] = 4 + offset[n - 1];
549                 offset[n] = block % direct_blks;
550                 level = 2;
551                 goto got;
552         }
553         block -= indirect_blks;
554         if (block < indirect_blks) {
555                 offset[n++] = NODE_IND2_BLOCK;
556                 noffset[n] = 4 + dptrs_per_blk;
557                 offset[n++] = block / direct_blks;
558                 noffset[n] = 5 + dptrs_per_blk + offset[n - 1];
559                 offset[n] = block % direct_blks;
560                 level = 2;
561                 goto got;
562         }
563         block -= indirect_blks;
564         if (block < dindirect_blks) {
565                 offset[n++] = NODE_DIND_BLOCK;
566                 noffset[n] = 5 + (dptrs_per_blk * 2);
567                 offset[n++] = block / indirect_blks;
568                 noffset[n] = 6 + (dptrs_per_blk * 2) +
569                               offset[n - 1] * (dptrs_per_blk + 1);
570                 offset[n++] = (block / direct_blks) % dptrs_per_blk;
571                 noffset[n] = 7 + (dptrs_per_blk * 2) +
572                               offset[n - 2] * (dptrs_per_blk + 1) +
573                               offset[n - 1];
574                 offset[n] = block % direct_blks;
575                 level = 3;
576                 goto got;
577         } else {
578                 return -E2BIG;
579         }
580 got:
581         return level;
582 }
583
584 /*
585  * Caller should call f2fs_put_dnode(dn).
586  * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
587  * f2fs_unlock_op() only if mode is set to ALLOC_NODE.
588  * In the lookup-only modes, that locking is not needed.
589  */
590 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
591 {
592         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
593         struct page *npage[4];
594         struct page *parent = NULL;
595         int offset[4];
596         unsigned int noffset[4];
597         nid_t nids[4];
598         int level, i = 0;
599         int err = 0;
600
601         level = get_node_path(dn->inode, index, offset, noffset);
602         if (level < 0)
603                 return level;
604
605         nids[0] = dn->inode->i_ino;
606         npage[0] = dn->inode_page;
607
608         if (!npage[0]) {
609                 npage[0] = get_node_page(sbi, nids[0]);
610                 if (IS_ERR(npage[0]))
611                         return PTR_ERR(npage[0]);
612         }
613
614         /* if inline_data is set, should not report any block indices */
615         if (f2fs_has_inline_data(dn->inode) && index) {
616                 err = -ENOENT;
617                 f2fs_put_page(npage[0], 1);
618                 goto release_out;
619         }
620
621         parent = npage[0];
622         if (level != 0)
623                 nids[1] = get_nid(parent, offset[0], true);
624         dn->inode_page = npage[0];
625         dn->inode_page_locked = true;
626
627         /* get indirect or direct nodes */
628         for (i = 1; i <= level; i++) {
629                 bool done = false;
630
631                 if (!nids[i] && mode == ALLOC_NODE) {
632                         /* alloc new node */
633                         if (!alloc_nid(sbi, &(nids[i]))) {
634                                 err = -ENOSPC;
635                                 goto release_pages;
636                         }
637
638                         dn->nid = nids[i];
639                         npage[i] = new_node_page(dn, noffset[i]);
640                         if (IS_ERR(npage[i])) {
641                                 alloc_nid_failed(sbi, nids[i]);
642                                 err = PTR_ERR(npage[i]);
643                                 goto release_pages;
644                         }
645
646                         set_nid(parent, offset[i - 1], nids[i], i == 1);
647                         alloc_nid_done(sbi, nids[i]);
648                         done = true;
649                 } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) {
650                         npage[i] = get_node_page_ra(parent, offset[i - 1]);
651                         if (IS_ERR(npage[i])) {
652                                 err = PTR_ERR(npage[i]);
653                                 goto release_pages;
654                         }
655                         done = true;
656                 }
657                 if (i == 1) {
658                         dn->inode_page_locked = false;
659                         unlock_page(parent);
660                 } else {
661                         f2fs_put_page(parent, 1);
662                 }
663
664                 if (!done) {
665                         npage[i] = get_node_page(sbi, nids[i]);
666                         if (IS_ERR(npage[i])) {
667                                 err = PTR_ERR(npage[i]);
668                                 f2fs_put_page(npage[0], 0);
669                                 goto release_out;
670                         }
671                 }
672                 if (i < level) {
673                         parent = npage[i];
674                         nids[i + 1] = get_nid(parent, offset[i], false);
675                 }
676         }
677         dn->nid = nids[level];
678         dn->ofs_in_node = offset[level];
679         dn->node_page = npage[level];
680         dn->data_blkaddr = datablock_addr(dn->inode,
681                                 dn->node_page, dn->ofs_in_node);
682         return 0;
683
684 release_pages:
685         f2fs_put_page(parent, 1);
686         if (i > 1)
687                 f2fs_put_page(npage[0], 0);
688 release_out:
689         dn->inode_page = NULL;
690         dn->node_page = NULL;
691         if (err == -ENOENT) {
692                 dn->cur_level = i;
693                 dn->max_level = level;
694                 dn->ofs_in_node = offset[level];
695         }
696         return err;
697 }
698
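/*
 * Free the on-disk block of dn->node_page, reset its NAT entry to
 * NULL_ADDR and drop the page from the node mapping.
 */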
699 static void truncate_node(struct dnode_of_data *dn)
700 {
701         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
702         struct node_info ni;
703
704         get_node_info(sbi, dn->nid, &ni);
705         f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
706
707         /* Deallocate node address */
708         invalidate_blocks(sbi, ni.blk_addr);
709         dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino);
710         set_node_addr(sbi, &ni, NULL_ADDR, false);
711
712         if (dn->nid == dn->inode->i_ino) {
713                 remove_orphan_inode(sbi, dn->nid);
714                 dec_valid_inode_count(sbi);
715                 f2fs_inode_synced(dn->inode);
716         }
717
718         clear_node_page_dirty(dn->node_page);
719         set_sbi_flag(sbi, SBI_IS_DIRTY);
720
721         f2fs_put_page(dn->node_page, 1);
722
723         invalidate_mapping_pages(NODE_MAPPING(sbi),
724                         dn->node_page->index, dn->node_page->index);
725
726         dn->node_page = NULL;
727         trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
728 }
729
730 static int truncate_dnode(struct dnode_of_data *dn)
731 {
732         struct page *page;
733
734         if (dn->nid == 0)
735                 return 1;
736
737         /* get direct node */
738         page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
739         if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
740                 return 1;
741         else if (IS_ERR(page))
742                 return PTR_ERR(page);
743
744         /* Make dnode_of_data for parameter */
745         dn->node_page = page;
746         dn->ofs_in_node = 0;
747         truncate_data_blocks(dn);
748         truncate_node(dn);
749         return 1;
750 }
751
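/*
 * Recursively free the child nodes referenced by an indirect node and,
 * when the whole block is covered (ofs == 0), the node itself.  Returns
 * the number of freed nodes or a negative error.
 */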
752 static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
753                                                 int ofs, int depth)
754 {
755         struct dnode_of_data rdn = *dn;
756         struct page *page;
757         struct f2fs_node *rn;
758         nid_t child_nid;
759         unsigned int child_nofs;
760         int freed = 0;
761         int i, ret;
762
763         if (dn->nid == 0)
764                 return NIDS_PER_BLOCK + 1;
765
766         trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
767
768         page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
769         if (IS_ERR(page)) {
770                 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
771                 return PTR_ERR(page);
772         }
773
774         ra_node_pages(page, ofs, NIDS_PER_BLOCK);
775
776         rn = F2FS_NODE(page);
777         if (depth < 3) {
778                 for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) {
779                         child_nid = le32_to_cpu(rn->in.nid[i]);
780                         if (child_nid == 0)
781                                 continue;
782                         rdn.nid = child_nid;
783                         ret = truncate_dnode(&rdn);
784                         if (ret < 0)
785                                 goto out_err;
786                         if (set_nid(page, i, 0, false))
787                                 dn->node_changed = true;
788                 }
789         } else {
790                 child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
791                 for (i = ofs; i < NIDS_PER_BLOCK; i++) {
792                         child_nid = le32_to_cpu(rn->in.nid[i]);
793                         if (child_nid == 0) {
794                                 child_nofs += NIDS_PER_BLOCK + 1;
795                                 continue;
796                         }
797                         rdn.nid = child_nid;
798                         ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
799                         if (ret == (NIDS_PER_BLOCK + 1)) {
800                                 if (set_nid(page, i, 0, false))
801                                         dn->node_changed = true;
802                                 child_nofs += ret;
803                         } else if (ret < 0 && ret != -ENOENT) {
804                                 goto out_err;
805                         }
806                 }
807                 freed = child_nofs;
808         }
809
810         if (!ofs) {
811                 /* remove current indirect node */
812                 dn->node_page = page;
813                 truncate_node(dn);
814                 freed++;
815         } else {
816                 f2fs_put_page(page, 1);
817         }
818         trace_f2fs_truncate_nodes_exit(dn->inode, freed);
819         return freed;
820
821 out_err:
822         f2fs_put_page(page, 1);
823         trace_f2fs_truncate_nodes_exit(dn->inode, ret);
824         return ret;
825 }
826
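/*
 * Free the direct nodes hanging off a partially truncated (double)
 * indirect path, and release the indirect node itself when truncation
 * starts at its first slot.
 */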
827 static int truncate_partial_nodes(struct dnode_of_data *dn,
828                         struct f2fs_inode *ri, int *offset, int depth)
829 {
830         struct page *pages[2];
831         nid_t nid[3];
832         nid_t child_nid;
833         int err = 0;
834         int i;
835         int idx = depth - 2;
836
837         nid[0] = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
838         if (!nid[0])
839                 return 0;
840
841         /* get indirect nodes in the path */
842         for (i = 0; i < idx + 1; i++) {
843                 /* the reference count will be increased */
844                 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
845                 if (IS_ERR(pages[i])) {
846                         err = PTR_ERR(pages[i]);
847                         idx = i - 1;
848                         goto fail;
849                 }
850                 nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
851         }
852
853         ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK);
854
855         /* free direct nodes linked to a partial indirect node */
856         for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
857                 child_nid = get_nid(pages[idx], i, false);
858                 if (!child_nid)
859                         continue;
860                 dn->nid = child_nid;
861                 err = truncate_dnode(dn);
862                 if (err < 0)
863                         goto fail;
864                 if (set_nid(pages[idx], i, 0, false))
865                         dn->node_changed = true;
866         }
867
868         if (offset[idx + 1] == 0) {
869                 dn->node_page = pages[idx];
870                 dn->nid = nid[idx];
871                 truncate_node(dn);
872         } else {
873                 f2fs_put_page(pages[idx], 1);
874         }
875         offset[idx]++;
876         offset[idx + 1] = 0;
877         idx--;
878 fail:
879         for (i = idx; i >= 0; i--)
880                 f2fs_put_page(pages[i], 1);
881
882         trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
883
884         return err;
885 }
886
887 /*
888  * All the block addresses of data and nodes should be nullified.
889  */
890 int truncate_inode_blocks(struct inode *inode, pgoff_t from)
891 {
892         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
893         int err = 0, cont = 1;
894         int level, offset[4], noffset[4];
895         unsigned int nofs = 0;
896         struct f2fs_inode *ri;
897         struct dnode_of_data dn;
898         struct page *page;
899
900         trace_f2fs_truncate_inode_blocks_enter(inode, from);
901
902         level = get_node_path(inode, from, offset, noffset);
903         if (level < 0)
904                 return level;
905
906         page = get_node_page(sbi, inode->i_ino);
907         if (IS_ERR(page)) {
908                 trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page));
909                 return PTR_ERR(page);
910         }
911
912         set_new_dnode(&dn, inode, page, NULL, 0);
913         unlock_page(page);
914
915         ri = F2FS_INODE(page);
916         switch (level) {
917         case 0:
918         case 1:
919                 nofs = noffset[1];
920                 break;
921         case 2:
922                 nofs = noffset[1];
923                 if (!offset[level - 1])
924                         goto skip_partial;
925                 err = truncate_partial_nodes(&dn, ri, offset, level);
926                 if (err < 0 && err != -ENOENT)
927                         goto fail;
928                 nofs += 1 + NIDS_PER_BLOCK;
929                 break;
930         case 3:
931                 nofs = 5 + 2 * NIDS_PER_BLOCK;
932                 if (!offset[level - 1])
933                         goto skip_partial;
934                 err = truncate_partial_nodes(&dn, ri, offset, level);
935                 if (err < 0 && err != -ENOENT)
936                         goto fail;
937                 break;
938         default:
939                 BUG();
940         }
941
942 skip_partial:
943         while (cont) {
944                 dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
945                 switch (offset[0]) {
946                 case NODE_DIR1_BLOCK:
947                 case NODE_DIR2_BLOCK:
948                         err = truncate_dnode(&dn);
949                         break;
950
951                 case NODE_IND1_BLOCK:
952                 case NODE_IND2_BLOCK:
953                         err = truncate_nodes(&dn, nofs, offset[1], 2);
954                         break;
955
956                 case NODE_DIND_BLOCK:
957                         err = truncate_nodes(&dn, nofs, offset[1], 3);
958                         cont = 0;
959                         break;
960
961                 default:
962                         BUG();
963                 }
964                 if (err < 0 && err != -ENOENT)
965                         goto fail;
966                 if (offset[1] == 0 &&
967                                 ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
968                         lock_page(page);
969                         BUG_ON(page->mapping != NODE_MAPPING(sbi));
970                         f2fs_wait_on_page_writeback(page, NODE, true);
971                         ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
972                         set_page_dirty(page);
973                         unlock_page(page);
974                 }
975                 offset[1] = 0;
976                 offset[0]++;
977                 nofs += err;
978         }
979 fail:
980         f2fs_put_page(page, 0);
981         trace_f2fs_truncate_inode_blocks_exit(inode, err);
982         return err > 0 ? 0 : err;
983 }
984
985 /* caller must lock inode page */
986 int truncate_xattr_node(struct inode *inode)
987 {
988         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
989         nid_t nid = F2FS_I(inode)->i_xattr_nid;
990         struct dnode_of_data dn;
991         struct page *npage;
992
993         if (!nid)
994                 return 0;
995
996         npage = get_node_page(sbi, nid);
997         if (IS_ERR(npage))
998                 return PTR_ERR(npage);
999
1000         f2fs_i_xnid_write(inode, 0);
1001
1002         set_new_dnode(&dn, inode, NULL, npage, nid);
1003         truncate_node(&dn);
1004         return 0;
1005 }
1006
1007 /*
1008  * Caller should grab and release a rwsem by calling f2fs_lock_op() and
1009  * f2fs_unlock_op().
1010  */
1011 int remove_inode_page(struct inode *inode)
1012 {
1013         struct dnode_of_data dn;
1014         int err;
1015
1016         set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1017         err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
1018         if (err)
1019                 return err;
1020
1021         err = truncate_xattr_node(inode);
1022         if (err) {
1023                 f2fs_put_dnode(&dn);
1024                 return err;
1025         }
1026
1027         /* remove potential inline_data blocks */
1028         if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1029                                 S_ISLNK(inode->i_mode))
1030                 truncate_data_blocks_range(&dn, 1);
1031
1032         /* 0 is possible, after f2fs_new_inode() has failed */
1033         f2fs_bug_on(F2FS_I_SB(inode),
1034                         inode->i_blocks != 0 && inode->i_blocks != 8);
1035
1036         /* will put inode & node pages */
1037         truncate_node(&dn);
1038         return 0;
1039 }
1040
1041 struct page *new_inode_page(struct inode *inode)
1042 {
1043         struct dnode_of_data dn;
1044
1045         /* allocate inode page for new inode */
1046         set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
1047
1048         /* caller should f2fs_put_page(page, 1); */
1049         return new_node_page(&dn, 0);
1050 }
1051
1052 struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs)
1053 {
1054         struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1055         struct node_info new_ni;
1056         struct page *page;
1057         int err;
1058
1059         if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
1060                 return ERR_PTR(-EPERM);
1061
1062         page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false);
1063         if (!page)
1064                 return ERR_PTR(-ENOMEM);
1065
1066         if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs))))
1067                 goto fail;
1068
1069 #ifdef CONFIG_F2FS_CHECK_FS
1070         get_node_info(sbi, dn->nid, &new_ni);
1071         f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
1072 #endif
1073         new_ni.nid = dn->nid;
1074         new_ni.ino = dn->inode->i_ino;
1075         new_ni.blk_addr = NULL_ADDR;
1076         new_ni.flag = 0;
1077         new_ni.version = 0;
1078         set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1079
1080         f2fs_wait_on_page_writeback(page, NODE, true);
1081         fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
1082         set_cold_node(dn->inode, page);
1083         if (!PageUptodate(page))
1084                 SetPageUptodate(page);
1085         if (set_page_dirty(page))
1086                 dn->node_changed = true;
1087
1088         if (f2fs_has_xattr_block(ofs))
1089                 f2fs_i_xnid_write(dn->inode, dn->nid);
1090
1091         if (ofs == 0)
1092                 inc_valid_inode_count(sbi);
1093         return page;
1094
1095 fail:
1096         clear_node_page_dirty(page);
1097         f2fs_put_page(page, 1);
1098         return ERR_PTR(err);
1099 }
1100
1101 /*
1102  * Caller should act on the return value as follows:
1103  * 0: f2fs_put_page(page, 0)
1104  * LOCKED_PAGE or error: f2fs_put_page(page, 1)
1105  */
1106 static int read_node_page(struct page *page, int op_flags)
1107 {
1108         struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1109         struct node_info ni;
1110         struct f2fs_io_info fio = {
1111                 .sbi = sbi,
1112                 .type = NODE,
1113                 .op = REQ_OP_READ,
1114                 .op_flags = op_flags,
1115                 .page = page,
1116                 .encrypted_page = NULL,
1117         };
1118
1119         if (PageUptodate(page))
1120                 return LOCKED_PAGE;
1121
1122         get_node_info(sbi, page->index, &ni);
1123
1124         if (unlikely(ni.blk_addr == NULL_ADDR)) {
1125                 ClearPageUptodate(page);
1126                 return -ENOENT;
1127         }
1128
1129         fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr;
1130         return f2fs_submit_page_bio(&fio);
1131 }
1132
1133 /*
1134  * Readahead a node page
1135  */
1136 void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
1137 {
1138         struct page *apage;
1139         int err;
1140
1141         if (!nid)
1142                 return;
1143         f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1144
1145         rcu_read_lock();
1146         apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid);
1147         rcu_read_unlock();
1148         if (apage)
1149                 return;
1150
1151         apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1152         if (!apage)
1153                 return;
1154
1155         err = read_node_page(apage, REQ_RAHEAD);
1156         f2fs_put_page(apage, err ? 1 : 0);
1157 }
1158
1159 static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
1160                                         struct page *parent, int start)
1161 {
1162         struct page *page;
1163         int err;
1164
1165         if (!nid)
1166                 return ERR_PTR(-ENOENT);
1167         f2fs_bug_on(sbi, check_nid_range(sbi, nid));
1168 repeat:
1169         page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false);
1170         if (!page)
1171                 return ERR_PTR(-ENOMEM);
1172
1173         err = read_node_page(page, 0);
1174         if (err < 0) {
1175                 f2fs_put_page(page, 1);
1176                 return ERR_PTR(err);
1177         } else if (err == LOCKED_PAGE) {
1178                 err = 0;
1179                 goto page_hit;
1180         }
1181
1182         if (parent)
1183                 ra_node_pages(parent, start + 1, MAX_RA_NODE);
1184
1185         lock_page(page);
1186
1187         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1188                 f2fs_put_page(page, 1);
1189                 goto repeat;
1190         }
1191
1192         if (unlikely(!PageUptodate(page))) {
1193                 err = -EIO;
1194                 goto out_err;
1195         }
1196
1197         if (!f2fs_inode_chksum_verify(sbi, page)) {
1198                 err = -EBADMSG;
1199                 goto out_err;
1200         }
1201 page_hit:
1202         if (unlikely(nid != nid_of_node(page))) {
1203                 f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, "
1204                         "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]",
1205                         nid, nid_of_node(page), ino_of_node(page),
1206                         ofs_of_node(page), cpver_of_node(page),
1207                         next_blkaddr_of_node(page));
1208                 err = -EINVAL;
1209 out_err:
1210                 ClearPageUptodate(page);
1211                 f2fs_put_page(page, 1);
1212                 return ERR_PTR(err);
1213         }
1214         return page;
1215 }
1216
1217 struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
1218 {
1219         return __get_node_page(sbi, nid, NULL, 0);
1220 }
1221
1222 struct page *get_node_page_ra(struct page *parent, int start)
1223 {
1224         struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
1225         nid_t nid = get_nid(parent, start, false);
1226
1227         return __get_node_page(sbi, nid, parent, start);
1228 }
1229
1230 static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
1231 {
1232         struct inode *inode;
1233         struct page *page;
1234         int ret;
1235
1236         /* should flush inline_data before evict_inode */
1237         inode = ilookup(sbi->sb, ino);
1238         if (!inode)
1239                 return;
1240
1241         page = f2fs_pagecache_get_page(inode->i_mapping, 0,
1242                                         FGP_LOCK|FGP_NOWAIT, 0);
1243         if (!page)
1244                 goto iput_out;
1245
1246         if (!PageUptodate(page))
1247                 goto page_out;
1248
1249         if (!PageDirty(page))
1250                 goto page_out;
1251
1252         if (!clear_page_dirty_for_io(page))
1253                 goto page_out;
1254
1255         ret = f2fs_write_inline_data(inode, page);
1256         inode_dec_dirty_pages(inode);
1257         remove_dirty_inode(inode);
1258         if (ret)
1259                 set_page_dirty(page);
1260 page_out:
1261         f2fs_put_page(page, 1);
1262 iput_out:
1263         iput(inode);
1264 }
1265
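/*
 * Scan the dirty node pages of @ino and return the last dnode page with
 * an extra reference held, so the fsync path knows where to set the
 * fsync mark.
 */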
1266 static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
1267 {
1268         pgoff_t index, end;
1269         struct pagevec pvec;
1270         struct page *last_page = NULL;
1271
1272         pagevec_init(&pvec, 0);
1273         index = 0;
1274         end = ULONG_MAX;
1275
1276         while (index <= end) {
1277                 int i, nr_pages;
1278                 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1279                                 PAGECACHE_TAG_DIRTY,
1280                                 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1281                 if (nr_pages == 0)
1282                         break;
1283
1284                 for (i = 0; i < nr_pages; i++) {
1285                         struct page *page = pvec.pages[i];
1286
1287                         if (unlikely(f2fs_cp_error(sbi))) {
1288                                 f2fs_put_page(last_page, 0);
1289                                 pagevec_release(&pvec);
1290                                 return ERR_PTR(-EIO);
1291                         }
1292
1293                         if (!IS_DNODE(page) || !is_cold_node(page))
1294                                 continue;
1295                         if (ino_of_node(page) != ino)
1296                                 continue;
1297
1298                         lock_page(page);
1299
1300                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1301 continue_unlock:
1302                                 unlock_page(page);
1303                                 continue;
1304                         }
1305                         if (ino_of_node(page) != ino)
1306                                 goto continue_unlock;
1307
1308                         if (!PageDirty(page)) {
1309                                 /* someone wrote it for us */
1310                                 goto continue_unlock;
1311                         }
1312
1313                         if (last_page)
1314                                 f2fs_put_page(last_page, 0);
1315
1316                         get_page(page);
1317                         last_page = page;
1318                         unlock_page(page);
1319                 }
1320                 pagevec_release(&pvec);
1321                 cond_resched();
1322         }
1323         return last_page;
1324 }
1325
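/*
 * Write back one dirty node page: look up its current block address, let
 * write_node_page() allocate a new one and record the new address in the
 * NAT cache.
 */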
1326 static int __write_node_page(struct page *page, bool atomic, bool *submitted,
1327                                 struct writeback_control *wbc, bool do_balance,
1328                                 enum iostat_type io_type)
1329 {
1330         struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1331         nid_t nid;
1332         struct node_info ni;
1333         struct f2fs_io_info fio = {
1334                 .sbi = sbi,
1335                 .ino = ino_of_node(page),
1336                 .type = NODE,
1337                 .op = REQ_OP_WRITE,
1338                 .op_flags = wbc_to_write_flags(wbc),
1339                 .page = page,
1340                 .encrypted_page = NULL,
1341                 .submitted = false,
1342                 .io_type = io_type,
1343         };
1344
1345         trace_f2fs_writepage(page, NODE);
1346
1347         if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1348                 goto redirty_out;
1349         if (unlikely(f2fs_cp_error(sbi)))
1350                 goto redirty_out;
1351
1352         /* get old block addr of this node page */
1353         nid = nid_of_node(page);
1354         f2fs_bug_on(sbi, page->index != nid);
1355
1356         if (wbc->for_reclaim) {
1357                 if (!down_read_trylock(&sbi->node_write))
1358                         goto redirty_out;
1359         } else {
1360                 down_read(&sbi->node_write);
1361         }
1362
1363         get_node_info(sbi, nid, &ni);
1364
1365         /* This page is already truncated */
1366         if (unlikely(ni.blk_addr == NULL_ADDR)) {
1367                 ClearPageUptodate(page);
1368                 dec_page_count(sbi, F2FS_DIRTY_NODES);
1369                 up_read(&sbi->node_write);
1370                 unlock_page(page);
1371                 return 0;
1372         }
1373
1374         if (atomic && !test_opt(sbi, NOBARRIER))
1375                 fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
1376
1377         set_page_writeback(page);
1378         fio.old_blkaddr = ni.blk_addr;
1379         write_node_page(nid, &fio);
1380         set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
1381         dec_page_count(sbi, F2FS_DIRTY_NODES);
1382         up_read(&sbi->node_write);
1383
1384         if (wbc->for_reclaim) {
1385                 f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0,
1386                                                 page->index, NODE);
1387                 submitted = NULL;
1388         }
1389
1390         unlock_page(page);
1391
1392         if (unlikely(f2fs_cp_error(sbi))) {
1393                 f2fs_submit_merged_write(sbi, NODE);
1394                 submitted = NULL;
1395         }
1396         if (submitted)
1397                 *submitted = fio.submitted;
1398
1399         if (do_balance)
1400                 f2fs_balance_fs(sbi, false);
1401         return 0;
1402
1403 redirty_out:
1404         redirty_page_for_writepage(wbc, page);
1405         return AOP_WRITEPAGE_ACTIVATE;
1406 }
1407
1408 void move_node_page(struct page *node_page, int gc_type)
1409 {
1410         if (gc_type == FG_GC) {
1411                 struct writeback_control wbc = {
1412                         .sync_mode = WB_SYNC_ALL,
1413                         .nr_to_write = 1,
1414                         .for_reclaim = 0,
1415                 };
1416
1417                 set_page_dirty(node_page);
1418                 f2fs_wait_on_page_writeback(node_page, NODE, true);
1419
1420                 f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
1421                 if (!clear_page_dirty_for_io(node_page))
1422                         goto out_page;
1423
1424                 if (__write_node_page(node_page, false, NULL,
1425                                         &wbc, false, FS_GC_NODE_IO))
1426                         unlock_page(node_page);
1427                 goto release_page;
1428         } else {
1429                 /* set page dirty and write it */
1430                 if (!PageWriteback(node_page))
1431                         set_page_dirty(node_page);
1432         }
1433 out_page:
1434         unlock_page(node_page);
1435 release_page:
1436         f2fs_put_page(node_page, 0);
1437 }
1438
1439 static int f2fs_write_node_page(struct page *page,
1440                                 struct writeback_control *wbc)
1441 {
1442         return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO);
1443 }
1444
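/*
 * Write back the dirty dnodes of @inode for fsync.  In atomic mode only
 * the last dnode page carries the fsync mark, and the pass is retried
 * until that page has actually been submitted.
 */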
1445 int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode,
1446                         struct writeback_control *wbc, bool atomic)
1447 {
1448         pgoff_t index, end;
1449         pgoff_t last_idx = ULONG_MAX;
1450         struct pagevec pvec;
1451         int ret = 0;
1452         struct page *last_page = NULL;
1453         bool marked = false;
1454         nid_t ino = inode->i_ino;
1455
1456         if (atomic) {
1457                 last_page = last_fsync_dnode(sbi, ino);
1458                 if (IS_ERR_OR_NULL(last_page))
1459                         return PTR_ERR_OR_ZERO(last_page);
1460         }
1461 retry:
1462         pagevec_init(&pvec, 0);
1463         index = 0;
1464         end = ULONG_MAX;
1465
1466         while (index <= end) {
1467                 int i, nr_pages;
1468                 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1469                                 PAGECACHE_TAG_DIRTY,
1470                                 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1471                 if (nr_pages == 0)
1472                         break;
1473
1474                 for (i = 0; i < nr_pages; i++) {
1475                         struct page *page = pvec.pages[i];
1476                         bool submitted = false;
1477
1478                         if (unlikely(f2fs_cp_error(sbi))) {
1479                                 f2fs_put_page(last_page, 0);
1480                                 pagevec_release(&pvec);
1481                                 ret = -EIO;
1482                                 goto out;
1483                         }
1484
1485                         if (!IS_DNODE(page) || !is_cold_node(page))
1486                                 continue;
1487                         if (ino_of_node(page) != ino)
1488                                 continue;
1489
1490                         lock_page(page);
1491
1492                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1493 continue_unlock:
1494                                 unlock_page(page);
1495                                 continue;
1496                         }
1497                         if (ino_of_node(page) != ino)
1498                                 goto continue_unlock;
1499
1500                         if (!PageDirty(page) && page != last_page) {
1501                                 /* someone wrote it for us */
1502                                 goto continue_unlock;
1503                         }
1504
1505                         f2fs_wait_on_page_writeback(page, NODE, true);
1506                         BUG_ON(PageWriteback(page));
1507
1508                         set_fsync_mark(page, 0);
1509                         set_dentry_mark(page, 0);
1510
1511                         if (!atomic || page == last_page) {
1512                                 set_fsync_mark(page, 1);
1513                                 if (IS_INODE(page)) {
1514                                         if (is_inode_flag_set(inode,
1515                                                                 FI_DIRTY_INODE))
1516                                                 update_inode(inode, page);
1517                                         set_dentry_mark(page,
1518                                                 need_dentry_mark(sbi, ino));
1519                                 }
1520                                 /* may be written by another thread */
1521                                 if (!PageDirty(page))
1522                                         set_page_dirty(page);
1523                         }
1524
1525                         if (!clear_page_dirty_for_io(page))
1526                                 goto continue_unlock;
1527
1528                         ret = __write_node_page(page, atomic &&
1529                                                 page == last_page,
1530                                                 &submitted, wbc, true,
1531                                                 FS_NODE_IO);
1532                         if (ret) {
1533                                 unlock_page(page);
1534                                 f2fs_put_page(last_page, 0);
1535                                 break;
1536                         } else if (submitted) {
1537                                 last_idx = page->index;
1538                         }
1539
1540                         if (page == last_page) {
1541                                 f2fs_put_page(page, 0);
1542                                 marked = true;
1543                                 break;
1544                         }
1545                 }
1546                 pagevec_release(&pvec);
1547                 cond_resched();
1548
1549                 if (ret || marked)
1550                         break;
1551         }
1552         if (!ret && atomic && !marked) {
1553                 f2fs_msg(sbi->sb, KERN_DEBUG,
1554                         "Retry to write fsync mark: ino=%u, idx=%lx",
1555                                         ino, last_page->index);
1556                 lock_page(last_page);
1557                 f2fs_wait_on_page_writeback(last_page, NODE, true);
1558                 set_page_dirty(last_page);
1559                 unlock_page(last_page);
1560                 goto retry;
1561         }
1562 out:
1563         if (last_idx != ULONG_MAX)
1564                 f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE);
1565         return ret ? -EIO : 0;
1566 }
1567
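/*
 * Write back dirty node pages of the whole node mapping in three passes
 * (indirect nodes, then dentry dnodes, then file dnodes); see the step
 * comment inside the loop below.
 */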
1568 int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc,
1569                                 bool do_balance, enum iostat_type io_type)
1570 {
1571         pgoff_t index, end;
1572         struct pagevec pvec;
1573         int step = 0;
1574         int nwritten = 0;
1575         int ret = 0;
1576
1577         pagevec_init(&pvec, 0);
1578
1579 next_step:
1580         index = 0;
1581         end = ULONG_MAX;
1582
1583         while (index <= end) {
1584                 int i, nr_pages;
1585                 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1586                                 PAGECACHE_TAG_DIRTY,
1587                                 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1588                 if (nr_pages == 0)
1589                         break;
1590
1591                 for (i = 0; i < nr_pages; i++) {
1592                         struct page *page = pvec.pages[i];
1593                         bool submitted = false;
1594
1595                         if (unlikely(f2fs_cp_error(sbi))) {
1596                                 pagevec_release(&pvec);
1597                                 ret = -EIO;
1598                                 goto out;
1599                         }
1600
1601                         /*
1602                          * flushing sequence with step:
1603                          * 0. indirect nodes
1604                          * 1. dentry dnodes
1605                          * 2. file dnodes
1606                          */
1607                         if (step == 0 && IS_DNODE(page))
1608                                 continue;
1609                         if (step == 1 && (!IS_DNODE(page) ||
1610                                                 is_cold_node(page)))
1611                                 continue;
1612                         if (step == 2 && (!IS_DNODE(page) ||
1613                                                 !is_cold_node(page)))
1614                                 continue;
1615 lock_node:
1616                         if (!trylock_page(page))
1617                                 continue;
1618
1619                         if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
1620 continue_unlock:
1621                                 unlock_page(page);
1622                                 continue;
1623                         }
1624
1625                         if (!PageDirty(page)) {
1626                                 /* someone wrote it for us */
1627                                 goto continue_unlock;
1628                         }
1629
1630                         /* flush inline_data */
1631                         if (is_inline_node(page)) {
1632                                 clear_inline_node(page);
1633                                 unlock_page(page);
1634                                 flush_inline_data(sbi, ino_of_node(page));
1635                                 goto lock_node;
1636                         }
1637
1638                         f2fs_wait_on_page_writeback(page, NODE, true);
1639
1640                         BUG_ON(PageWriteback(page));
1641                         if (!clear_page_dirty_for_io(page))
1642                                 goto continue_unlock;
1643
1644                         set_fsync_mark(page, 0);
1645                         set_dentry_mark(page, 0);
1646
1647                         ret = __write_node_page(page, false, &submitted,
1648                                                 wbc, do_balance, io_type);
1649                         if (ret)
1650                                 unlock_page(page);
1651                         else if (submitted)
1652                                 nwritten++;
1653
1654                         if (--wbc->nr_to_write == 0)
1655                                 break;
1656                 }
1657                 pagevec_release(&pvec);
1658                 cond_resched();
1659
1660                 if (wbc->nr_to_write == 0) {
1661                         step = 2;
1662                         break;
1663                 }
1664         }
1665
1666         if (step < 2) {
1667                 step++;
1668                 goto next_step;
1669         }
1670 out:
1671         if (nwritten)
1672                 f2fs_submit_merged_write(sbi, NODE);
1673         return ret;
1674 }
1675
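/*
 * Wait for in-flight writeback of node pages that belong to @ino and
 * report -EIO if a page or mapping error was recorded.
 */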
1676 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1677 {
1678         pgoff_t index = 0, end = ULONG_MAX;
1679         struct pagevec pvec;
1680         int ret2, ret = 0;
1681
1682         pagevec_init(&pvec, 0);
1683
1684         while (index <= end) {
1685                 int i, nr_pages;
1686                 nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
1687                                 PAGECACHE_TAG_WRITEBACK,
1688                                 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
1689                 if (nr_pages == 0)
1690                         break;
1691
1692                 for (i = 0; i < nr_pages; i++) {
1693                         struct page *page = pvec.pages[i];
1694
1695                         /* until radix tree lookup accepts end_index */
1696                         if (unlikely(page->index > end))
1697                                 continue;
1698
1699                         if (ino && ino_of_node(page) == ino) {
1700                                 f2fs_wait_on_page_writeback(page, NODE, true);
1701                                 if (TestClearPageError(page))
1702                                         ret = -EIO;
1703                         }
1704                 }
1705                 pagevec_release(&pvec);
1706                 cond_resched();
1707         }
1708
1709         ret2 = filemap_check_errors(NODE_MAPPING(sbi));
1710         if (!ret)
1711                 ret = ret2;
1712         return ret;
1713 }
1714
1715 static int f2fs_write_node_pages(struct address_space *mapping,
1716                             struct writeback_control *wbc)
1717 {
1718         struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1719         struct blk_plug plug;
1720         long diff;
1721
1722         if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1723                 goto skip_write;
1724
1725         /* balancing f2fs's metadata in the background */
1726         f2fs_balance_fs_bg(sbi);
1727
1728         /* collect a number of dirty node pages and write them together */
1729         if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
1730                 goto skip_write;
1731
1732         trace_f2fs_writepages(mapping->host, wbc, NODE);
1733
1734         diff = nr_pages_to_write(sbi, NODE, wbc);
1735         wbc->sync_mode = WB_SYNC_NONE;
1736         blk_start_plug(&plug);
1737         sync_node_pages(sbi, wbc, true, FS_NODE_IO);
1738         blk_finish_plug(&plug);
1739         wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
1740         return 0;
1741
1742 skip_write:
1743         wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
1744         trace_f2fs_writepages(mapping->host, wbc, NODE);
1745         return 0;
1746 }
1747
1748 static int f2fs_set_node_page_dirty(struct page *page)
1749 {
1750         trace_f2fs_set_page_dirty(page, NODE);
1751
1752         if (!PageUptodate(page))
1753                 SetPageUptodate(page);
1754         if (!PageDirty(page)) {
1755                 f2fs_set_page_dirty_nobuffers(page);
1756                 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1757                 SetPagePrivate(page);
1758                 f2fs_trace_pid(page);
1759                 return 1;
1760         }
1761         return 0;
1762 }
1763
1764 /*
1765  * Structure of the f2fs node operations
1766  */
1767 const struct address_space_operations f2fs_node_aops = {
1768         .writepage      = f2fs_write_node_page,
1769         .writepages     = f2fs_write_node_pages,
1770         .set_page_dirty = f2fs_set_node_page_dirty,
1771         .invalidatepage = f2fs_invalidate_page,
1772         .releasepage    = f2fs_release_page,
1773 #ifdef CONFIG_MIGRATION
1774         .migratepage    = f2fs_migrate_page,
1775 #endif
1776 };
1777
1778 static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
1779                                                 nid_t n)
1780 {
1781         return radix_tree_lookup(&nm_i->free_nid_root, n);
1782 }
1783
1784 static int __insert_free_nid(struct f2fs_sb_info *sbi,
1785                         struct free_nid *i, enum nid_state state)
1786 {
1787         struct f2fs_nm_info *nm_i = NM_I(sbi);
1788
1789         int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
1790         if (err)
1791                 return err;
1792
1793         f2fs_bug_on(sbi, state != i->state);
1794         nm_i->nid_cnt[state]++;
1795         if (state == FREE_NID)
1796                 list_add_tail(&i->list, &nm_i->free_nid_list);
1797         return 0;
1798 }
1799
1800 static void __remove_free_nid(struct f2fs_sb_info *sbi,
1801                         struct free_nid *i, enum nid_state state)
1802 {
1803         struct f2fs_nm_info *nm_i = NM_I(sbi);
1804
1805         f2fs_bug_on(sbi, state != i->state);
1806         nm_i->nid_cnt[state]--;
1807         if (state == FREE_NID)
1808                 list_del(&i->list);
1809         radix_tree_delete(&nm_i->free_nid_root, i->nid);
1810 }
1811
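/*
 * Move a free_nid between the FREE_NID and PREALLOC_NID states while
 * keeping the per-state counters and the free_nid_list consistent.
 */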
1812 static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
1813                         enum nid_state org_state, enum nid_state dst_state)
1814 {
1815         struct f2fs_nm_info *nm_i = NM_I(sbi);
1816
1817         f2fs_bug_on(sbi, org_state != i->state);
1818         i->state = dst_state;
1819         nm_i->nid_cnt[org_state]--;
1820         nm_i->nid_cnt[dst_state]++;
1821
1822         switch (dst_state) {
1823         case PREALLOC_NID:
1824                 list_del(&i->list);
1825                 break;
1826         case FREE_NID:
1827                 list_add_tail(&i->list, &nm_i->free_nid_list);
1828                 break;
1829         default:
1830                 BUG_ON(1);
1831         }
1832 }
1833
1834 /* return whether the nid is recognized as free */
1835 static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1836 {
1837         struct f2fs_nm_info *nm_i = NM_I(sbi);
1838         struct free_nid *i, *e;
1839         struct nat_entry *ne;
1840         int err = -EINVAL;
1841         bool ret = false;
1842
1843         /* 0 nid should not be used */
1844         if (unlikely(nid == 0))
1845                 return false;
1846
1847         i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
1848         i->nid = nid;
1849         i->state = FREE_NID;
1850
1851         if (radix_tree_preload(GFP_NOFS))
1852                 goto err;
1853
1854         spin_lock(&nm_i->nid_list_lock);
1855
1856         if (build) {
1857                 /*
1858                  *   Thread A             Thread B
1859                  *  - f2fs_create
1860                  *   - f2fs_new_inode
1861                  *    - alloc_nid
1862                  *     - __insert_nid_to_list(PREALLOC_NID)
1863                  *                     - f2fs_balance_fs_bg
1864                  *                      - build_free_nids
1865                  *                       - __build_free_nids
1866                  *                        - scan_nat_page
1867                  *                         - add_free_nid
1868                  *                          - __lookup_nat_cache
1869                  *  - f2fs_add_link
1870                  *   - init_inode_metadata
1871                  *    - new_inode_page
1872                  *     - new_node_page
1873                  *      - set_node_addr
1874                  *  - alloc_nid_done
1875                  *   - __remove_nid_from_list(PREALLOC_NID)
1876                  *                         - __insert_nid_to_list(FREE_NID)
1877                  */
1878                 ne = __lookup_nat_cache(nm_i, nid);
1879                 if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1880                                 nat_get_blkaddr(ne) != NULL_ADDR))
1881                         goto err_out;
1882
1883                 e = __lookup_free_nid_list(nm_i, nid);
1884                 if (e) {
1885                         if (e->state == FREE_NID)
1886                                 ret = true;
1887                         goto err_out;
1888                 }
1889         }
1890         ret = true;
1891         err = __insert_free_nid(sbi, i, FREE_NID);
1892 err_out:
1893         spin_unlock(&nm_i->nid_list_lock);
1894         radix_tree_preload_end();
1895 err:
1896         if (err)
1897                 kmem_cache_free(free_nid_slab, i);
1898         return ret;
1899 }
1900
1901 static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
1902 {
1903         struct f2fs_nm_info *nm_i = NM_I(sbi);
1904         struct free_nid *i;
1905         bool need_free = false;
1906
1907         spin_lock(&nm_i->nid_list_lock);
1908         i = __lookup_free_nid_list(nm_i, nid);
1909         if (i && i->state == FREE_NID) {
1910                 __remove_free_nid(sbi, i, FREE_NID);
1911                 need_free = true;
1912         }
1913         spin_unlock(&nm_i->nid_list_lock);
1914
1915         if (need_free)
1916                 kmem_cache_free(free_nid_slab, i);
1917 }
1918
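/*
 * Set or clear the free nid bit for @nid; the bitmap is only maintained
 * for NAT blocks that have already been scanned (nat_block_bitmap set).
 */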
1919 static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
1920                                                         bool set, bool build)
1921 {
1922         struct f2fs_nm_info *nm_i = NM_I(sbi);
1923         unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid);
1924         unsigned int nid_ofs = nid - START_NID(nid);
1925
1926         if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
1927                 return;
1928
1929         if (set) {
1930                 if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1931                         return;
1932                 __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1933                 nm_i->free_nid_count[nat_ofs]++;
1934         } else {
1935                 if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
1936                         return;
1937                 __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
1938                 if (!build)
1939                         nm_i->free_nid_count[nat_ofs]--;
1940         }
1941 }
1942
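/*
 * Scan one NAT block and add every nid whose block address is NULL_ADDR
 * to the free nid list and the free nid bitmap.
 */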
1943 static void scan_nat_page(struct f2fs_sb_info *sbi,
1944                         struct page *nat_page, nid_t start_nid)
1945 {
1946         struct f2fs_nm_info *nm_i = NM_I(sbi);
1947         struct f2fs_nat_block *nat_blk = page_address(nat_page);
1948         block_t blk_addr;
1949         unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid);
1950         int i;
1951
1952         if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
1953                 return;
1954
1955         __set_bit_le(nat_ofs, nm_i->nat_block_bitmap);
1956
1957         i = start_nid % NAT_ENTRY_PER_BLOCK;
1958
1959         for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
1960                 bool freed = false;
1961
1962                 if (unlikely(start_nid >= nm_i->max_nid))
1963                         break;
1964
1965                 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1966                 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1967                 if (blk_addr == NULL_ADDR)
1968                         freed = add_free_nid(sbi, start_nid, true);
1969                 spin_lock(&NM_I(sbi)->nid_list_lock);
1970                 update_free_nid_bitmap(sbi, start_nid, freed, true);
1971                 spin_unlock(&NM_I(sbi)->nid_list_lock);
1972         }
1973 }
1974
1975 static void scan_curseg_cache(struct f2fs_sb_info *sbi)
1976 {
1977         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1978         struct f2fs_journal *journal = curseg->journal;
1979         int i;
1980
1981         down_read(&curseg->journal_rwsem);
1982         for (i = 0; i < nats_in_cursum(journal); i++) {
1983                 block_t addr;
1984                 nid_t nid;
1985
1986                 addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
1987                 nid = le32_to_cpu(nid_in_journal(journal, i));
1988                 if (addr == NULL_ADDR)
1989                         add_free_nid(sbi, nid, true);
1990                 else
1991                         remove_free_nid(sbi, nid);
1992         }
1993         up_read(&curseg->journal_rwsem);
1994 }
1995
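/*
 * Collect free nids from the free_nid_bitmap of already-scanned NAT blocks
 * and from the NAT journal, without reading NAT pages from disk.
 */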
1996 static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
1997 {
1998         struct f2fs_nm_info *nm_i = NM_I(sbi);
1999         unsigned int i, idx;
2000         nid_t nid;
2001
2002         down_read(&nm_i->nat_tree_lock);
2003
2004         for (i = 0; i < nm_i->nat_blocks; i++) {
2005                 if (!test_bit_le(i, nm_i->nat_block_bitmap))
2006                         continue;
2007                 if (!nm_i->free_nid_count[i])
2008                         continue;
2009                 for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
2010                         idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
2011                                                 NAT_ENTRY_PER_BLOCK, idx);
2012                         if (idx >= NAT_ENTRY_PER_BLOCK)
2013                                 break;
2014
2015                         nid = i * NAT_ENTRY_PER_BLOCK + idx;
2016                         add_free_nid(sbi, nid, true);
2017
2018                         if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
2019                                 goto out;
2020                 }
2021         }
2022 out:
2023         scan_curseg_cache(sbi);
2024
2025         up_read(&nm_i->nat_tree_lock);
2026 }
2027
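/*
 * Refill the free nid cache: bail out early if enough nids are cached or
 * memory is tight, otherwise scan up to FREE_NID_PAGES NAT pages starting
 * from next_scan_nid and finish with the NAT journal.
 */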
2028 static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2029 {
2030         struct f2fs_nm_info *nm_i = NM_I(sbi);
2031         int i = 0;
2032         nid_t nid = nm_i->next_scan_nid;
2033
2034         if (unlikely(nid >= nm_i->max_nid))
2035                 nid = 0;
2036
2037         /* Enough entries */
2038         if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2039                 return;
2040
2041         if (!sync && !available_free_memory(sbi, FREE_NIDS))
2042                 return;
2043
2044         if (!mount) {
2045                 /* try to find free nids in free_nid_bitmap */
2046                 scan_free_nid_bits(sbi);
2047
2048                 if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
2049                         return;
2050         }
2051
2052         /* readahead nat pages to be scanned */
2053         ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
2054                                                         META_NAT, true);
2055
2056         down_read(&nm_i->nat_tree_lock);
2057
2058         while (1) {
2059                 struct page *page = get_current_nat_page(sbi, nid);
2060
2061                 scan_nat_page(sbi, page, nid);
2062                 f2fs_put_page(page, 1);
2063
2064                 nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
2065                 if (unlikely(nid >= nm_i->max_nid))
2066                         nid = 0;
2067
2068                 if (++i >= FREE_NID_PAGES)
2069                         break;
2070         }
2071
2072         /* go to the next free nat page to find free nids abundantly */
2073         nm_i->next_scan_nid = nid;
2074
2075         /* find free nids from current sum_pages */
2076         scan_curseg_cache(sbi);
2077
2078         up_read(&nm_i->nat_tree_lock);
2079
2080         ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
2081                                         nm_i->ra_nid_pages, META_NAT, false);
2082 }
2083
2084 void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
2085 {
2086         mutex_lock(&NM_I(sbi)->build_lock);
2087         __build_free_nids(sbi, sync, mount);
2088         mutex_unlock(&NM_I(sbi)->build_lock);
2089 }
2090
2091 /*
2092  * If this function returns success, the caller can obtain a new nid
2093  * from the second parameter of this function.
2094  * The returned nid can be used as an ino as well as a nid when an inode is created.
2095  */
2096 bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
2097 {
2098         struct f2fs_nm_info *nm_i = NM_I(sbi);
2099         struct free_nid *i = NULL;
2100 retry:
2101 #ifdef CONFIG_F2FS_FAULT_INJECTION
2102         if (time_to_inject(sbi, FAULT_ALLOC_NID)) {
2103                 f2fs_show_injection_info(FAULT_ALLOC_NID);
2104                 return false;
2105         }
2106 #endif
2107         spin_lock(&nm_i->nid_list_lock);
2108
2109         if (unlikely(nm_i->available_nids == 0)) {
2110                 spin_unlock(&nm_i->nid_list_lock);
2111                 return false;
2112         }
2113
2114         /* We should not use stale free nids created by build_free_nids */
2115         if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) {
2116                 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
2117                 i = list_first_entry(&nm_i->free_nid_list,
2118                                         struct free_nid, list);
2119                 *nid = i->nid;
2120
2121                 __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
2122                 nm_i->available_nids--;
2123
2124                 update_free_nid_bitmap(sbi, *nid, false, false);
2125
2126                 spin_unlock(&nm_i->nid_list_lock);
2127                 return true;
2128         }
2129         spin_unlock(&nm_i->nid_list_lock);
2130
2131         /* Let's scan nat pages and their caches to get free nids */
2132         build_free_nids(sbi, true, false);
2133         goto retry;
2134 }
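
/*
 * Typical caller pattern (sketch, cf. recover_xattr_data() below):
 *
 *	if (!alloc_nid(sbi, &nid))
 *		return -ENOSPC;
 *	... allocate and initialize the node page for @nid ...
 *	on failure:	alloc_nid_failed(sbi, nid);
 *	on success:	alloc_nid_done(sbi, nid);
 */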
2135
2136 /*
2137  * alloc_nid() should be called prior to this function.
2138  */
2139 void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
2140 {
2141         struct f2fs_nm_info *nm_i = NM_I(sbi);
2142         struct free_nid *i;
2143
2144         spin_lock(&nm_i->nid_list_lock);
2145         i = __lookup_free_nid_list(nm_i, nid);
2146         f2fs_bug_on(sbi, !i);
2147         __remove_free_nid(sbi, i, PREALLOC_NID);
2148         spin_unlock(&nm_i->nid_list_lock);
2149
2150         kmem_cache_free(free_nid_slab, i);
2151 }
2152
2153 /*
2154  * alloc_nid() should be called prior to this function.
2155  */
2156 void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
2157 {
2158         struct f2fs_nm_info *nm_i = NM_I(sbi);
2159         struct free_nid *i;
2160         bool need_free = false;
2161
2162         if (!nid)
2163                 return;
2164
2165         spin_lock(&nm_i->nid_list_lock);
2166         i = __lookup_free_nid_list(nm_i, nid);
2167         f2fs_bug_on(sbi, !i);
2168
2169         if (!available_free_memory(sbi, FREE_NIDS)) {
2170                 __remove_free_nid(sbi, i, PREALLOC_NID);
2171                 need_free = true;
2172         } else {
2173                 __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
2174         }
2175
2176         nm_i->available_nids++;
2177
2178         update_free_nid_bitmap(sbi, nid, true, false);
2179
2180         spin_unlock(&nm_i->nid_list_lock);
2181
2182         if (need_free)
2183                 kmem_cache_free(free_nid_slab, i);
2184 }
2185
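/*
 * Shrink the free nid cache under memory pressure: drop up to @nr_shrink
 * FREE_NID entries while keeping at least MAX_FREE_NIDS cached, and return
 * the number of entries actually freed.
 */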
2186 int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
2187 {
2188         struct f2fs_nm_info *nm_i = NM_I(sbi);
2189         struct free_nid *i, *next;
2190         int nr = nr_shrink;
2191
2192         if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2193                 return 0;
2194
2195         if (!mutex_trylock(&nm_i->build_lock))
2196                 return 0;
2197
2198         spin_lock(&nm_i->nid_list_lock);
2199         list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
2200                 if (nr_shrink <= 0 ||
2201                                 nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
2202                         break;
2203
2204                 __remove_free_nid(sbi, i, FREE_NID);
2205                 kmem_cache_free(free_nid_slab, i);
2206                 nr_shrink--;
2207         }
2208         spin_unlock(&nm_i->nid_list_lock);
2209         mutex_unlock(&nm_i->build_lock);
2210
2211         return nr - nr_shrink;
2212 }
2213
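/*
 * During recovery, copy the inline xattr payload from the recovered node
 * page into the inode page and update the inode.
 */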
2214 void recover_inline_xattr(struct inode *inode, struct page *page)
2215 {
2216         void *src_addr, *dst_addr;
2217         size_t inline_size;
2218         struct page *ipage;
2219         struct f2fs_inode *ri;
2220
2221         ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
2222         f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
2223
2224         ri = F2FS_INODE(page);
2225         if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
2226                 clear_inode_flag(inode, FI_INLINE_XATTR);
2227                 goto update_inode;
2228         }
2229
2230         dst_addr = inline_xattr_addr(inode, ipage);
2231         src_addr = inline_xattr_addr(inode, page);
2232         inline_size = inline_xattr_size(inode);
2233
2234         f2fs_wait_on_page_writeback(ipage, NODE, true);
2235         memcpy(dst_addr, src_addr, inline_size);
2236 update_inode:
2237         update_inode(inode, ipage);
2238         f2fs_put_page(ipage, 1);
2239 }
2240
2241 int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
2242 {
2243         struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
2244         nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
2245         nid_t new_xnid;
2246         struct dnode_of_data dn;
2247         struct node_info ni;
2248         struct page *xpage;
2249
2250         if (!prev_xnid)
2251                 goto recover_xnid;
2252
2253         /* 1: invalidate the previous xattr nid */
2254         get_node_info(sbi, prev_xnid, &ni);
2255         f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
2256         invalidate_blocks(sbi, ni.blk_addr);
2257         dec_valid_node_count(sbi, inode, false);
2258         set_node_addr(sbi, &ni, NULL_ADDR, false);
2259
2260 recover_xnid:
2261         /* 2: update xattr nid in inode */
2262         if (!alloc_nid(sbi, &new_xnid))
2263                 return -ENOSPC;
2264
2265         set_new_dnode(&dn, inode, NULL, NULL, new_xnid);
2266         xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
2267         if (IS_ERR(xpage)) {
2268                 alloc_nid_failed(sbi, new_xnid);
2269                 return PTR_ERR(xpage);
2270         }
2271
2272         alloc_nid_done(sbi, new_xnid);
2273         update_inode_page(inode);
2274
2275         /* 3: update and set xattr node page dirty */
2276         memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE);
2277
2278         set_page_dirty(xpage);
2279         f2fs_put_page(xpage, 1);
2280
2281         return 0;
2282 }
2283
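/*
 * Rebuild an in-memory inode page from a node page found during recovery:
 * only the static part of the inode (up to i_ext) is copied, while size,
 * block and link counts are reinitialized.
 */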
2284 int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
2285 {
2286         struct f2fs_inode *src, *dst;
2287         nid_t ino = ino_of_node(page);
2288         struct node_info old_ni, new_ni;
2289         struct page *ipage;
2290
2291         get_node_info(sbi, ino, &old_ni);
2292
2293         if (unlikely(old_ni.blk_addr != NULL_ADDR))
2294                 return -EINVAL;
2295 retry:
2296         ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
2297         if (!ipage) {
2298                 congestion_wait(BLK_RW_ASYNC, HZ/50);
2299                 goto retry;
2300         }
2301
2302         /* Should not use this inode from free nid list */
2303         remove_free_nid(sbi, ino);
2304
2305         if (!PageUptodate(ipage))
2306                 SetPageUptodate(ipage);
2307         fill_node_footer(ipage, ino, ino, 0, true);
2308
2309         src = F2FS_INODE(page);
2310         dst = F2FS_INODE(ipage);
2311
2312         memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
2313         dst->i_size = 0;
2314         dst->i_blocks = cpu_to_le64(1);
2315         dst->i_links = cpu_to_le32(1);
2316         dst->i_xattr_nid = 0;
2317         dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
2318         if (dst->i_inline & F2FS_EXTRA_ATTR) {
2319                 dst->i_extra_isize = src->i_extra_isize;
2320
2321                 if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
2322                         F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2323                                                         i_inline_xattr_size))
2324                         dst->i_inline_xattr_size = src->i_inline_xattr_size;
2325
2326                 if (f2fs_sb_has_project_quota(sbi->sb) &&
2327                         F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
2328                                                                 i_projid))
2329                         dst->i_projid = src->i_projid;
2330         }
2331
2332         new_ni = old_ni;
2333         new_ni.ino = ino;
2334
2335         if (unlikely(inc_valid_node_count(sbi, NULL, true)))
2336                 WARN_ON(1);
2337         set_node_addr(sbi, &new_ni, NEW_ADDR, false);
2338         inc_valid_inode_count(sbi);
2339         set_page_dirty(ipage);
2340         f2fs_put_page(ipage, 1);
2341         return 0;
2342 }
2343
2344 int restore_node_summary(struct f2fs_sb_info *sbi,
2345                         unsigned int segno, struct f2fs_summary_block *sum)
2346 {
2347         struct f2fs_node *rn;
2348         struct f2fs_summary *sum_entry;
2349         block_t addr;
2350         int i, idx, last_offset, nrpages;
2351
2352         /* scan the node segment */
2353         last_offset = sbi->blocks_per_seg;
2354         addr = START_BLOCK(sbi, segno);
2355         sum_entry = &sum->entries[0];
2356
2357         for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
2358                 nrpages = min(last_offset - i, BIO_MAX_PAGES);
2359
2360                 /* readahead node pages */
2361                 ra_meta_pages(sbi, addr, nrpages, META_POR, true);
2362
2363                 for (idx = addr; idx < addr + nrpages; idx++) {
2364                         struct page *page = get_tmp_page(sbi, idx);
2365
2366                         rn = F2FS_NODE(page);
2367                         sum_entry->nid = rn->footer.nid;
2368                         sum_entry->version = 0;
2369                         sum_entry->ofs_in_node = 0;
2370                         sum_entry++;
2371                         f2fs_put_page(page, 1);
2372                 }
2373
2374                 invalidate_mapping_pages(META_MAPPING(sbi), addr,
2375                                                         addr + nrpages);
2376         }
2377         return 0;
2378 }
2379
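/*
 * Drain the NAT journal in the current hot data summary: every journalled
 * entry is merged into the in-memory NAT cache and marked dirty so it is
 * flushed with the other dirty NAT entries.
 */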
2380 static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
2381 {
2382         struct f2fs_nm_info *nm_i = NM_I(sbi);
2383         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2384         struct f2fs_journal *journal = curseg->journal;
2385         int i;
2386
2387         down_write(&curseg->journal_rwsem);
2388         for (i = 0; i < nats_in_cursum(journal); i++) {
2389                 struct nat_entry *ne;
2390                 struct f2fs_nat_entry raw_ne;
2391                 nid_t nid = le32_to_cpu(nid_in_journal(journal, i));
2392
2393                 raw_ne = nat_in_journal(journal, i);
2394
2395                 ne = __lookup_nat_cache(nm_i, nid);
2396                 if (!ne) {
2397                         ne = __alloc_nat_entry(nid, true);
2398                         __init_nat_entry(nm_i, ne, &raw_ne, true);
2399                 }
2400
2401                 /*
2402                  * if a free nat in the journal has not been used after the last
2403                  * checkpoint, we should remove it from the available nids,
2404                  * since we will add it again later.
2405                  */
2406                 if (!get_nat_flag(ne, IS_DIRTY) &&
2407                                 le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
2408                         spin_lock(&nm_i->nid_list_lock);
2409                         nm_i->available_nids--;
2410                         spin_unlock(&nm_i->nid_list_lock);
2411                 }
2412
2413                 __set_nat_cache_dirty(nm_i, ne);
2414         }
2415         update_nats_in_cursum(journal, -i);
2416         up_write(&curseg->journal_rwsem);
2417 }
2418
2419 static void __adjust_nat_entry_set(struct nat_entry_set *nes,
2420                                                 struct list_head *head, int max)
2421 {
2422         struct nat_entry_set *cur;
2423
2424         if (nes->entry_cnt >= max)
2425                 goto add_out;
2426
2427         list_for_each_entry(cur, head, set_list) {
2428                 if (cur->entry_cnt >= nes->entry_cnt) {
2429                         list_add(&nes->set_list, cur->set_list.prev);
2430                         return;
2431                 }
2432         }
2433 add_out:
2434         list_add_tail(&nes->set_list, head);
2435 }
2436
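/*
 * Recompute the full/empty nat_bits for the NAT block starting at
 * @start_nid after its entries have been written to @page.
 */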
2437 static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
2438                                                 struct page *page)
2439 {
2440         struct f2fs_nm_info *nm_i = NM_I(sbi);
2441         unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
2442         struct f2fs_nat_block *nat_blk = page_address(page);
2443         int valid = 0;
2444         int i = 0;
2445
2446         if (!enabled_nat_bits(sbi, NULL))
2447                 return;
2448
2449         if (nat_index == 0) {
2450                 valid = 1;
2451                 i = 1;
2452         }
2453         for (; i < NAT_ENTRY_PER_BLOCK; i++) {
2454                 if (nat_blk->entries[i].block_addr != NULL_ADDR)
2455                         valid++;
2456         }
2457         if (valid == 0) {
2458                 __set_bit_le(nat_index, nm_i->empty_nat_bits);
2459                 __clear_bit_le(nat_index, nm_i->full_nat_bits);
2460                 return;
2461         }
2462
2463         __clear_bit_le(nat_index, nm_i->empty_nat_bits);
2464         if (valid == NAT_ENTRY_PER_BLOCK)
2465                 __set_bit_le(nat_index, nm_i->full_nat_bits);
2466         else
2467                 __clear_bit_le(nat_index, nm_i->full_nat_bits);
2468 }
2469
2470 static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
2471                 struct nat_entry_set *set, struct cp_control *cpc)
2472 {
2473         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2474         struct f2fs_journal *journal = curseg->journal;
2475         nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
2476         bool to_journal = true;
2477         struct f2fs_nat_block *nat_blk;
2478         struct nat_entry *ne, *cur;
2479         struct page *page = NULL;
2480
2481         /*
2482          * there are two steps to flush nat entries:
2483          * #1, flush nat entries to the journal in the current hot data summary block.
2484          * #2, flush nat entries to the nat page.
2485          */
2486         if (enabled_nat_bits(sbi, cpc) ||
2487                 !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
2488                 to_journal = false;
2489
2490         if (to_journal) {
2491                 down_write(&curseg->journal_rwsem);
2492         } else {
2493                 page = get_next_nat_page(sbi, start_nid);
2494                 nat_blk = page_address(page);
2495                 f2fs_bug_on(sbi, !nat_blk);
2496         }
2497
2498         /* flush dirty nats in nat entry set */
2499         list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
2500                 struct f2fs_nat_entry *raw_ne;
2501                 nid_t nid = nat_get_nid(ne);
2502                 int offset;
2503
2504                 f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);
2505
2506                 if (to_journal) {
2507                         offset = lookup_journal_in_cursum(journal,
2508                                                         NAT_JOURNAL, nid, 1);
2509                         f2fs_bug_on(sbi, offset < 0);
2510                         raw_ne = &nat_in_journal(journal, offset);
2511                         nid_in_journal(journal, offset) = cpu_to_le32(nid);
2512                 } else {
2513                         raw_ne = &nat_blk->entries[nid - start_nid];
2514                 }
2515                 raw_nat_from_node_info(raw_ne, &ne->ni);
2516                 nat_reset_flag(ne);
2517                 __clear_nat_cache_dirty(NM_I(sbi), set, ne);
2518                 if (nat_get_blkaddr(ne) == NULL_ADDR) {
2519                         add_free_nid(sbi, nid, false);
2520                         spin_lock(&NM_I(sbi)->nid_list_lock);
2521                         NM_I(sbi)->available_nids++;
2522                         update_free_nid_bitmap(sbi, nid, true, false);
2523                         spin_unlock(&NM_I(sbi)->nid_list_lock);
2524                 } else {
2525                         spin_lock(&NM_I(sbi)->nid_list_lock);
2526                         update_free_nid_bitmap(sbi, nid, false, false);
2527                         spin_unlock(&NM_I(sbi)->nid_list_lock);
2528                 }
2529         }
2530
2531         if (to_journal) {
2532                 up_write(&curseg->journal_rwsem);
2533         } else {
2534                 __update_nat_bits(sbi, start_nid, page);
2535                 f2fs_put_page(page, 1);
2536         }
2537
2538         /* Allow dirty nats by node block allocation in write_begin */
2539         if (!set->entry_cnt) {
2540                 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
2541                 kmem_cache_free(nat_entry_set_slab, set);
2542         }
2543 }
2544
2545 /*
2546  * This function is called during the checkpointing process.
2547  */
2548 void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
2549 {
2550         struct f2fs_nm_info *nm_i = NM_I(sbi);
2551         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
2552         struct f2fs_journal *journal = curseg->journal;
2553         struct nat_entry_set *setvec[SETVEC_SIZE];
2554         struct nat_entry_set *set, *tmp;
2555         unsigned int found;
2556         nid_t set_idx = 0;
2557         LIST_HEAD(sets);
2558
2559         if (!nm_i->dirty_nat_cnt)
2560                 return;
2561
2562         down_write(&nm_i->nat_tree_lock);
2563
2564         /*
2565          * if there is not enough space in the journal to store dirty nat
2566          * entries, remove all entries from the journal and merge them
2567          * into nat entry sets.
2568          */
2569         if (enabled_nat_bits(sbi, cpc) ||
2570                 !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
2571                 remove_nats_in_journal(sbi);
2572
2573         while ((found = __gang_lookup_nat_set(nm_i,
2574                                         set_idx, SETVEC_SIZE, setvec))) {
2575                 unsigned idx;
2576                 set_idx = setvec[found - 1]->set + 1;
2577                 for (idx = 0; idx < found; idx++)
2578                         __adjust_nat_entry_set(setvec[idx], &sets,
2579                                                 MAX_NAT_JENTRIES(journal));
2580         }
2581
2582         /* flush dirty nats in nat entry set */
2583         list_for_each_entry_safe(set, tmp, &sets, set_list)
2584                 __flush_nat_entry_set(sbi, set, cpc);
2585
2586         up_write(&nm_i->nat_tree_lock);
2587         /* Allow dirty nats by node block allocation in write_begin */
2588 }
2589
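/*
 * Read the nat_bits blocks stored at the tail of the checkpoint segment
 * and validate them against the checkpoint version; on mismatch, nat_bits
 * support is disabled.
 */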
2590 static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
2591 {
2592         struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
2593         struct f2fs_nm_info *nm_i = NM_I(sbi);
2594         unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE;
2595         unsigned int i;
2596         __u64 cp_ver = cur_cp_version(ckpt);
2597         block_t nat_bits_addr;
2598
2599         if (!enabled_nat_bits(sbi, NULL))
2600                 return 0;
2601
2602         nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 +
2603                                                 F2FS_BLKSIZE - 1);
2604         nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS,
2605                                                 GFP_KERNEL);
2606         if (!nm_i->nat_bits)
2607                 return -ENOMEM;
2608
2609         nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
2610                                                 nm_i->nat_bits_blocks;
2611         for (i = 0; i < nm_i->nat_bits_blocks; i++) {
2612                 struct page *page = get_meta_page(sbi, nat_bits_addr++);
2613
2614                 memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS),
2615                                         page_address(page), F2FS_BLKSIZE);
2616                 f2fs_put_page(page, 1);
2617         }
2618
2619         cp_ver |= (cur_cp_crc(ckpt) << 32);
2620         if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) {
2621                 disable_nat_bits(sbi, true);
2622                 return 0;
2623         }
2624
2625         nm_i->full_nat_bits = nm_i->nat_bits + 8;
2626         nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes;
2627
2628         f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint");
2629         return 0;
2630 }
2631
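/*
 * Preload free nid state from nat_bits: nids in empty NAT blocks are marked
 * free, and both empty and full blocks are flagged as already scanned in
 * nat_block_bitmap.
 */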
2632 static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
2633 {
2634         struct f2fs_nm_info *nm_i = NM_I(sbi);
2635         unsigned int i = 0;
2636         nid_t nid, last_nid;
2637
2638         if (!enabled_nat_bits(sbi, NULL))
2639                 return;
2640
2641         for (i = 0; i < nm_i->nat_blocks; i++) {
2642                 i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i);
2643                 if (i >= nm_i->nat_blocks)
2644                         break;
2645
2646                 __set_bit_le(i, nm_i->nat_block_bitmap);
2647
2648                 nid = i * NAT_ENTRY_PER_BLOCK;
2649                 last_nid = nid + NAT_ENTRY_PER_BLOCK;
2650
2651                 spin_lock(&NM_I(sbi)->nid_list_lock);
2652                 for (; nid < last_nid; nid++)
2653                         update_free_nid_bitmap(sbi, nid, true, true);
2654                 spin_unlock(&NM_I(sbi)->nid_list_lock);
2655         }
2656
2657         for (i = 0; i < nm_i->nat_blocks; i++) {
2658                 i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i);
2659                 if (i >= nm_i->nat_blocks)
2660                         break;
2661
2662                 __set_bit_le(i, nm_i->nat_block_bitmap);
2663         }
2664 }
2665
2666 static int init_node_manager(struct f2fs_sb_info *sbi)
2667 {
2668         struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi);
2669         struct f2fs_nm_info *nm_i = NM_I(sbi);
2670         unsigned char *version_bitmap;
2671         unsigned int nat_segs;
2672         int err;
2673
2674         nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
2675
2676         /* segment_count_nat includes the pair segment, so divide by 2. */
2677         nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
2678         nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
2679         nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks;
2680
2681         /* unused nids: 0, node, meta (and root is counted as a valid node) */
2682         nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
2683                                                         F2FS_RESERVED_NODE_NUM;
2684         nm_i->nid_cnt[FREE_NID] = 0;
2685         nm_i->nid_cnt[PREALLOC_NID] = 0;
2686         nm_i->nat_cnt = 0;
2687         nm_i->ram_thresh = DEF_RAM_THRESHOLD;
2688         nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
2689         nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
2690
2691         INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
2692         INIT_LIST_HEAD(&nm_i->free_nid_list);
2693         INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
2694         INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
2695         INIT_LIST_HEAD(&nm_i->nat_entries);
2696
2697         mutex_init(&nm_i->build_lock);
2698         spin_lock_init(&nm_i->nid_list_lock);
2699         init_rwsem(&nm_i->nat_tree_lock);
2700
2701         nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
2702         nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
2703         version_bitmap = __bitmap_ptr(sbi, NAT_BITMAP);
2704         if (!version_bitmap)
2705                 return -EFAULT;
2706
2707         nm_i->nat_bitmap = kmemdup(version_bitmap, nm_i->bitmap_size,
2708                                         GFP_KERNEL);
2709         if (!nm_i->nat_bitmap)
2710                 return -ENOMEM;
2711
2712         err = __get_nat_bitmaps(sbi);
2713         if (err)
2714                 return err;
2715
2716 #ifdef CONFIG_F2FS_CHECK_FS
2717         nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size,
2718                                         GFP_KERNEL);
2719         if (!nm_i->nat_bitmap_mir)
2720                 return -ENOMEM;
2721 #endif
2722
2723         return 0;
2724 }
2725
2726 static int init_free_nid_cache(struct f2fs_sb_info *sbi)
2727 {
2728         struct f2fs_nm_info *nm_i = NM_I(sbi);
2729
2730         nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks *
2731                                         NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL);
2732         if (!nm_i->free_nid_bitmap)
2733                 return -ENOMEM;
2734
2735         nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8,
2736                                                                 GFP_KERNEL);
2737         if (!nm_i->nat_block_bitmap)
2738                 return -ENOMEM;
2739
2740         nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks *
2741                                         sizeof(unsigned short), GFP_KERNEL);
2742         if (!nm_i->free_nid_count)
2743                 return -ENOMEM;
2744         return 0;
2745 }
2746
2747 int build_node_manager(struct f2fs_sb_info *sbi)
2748 {
2749         int err;
2750
2751         sbi->nm_info = kzalloc(sizeof(struct f2fs_nm_info), GFP_KERNEL);
2752         if (!sbi->nm_info)
2753                 return -ENOMEM;
2754
2755         err = init_node_manager(sbi);
2756         if (err)
2757                 return err;
2758
2759         err = init_free_nid_cache(sbi);
2760         if (err)
2761                 return err;
2762
2763         /* load free nid status from nat_bits table */
2764         load_free_nid_bitmap(sbi);
2765
2766         build_free_nids(sbi, true, true);
2767         return 0;
2768 }
2769
2770 void destroy_node_manager(struct f2fs_sb_info *sbi)
2771 {
2772         struct f2fs_nm_info *nm_i = NM_I(sbi);
2773         struct free_nid *i, *next_i;
2774         struct nat_entry *natvec[NATVEC_SIZE];
2775         struct nat_entry_set *setvec[SETVEC_SIZE];
2776         nid_t nid = 0;
2777         unsigned int found;
2778
2779         if (!nm_i)
2780                 return;
2781
2782         /* destroy free nid list */
2783         spin_lock(&nm_i->nid_list_lock);
2784         list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
2785                 __remove_free_nid(sbi, i, FREE_NID);
2786                 spin_unlock(&nm_i->nid_list_lock);
2787                 kmem_cache_free(free_nid_slab, i);
2788                 spin_lock(&nm_i->nid_list_lock);
2789         }
2790         f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
2791         f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
2792         f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
2793         spin_unlock(&nm_i->nid_list_lock);
2794
2795         /* destroy nat cache */
2796         down_write(&nm_i->nat_tree_lock);
2797         while ((found = __gang_lookup_nat_cache(nm_i,
2798                                         nid, NATVEC_SIZE, natvec))) {
2799                 unsigned idx;
2800
2801                 nid = nat_get_nid(natvec[found - 1]) + 1;
2802                 for (idx = 0; idx < found; idx++)
2803                         __del_from_nat_cache(nm_i, natvec[idx]);
2804         }
2805         f2fs_bug_on(sbi, nm_i->nat_cnt);
2806
2807         /* destroy nat set cache */
2808         nid = 0;
2809         while ((found = __gang_lookup_nat_set(nm_i,
2810                                         nid, SETVEC_SIZE, setvec))) {
2811                 unsigned idx;
2812
2813                 nid = setvec[found - 1]->set + 1;
2814                 for (idx = 0; idx < found; idx++) {
2815                         /* entry_cnt is not zero when a cp_error has occurred */
2816                         f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2817                         radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2818                         kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2819                 }
2820         }
2821         up_write(&nm_i->nat_tree_lock);
2822
2823         kvfree(nm_i->nat_block_bitmap);
2824         kvfree(nm_i->free_nid_bitmap);
2825         kvfree(nm_i->free_nid_count);
2826
2827         kfree(nm_i->nat_bitmap);
2828         kfree(nm_i->nat_bits);
2829 #ifdef CONFIG_F2FS_CHECK_FS
2830         kfree(nm_i->nat_bitmap_mir);
2831 #endif
2832         sbi->nm_info = NULL;
2833         kfree(nm_i);
2834 }
2835
2836 int __init create_node_manager_caches(void)
2837 {
2838         nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
2839                         sizeof(struct nat_entry));
2840         if (!nat_entry_slab)
2841                 goto fail;
2842
2843         free_nid_slab = f2fs_kmem_cache_create("free_nid",
2844                         sizeof(struct free_nid));
2845         if (!free_nid_slab)
2846                 goto destroy_nat_entry;
2847
2848         nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2849                         sizeof(struct nat_entry_set));
2850         if (!nat_entry_set_slab)
2851                 goto destroy_free_nid;
2852         return 0;
2853
2854 destroy_free_nid:
2855         kmem_cache_destroy(free_nid_slab);
2856 destroy_nat_entry:
2857         kmem_cache_destroy(nat_entry_slab);
2858 fail:
2859         return -ENOMEM;
2860 }
2861
2862 void destroy_node_manager_caches(void)
2863 {
2864         kmem_cache_destroy(nat_entry_set_slab);
2865         kmem_cache_destroy(free_nid_slab);
2866         kmem_cache_destroy(nat_entry_slab);
2867 }