diff --git a/Dump/ufs/ffs/ffs_alloc.c b/Dump/ufs/ffs/ffs_alloc.c new file mode 100644 index 0000000..042d4e6 --- /dev/null +++ b/Dump/ufs/ffs/ffs_alloc.c @@ -0,0 +1,3237 @@ +/*- + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * @(#)ffs_alloc.c	8.18 (Berkeley) 5/26/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_alloc.c 331722 2018-03-29 02:50:57Z eadler $");
+
+#include "opt_quota.h"
+
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/kernel.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/taskqueue.h>
+
+#include <security/audit/audit.h>
+
+#include <geom/geom.h>
+
+#include <ufs/ufs/dir.h>
+#include <ufs/ufs/extattr.h>
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+#include <ufs/ufs/ufs_extern.h>
+#include <ufs/ufs/ufsmount.h>
+
+#include <ufs/ffs/fs.h>
+#include <ufs/ffs/ffs_extern.h>
+#include <ufs/ffs/softdep.h>
+
+typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
+		    int size, int rsize);
+
+static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
+static ufs2_daddr_t
+	    ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
+static void	ffs_blkfree_cg(struct ufsmount *, struct fs *,
+		    struct vnode *, ufs2_daddr_t, long, ino_t,
+		    struct workhead *);
+static void	ffs_blkfree_trim_completed(struct bio *);
+static void	ffs_blkfree_trim_task(void *ctx, int pending __unused);
+#ifdef INVARIANTS
+static int	ffs_checkblk(struct inode *, ufs2_daddr_t, long);
+#endif
+static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
+static ino_t	ffs_dirpref(struct inode *);
+static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
+		    int, int);
+static ufs2_daddr_t ffs_hashalloc
+		(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
+static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
+		    int);
+static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
+static int	ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
+static int	ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
+
+/*
+ * Allocate a block in the filesystem.
+ *
+ * The size of the requested block is given, which must be some
+ * multiple of fs_fsize and <= fs_bsize.
+ * A preference may be optionally specified. If a preference is given
+ * the following hierarchy is used to allocate a block:
+ *   1) allocate the requested block.
+ *   2) allocate a rotationally optimal block in the same cylinder.
+ *   3) allocate a block in the same cylinder group.
+ *   4) quadratically rehash into other cylinder groups, until an
+ *      available block is located.
+ * If no block preference is given the following hierarchy is used
+ * to allocate a block:
+ *   1) allocate a block in the cylinder group that contains the
+ *      inode for the file.
+ *   2) quadratically rehash into other cylinder groups, until an
+ *      available block is located.
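+ */
+
+/*
+ * Editor's note: the following stand-alone sketch is not part of the
+ * original file. It is a minimal user-space model of the search order
+ * described above and implemented by ffs_hashalloc() further below:
+ * try the preferred cylinder group, then quadratically rehash, then
+ * fall back to a brute-force scan. The name search_order_example()
+ * and the cg_has_space_t callback are hypothetical simplifications,
+ * not kernel interfaces.
+ */
+typedef int (*cg_has_space_t)(unsigned int cg, void *arg);
+
+static int
+search_order_example(unsigned int ncg, unsigned int prefcg,
+    cg_has_space_t cg_has_space, void *arg)
+{
+	unsigned int cg, i;
+
+	/* 1: preferred cylinder group */
+	if (cg_has_space(prefcg, arg))
+		return ((int)prefcg);
+	/* 2: quadratic rehash: prefcg+1, +3, +7, ... modulo ncg */
+	for (cg = prefcg, i = 1; i < ncg; i *= 2) {
+		cg += i;
+		if (cg >= ncg)
+			cg -= ncg;
+		if (cg_has_space(cg, arg))
+			return ((int)cg);
+	}
+	/* 3: brute force, starting past the groups probed first */
+	for (cg = (prefcg + 2) % ncg, i = 2; i < ncg; i++) {
+		if (cg_has_space(cg, arg))
+			return ((int)cg);
+		if (++cg == ncg)
+			cg = 0;
+	}
+	return (-1);	/* no cylinder group has space */
+}
+
+/*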
+ */ +int +ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) + struct inode *ip; + ufs2_daddr_t lbn, bpref; + int size, flags; + struct ucred *cred; + ufs2_daddr_t *bnp; +{ + struct fs *fs; + struct ufsmount *ump; + ufs2_daddr_t bno; + u_int cg, reclaimed; + static struct timeval lastfail; + static int curfail; + int64_t delta; +#ifdef QUOTA + int error; +#endif + + *bnp = 0; + ump = ITOUMP(ip); + fs = ump->um_fs; + mtx_assert(UFS_MTX(ump), MA_OWNED); +#ifdef INVARIANTS + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", + devtoname(ump->um_dev), (long)fs->fs_bsize, size, + fs->fs_fsmnt); + panic("ffs_alloc: bad size"); + } + if (cred == NOCRED) + panic("ffs_alloc: missing credential"); +#endif /* INVARIANTS */ + reclaimed = 0; +retry: +#ifdef QUOTA + UFS_UNLOCK(ump); + error = chkdq(ip, btodb(size), cred, 0); + if (error) + return (error); + UFS_LOCK(ump); +#endif + if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) + goto nospace; + if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && + freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) + goto nospace; + if (bpref >= fs->fs_size) + bpref = 0; + if (bpref == 0) + cg = ino_to_cg(fs, ip->i_number); + else + cg = dtog(fs, bpref); + bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); + if (bno > 0) { + delta = btodb(size); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); + if (flags & IO_EXT) + ip->i_flag |= IN_CHANGE; + else + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bnp = bno; + return (0); + } +nospace: +#ifdef QUOTA + UFS_UNLOCK(ump); + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(size), cred, FORCE); + UFS_LOCK(ump); +#endif + if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { + reclaimed = 1; + softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); + goto retry; + } + UFS_UNLOCK(ump); + if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem is full\n", + fs->fs_fsmnt); + } + return (ENOSPC); +} + +/* + * Reallocate a fragment to a bigger size + * + * The number and size of the old block is given, and a preference + * and new size is also specified. The allocator attempts to extend + * the original block. Failing that, the regular block allocator is + * invoked to get an appropriate block. + */ +int +ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) + struct inode *ip; + ufs2_daddr_t lbprev; + ufs2_daddr_t bprev; + ufs2_daddr_t bpref; + int osize, nsize, flags; + struct ucred *cred; + struct buf **bpp; +{ + struct vnode *vp; + struct fs *fs; + struct buf *bp; + struct ufsmount *ump; + u_int cg, request, reclaimed; + int error, gbflags; + ufs2_daddr_t bno; + static struct timeval lastfail; + static int curfail; + int64_t delta; + + vp = ITOV(ip); + ump = ITOUMP(ip); + fs = ump->um_fs; + bp = NULL; + gbflags = (flags & BA_UNMAPPED) != 0 ? 
GB_UNMAPPED : 0; + + mtx_assert(UFS_MTX(ump), MA_OWNED); +#ifdef INVARIANTS + if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) + panic("ffs_realloccg: allocation on suspended filesystem"); + if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || + (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { + printf( + "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", + devtoname(ump->um_dev), (long)fs->fs_bsize, osize, + nsize, fs->fs_fsmnt); + panic("ffs_realloccg: bad size"); + } + if (cred == NOCRED) + panic("ffs_realloccg: missing credential"); +#endif /* INVARIANTS */ + reclaimed = 0; +retry: + if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) && + freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { + goto nospace; + } + if (bprev == 0) { + printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", + devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, + fs->fs_fsmnt); + panic("ffs_realloccg: bad bprev"); + } + UFS_UNLOCK(ump); + /* + * Allocate the extra space in the buffer. + */ + error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); + if (error) { + brelse(bp); + return (error); + } + + if (bp->b_blkno == bp->b_lblkno) { + if (lbprev >= NDADDR) + panic("ffs_realloccg: lbprev out of range"); + bp->b_blkno = fsbtodb(fs, bprev); + } + +#ifdef QUOTA + error = chkdq(ip, btodb(nsize - osize), cred, 0); + if (error) { + brelse(bp); + return (error); + } +#endif + /* + * Check for extension in the existing location. + */ + *bpp = NULL; + cg = dtog(fs, bprev); + UFS_LOCK(ump); + bno = ffs_fragextend(ip, cg, bprev, osize, nsize); + if (bno) { + if (bp->b_blkno != fsbtodb(fs, bno)) + panic("ffs_realloccg: bad blockno"); + delta = btodb(nsize - osize); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); + if (flags & IO_EXT) + ip->i_flag |= IN_CHANGE; + else + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + vfs_bio_bzero_buf(bp, osize, nsize - osize); + if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) + vfs_bio_set_valid(bp, osize, nsize - osize); + *bpp = bp; + return (0); + } + /* + * Allocate a new disk location. + */ + if (bpref >= fs->fs_size) + bpref = 0; + switch ((int)fs->fs_optim) { + case FS_OPTSPACE: + /* + * Allocate an exact sized fragment. Although this makes + * best use of space, we will waste time relocating it if + * the file continues to grow. If the fragmentation is + * less than half of the minimum free reserve, we choose + * to begin optimizing for time. + */ + request = nsize; + if (fs->fs_minfree <= 5 || + fs->fs_cstotal.cs_nffree > + (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) + break; + log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTTIME; + break; + case FS_OPTTIME: + /* + * At this point we have discovered a file that is trying to + * grow a small fragment to a larger fragment. To save time, + * we allocate a full sized block, then free the unused portion. + * If the file continues to grow, the `ffs_fragextend' call + * above will be able to grow it in place without further + * copying. If aberrant programs cause disk fragmentation to + * grow within 2% of the free reserve, we choose to begin + * optimizing for space. 
+ */ + request = fs->fs_bsize; + if (fs->fs_cstotal.cs_nffree < + (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) + break; + log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", + fs->fs_fsmnt); + fs->fs_optim = FS_OPTSPACE; + break; + default: + printf("dev = %s, optim = %ld, fs = %s\n", + devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); + panic("ffs_realloccg: bad optim"); + /* NOTREACHED */ + } + bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); + if (bno > 0) { + bp->b_blkno = fsbtodb(fs, bno); + if (!DOINGSOFTDEP(vp)) + ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, + ip->i_number, vp->v_type, NULL); + delta = btodb(nsize - osize); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); + if (flags & IO_EXT) + ip->i_flag |= IN_CHANGE; + else + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, nsize); + bp->b_flags |= B_DONE; + vfs_bio_bzero_buf(bp, osize, nsize - osize); + if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) + vfs_bio_set_valid(bp, osize, nsize - osize); + *bpp = bp; + return (0); + } +#ifdef QUOTA + UFS_UNLOCK(ump); + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); + UFS_LOCK(ump); +#endif +nospace: + /* + * no space available + */ + if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { + reclaimed = 1; + UFS_UNLOCK(ump); + if (bp) { + brelse(bp); + bp = NULL; + } + UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); + goto retry; + } + UFS_UNLOCK(ump); + if (bp) + brelse(bp); + if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem is full\n", + fs->fs_fsmnt); + } + return (ENOSPC); +} + +/* + * Reallocate a sequence of blocks into a contiguous sequence of blocks. + * + * The vnode and an array of buffer pointers for a range of sequential + * logical blocks to be made contiguous is given. The allocator attempts + * to find a range of sequential blocks starting as close as possible + * from the end of the allocation for the logical block immediately + * preceding the current range. If successful, the physical block numbers + * in the buffer pointers and in the inode are changed to reflect the new + * allocation. If unsuccessful, the allocation is left unchanged. The + * success in doing the reallocation is returned. Note that the error + * return is not reflected back to the user. Rather the previous block + * allocation will be used. + */ + +SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); + +static int doasyncfree = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, +"do not force synchronous writes when blocks are reallocated"); + +static int doreallocblks = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, +"enable block reallocation"); + +static int maxclustersearch = 10; +SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, +0, "max number of cylinder group to search for contigous blocks"); + +#ifdef DEBUG +static volatile int prtrealloc = 0; +#endif + +int +ffs_reallocblks(ap) + struct vop_reallocblks_args /* { + struct vnode *a_vp; + struct cluster_save *a_buflist; + } */ *ap; +{ + struct ufsmount *ump; + + /* + * If the underlying device can do deletes, then skip reallocating + * the blocks of this file into contiguous sequences. Devices that + * benefit from BIO_DELETE also benefit from not moving the data. 
+ * These devices are flash and therefore work less well with this + * optimization. Also skip if reallocblks has been disabled globally. + */ + ump = ap->a_vp->v_mount->mnt_data; + if (ump->um_candelete || doreallocblks == 0) + return (ENOSPC); + + /* + * We can't wait in softdep prealloc as it may fsync and recurse + * here. Instead we simply fail to reallocate blocks if this + * rare condition arises. + */ + if (DOINGSOFTDEP(ap->a_vp)) + if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) + return (ENOSPC); + if (ump->um_fstype == UFS1) + return (ffs_reallocblks_ufs1(ap)); + return (ffs_reallocblks_ufs2(ap)); +} + +static int +ffs_reallocblks_ufs1(ap) + struct vop_reallocblks_args /* { + struct vnode *a_vp; + struct cluster_save *a_buflist; + } */ *ap; +{ + struct fs *fs; + struct inode *ip; + struct vnode *vp; + struct buf *sbp, *ebp; + ufs1_daddr_t *bap, *sbap, *ebap; + struct cluster_save *buflist; + struct ufsmount *ump; + ufs_lbn_t start_lbn, end_lbn; + ufs1_daddr_t soff, newblk, blkno; + ufs2_daddr_t pref; + struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + int i, cg, len, start_lvl, end_lvl, ssize; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ITOUMP(ip); + fs = ump->um_fs; + /* + * If we are not tracking block clusters or if we have less than 4% + * free blocks left, then do not attempt to cluster. Running with + * less than 5% free block reserve is not recommended and those that + * choose to do so do not expect to have good file layout. + */ + if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) + return (ENOSPC); + buflist = ap->a_buflist; + len = buflist->bs_nchildren; + start_lbn = buflist->bs_children[0]->b_lblkno; + end_lbn = start_lbn + len - 1; +#ifdef INVARIANTS + for (i = 0; i < len; i++) + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 1"); + for (i = 1; i < len; i++) + if (buflist->bs_children[i]->b_lblkno != start_lbn + i) + panic("ffs_reallocblks: non-logical cluster"); + blkno = buflist->bs_children[0]->b_blkno; + ssize = fsbtodb(fs, fs->fs_frag); + for (i = 1; i < len - 1; i++) + if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) + panic("ffs_reallocblks: non-physical cluster %d", i); +#endif + /* + * If the cluster crosses the boundary for the first indirect + * block, leave space for the indirect block. Indirect blocks + * are initially laid out in a position after the last direct + * block. Block reallocation would usually destroy locality by + * moving the indirect block out of the way to make room for + * data blocks if we didn't compensate here. We should also do + * this for other indirect block boundaries, but it is only + * important for the first one. + */ + if (start_lbn < NDADDR && end_lbn >= NDADDR) + return (ENOSPC); + /* + * If the latest allocation is in a new cylinder group, assume that + * the filesystem has decided to move and do not force it back to + * the previous cylinder group. + */ + if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != + dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) + return (ENOSPC); + if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || + ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) + return (ENOSPC); + /* + * Get the starting offset and block map for the first block. 
+ */ + if (start_lvl == 0) { + sbap = &ip->i_din1->di_db[0]; + soff = start_lbn; + } else { + idp = &start_ap[start_lvl - 1]; + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { + brelse(sbp); + return (ENOSPC); + } + sbap = (ufs1_daddr_t *)sbp->b_data; + soff = idp->in_off; + } + /* + * If the block range spans two block maps, get the second map. + */ + ebap = NULL; + if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { + ssize = len; + } else { +#ifdef INVARIANTS + if (start_lvl > 0 && + start_ap[start_lvl - 1].in_lbn == idp->in_lbn) + panic("ffs_reallocblk: start == end"); +#endif + ssize = len - (idp->in_off + 1); + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) + goto fail; + ebap = (ufs1_daddr_t *)ebp->b_data; + } + /* + * Find the preferred location for the cluster. If we have not + * previously failed at this endeavor, then follow our standard + * preference calculation. If we have failed at it, then pick up + * where we last ended our search. + */ + UFS_LOCK(ump); + if (ip->i_nextclustercg == -1) + pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); + else + pref = cgdata(fs, ip->i_nextclustercg); + /* + * Search the block map looking for an allocation of the desired size. + * To avoid wasting too much time, we limit the number of cylinder + * groups that we will search. + */ + cg = dtog(fs, pref); + for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { + if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) + break; + cg += 1; + if (cg >= fs->fs_ncg) + cg = 0; + } + /* + * If we have failed in our search, record where we gave up for + * next time. Otherwise, fall back to our usual search citerion. + */ + if (newblk == 0) { + ip->i_nextclustercg = cg; + UFS_UNLOCK(ump); + goto fail; + } + ip->i_nextclustercg = -1; + /* + * We have found a new contiguous block. + * + * First we have to replace the old block pointers with the new + * block pointers in the inode and indirect blocks associated + * with the file. + */ +#ifdef DEBUG + if (prtrealloc) + printf("realloc: ino %ju, lbns %jd-%jd\n\told:", + (uintmax_t)ip->i_number, + (intmax_t)start_lbn, (intmax_t)end_lbn); +#endif + blkno = newblk; + for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { + if (i == ssize) { + bap = ebap; + soff = -i; + } +#ifdef INVARIANTS + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 2"); + if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) + panic("ffs_reallocblks: alloc mismatch"); +#endif +#ifdef DEBUG + if (prtrealloc) + printf(" %d,", *bap); +#endif + if (DOINGSOFTDEP(vp)) { + if (sbap == &ip->i_din1->di_db[0] && i < ssize) + softdep_setup_allocdirect(ip, start_lbn + i, + blkno, *bap, fs->fs_bsize, fs->fs_bsize, + buflist->bs_children[i]); + else + softdep_setup_allocindir_page(ip, start_lbn + i, + i < ssize ? sbp : ebp, soff + i, blkno, + *bap, buflist->bs_children[i]); + } + *bap++ = blkno; + } + /* + * Next we must write out the modified inode and indirect blocks. + * For strict correctness, the writes should be synchronous since + * the old block values may have been written to disk. In practise + * they are almost never written, but if we are concerned about + * strict correctness, the `doasyncfree' flag should be set to zero. + * + * The test on `doasyncfree' should be changed to test a flag + * that shows whether the associated buffers and inodes have + * been written. 
The flag should be set when the cluster is + * started and cleared whenever the buffer or inode is flushed. + * We can then check below to see if it is set, and do the + * synchronous write only when it has been cleared. + */ + if (sbap != &ip->i_din1->di_db[0]) { + if (doasyncfree) + bdwrite(sbp); + else + bwrite(sbp); + } else { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (!doasyncfree) + ffs_update(vp, 1); + } + if (ssize < len) { + if (doasyncfree) + bdwrite(ebp); + else + bwrite(ebp); + } + /* + * Last, free the old blocks and assign the new blocks to the buffers. + */ +#ifdef DEBUG + if (prtrealloc) + printf("\n\tnew:"); +#endif + for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + if (!DOINGSOFTDEP(vp)) + ffs_blkfree(ump, fs, ump->um_devvp, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL); + buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); +#ifdef INVARIANTS + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 3"); +#endif +#ifdef DEBUG + if (prtrealloc) + printf(" %d,", blkno); +#endif + } +#ifdef DEBUG + if (prtrealloc) { + prtrealloc--; + printf("\n"); + } +#endif + return (0); + +fail: + if (ssize < len) + brelse(ebp); + if (sbap != &ip->i_din1->di_db[0]) + brelse(sbp); + return (ENOSPC); +} + +static int +ffs_reallocblks_ufs2(ap) + struct vop_reallocblks_args /* { + struct vnode *a_vp; + struct cluster_save *a_buflist; + } */ *ap; +{ + struct fs *fs; + struct inode *ip; + struct vnode *vp; + struct buf *sbp, *ebp; + ufs2_daddr_t *bap, *sbap, *ebap; + struct cluster_save *buflist; + struct ufsmount *ump; + ufs_lbn_t start_lbn, end_lbn; + ufs2_daddr_t soff, newblk, blkno, pref; + struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; + int i, cg, len, start_lvl, end_lvl, ssize; + + vp = ap->a_vp; + ip = VTOI(vp); + ump = ITOUMP(ip); + fs = ump->um_fs; + /* + * If we are not tracking block clusters or if we have less than 4% + * free blocks left, then do not attempt to cluster. Running with + * less than 5% free block reserve is not recommended and those that + * choose to do so do not expect to have good file layout. + */ + if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) + return (ENOSPC); + buflist = ap->a_buflist; + len = buflist->bs_nchildren; + start_lbn = buflist->bs_children[0]->b_lblkno; + end_lbn = start_lbn + len - 1; +#ifdef INVARIANTS + for (i = 0; i < len; i++) + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 1"); + for (i = 1; i < len; i++) + if (buflist->bs_children[i]->b_lblkno != start_lbn + i) + panic("ffs_reallocblks: non-logical cluster"); + blkno = buflist->bs_children[0]->b_blkno; + ssize = fsbtodb(fs, fs->fs_frag); + for (i = 1; i < len - 1; i++) + if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) + panic("ffs_reallocblks: non-physical cluster %d", i); +#endif + /* + * If the cluster crosses the boundary for the first indirect + * block, do not move anything in it. Indirect blocks are + * usually initially laid out in a position between the data + * blocks. Block reallocation would usually destroy locality by + * moving the indirect block out of the way to make room for + * data blocks if we didn't compensate here. We should also do + * this for other indirect block boundaries, but it is only + * important for the first one. 
+ */ + if (start_lbn < NDADDR && end_lbn >= NDADDR) + return (ENOSPC); + /* + * If the latest allocation is in a new cylinder group, assume that + * the filesystem has decided to move and do not force it back to + * the previous cylinder group. + */ + if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != + dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) + return (ENOSPC); + if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || + ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) + return (ENOSPC); + /* + * Get the starting offset and block map for the first block. + */ + if (start_lvl == 0) { + sbap = &ip->i_din2->di_db[0]; + soff = start_lbn; + } else { + idp = &start_ap[start_lvl - 1]; + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { + brelse(sbp); + return (ENOSPC); + } + sbap = (ufs2_daddr_t *)sbp->b_data; + soff = idp->in_off; + } + /* + * If the block range spans two block maps, get the second map. + */ + ebap = NULL; + if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { + ssize = len; + } else { +#ifdef INVARIANTS + if (start_lvl > 0 && + start_ap[start_lvl - 1].in_lbn == idp->in_lbn) + panic("ffs_reallocblk: start == end"); +#endif + ssize = len - (idp->in_off + 1); + if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) + goto fail; + ebap = (ufs2_daddr_t *)ebp->b_data; + } + /* + * Find the preferred location for the cluster. If we have not + * previously failed at this endeavor, then follow our standard + * preference calculation. If we have failed at it, then pick up + * where we last ended our search. + */ + UFS_LOCK(ump); + if (ip->i_nextclustercg == -1) + pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); + else + pref = cgdata(fs, ip->i_nextclustercg); + /* + * Search the block map looking for an allocation of the desired size. + * To avoid wasting too much time, we limit the number of cylinder + * groups that we will search. + */ + cg = dtog(fs, pref); + for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { + if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) + break; + cg += 1; + if (cg >= fs->fs_ncg) + cg = 0; + } + /* + * If we have failed in our search, record where we gave up for + * next time. Otherwise, fall back to our usual search citerion. + */ + if (newblk == 0) { + ip->i_nextclustercg = cg; + UFS_UNLOCK(ump); + goto fail; + } + ip->i_nextclustercg = -1; + /* + * We have found a new contiguous block. + * + * First we have to replace the old block pointers with the new + * block pointers in the inode and indirect blocks associated + * with the file. + */ +#ifdef DEBUG + if (prtrealloc) + printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, + (intmax_t)start_lbn, (intmax_t)end_lbn); +#endif + blkno = newblk; + for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { + if (i == ssize) { + bap = ebap; + soff = -i; + } +#ifdef INVARIANTS + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 2"); + if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) + panic("ffs_reallocblks: alloc mismatch"); +#endif +#ifdef DEBUG + if (prtrealloc) + printf(" %jd,", (intmax_t)*bap); +#endif + if (DOINGSOFTDEP(vp)) { + if (sbap == &ip->i_din2->di_db[0] && i < ssize) + softdep_setup_allocdirect(ip, start_lbn + i, + blkno, *bap, fs->fs_bsize, fs->fs_bsize, + buflist->bs_children[i]); + else + softdep_setup_allocindir_page(ip, start_lbn + i, + i < ssize ? 
sbp : ebp, soff + i, blkno, + *bap, buflist->bs_children[i]); + } + *bap++ = blkno; + } + /* + * Next we must write out the modified inode and indirect blocks. + * For strict correctness, the writes should be synchronous since + * the old block values may have been written to disk. In practise + * they are almost never written, but if we are concerned about + * strict correctness, the `doasyncfree' flag should be set to zero. + * + * The test on `doasyncfree' should be changed to test a flag + * that shows whether the associated buffers and inodes have + * been written. The flag should be set when the cluster is + * started and cleared whenever the buffer or inode is flushed. + * We can then check below to see if it is set, and do the + * synchronous write only when it has been cleared. + */ + if (sbap != &ip->i_din2->di_db[0]) { + if (doasyncfree) + bdwrite(sbp); + else + bwrite(sbp); + } else { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (!doasyncfree) + ffs_update(vp, 1); + } + if (ssize < len) { + if (doasyncfree) + bdwrite(ebp); + else + bwrite(ebp); + } + /* + * Last, free the old blocks and assign the new blocks to the buffers. + */ +#ifdef DEBUG + if (prtrealloc) + printf("\n\tnew:"); +#endif + for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { + if (!DOINGSOFTDEP(vp)) + ffs_blkfree(ump, fs, ump->um_devvp, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), + fs->fs_bsize, ip->i_number, vp->v_type, NULL); + buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); +#ifdef INVARIANTS + if (!ffs_checkblk(ip, + dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) + panic("ffs_reallocblks: unallocated block 3"); +#endif +#ifdef DEBUG + if (prtrealloc) + printf(" %jd,", (intmax_t)blkno); +#endif + } +#ifdef DEBUG + if (prtrealloc) { + prtrealloc--; + printf("\n"); + } +#endif + return (0); + +fail: + if (ssize < len) + brelse(ebp); + if (sbap != &ip->i_din2->di_db[0]) + brelse(sbp); + return (ENOSPC); +} + +/* + * Allocate an inode in the filesystem. + * + * If allocating a directory, use ffs_dirpref to select the inode. + * If allocating in a directory, the following hierarchy is followed: + * 1) allocate the preferred inode. + * 2) allocate an inode in the same cylinder group. + * 3) quadradically rehash into other cylinder groups, until an + * available inode is located. + * If no inode preference is given the following hierarchy is used + * to allocate an inode: + * 1) allocate an inode in cylinder group 0. + * 2) quadradically rehash into other cylinder groups, until an + * available inode is located. + */ +int +ffs_valloc(pvp, mode, cred, vpp) + struct vnode *pvp; + int mode; + struct ucred *cred; + struct vnode **vpp; +{ + struct inode *pip; + struct fs *fs; + struct inode *ip; + struct timespec ts; + struct ufsmount *ump; + ino_t ino, ipref; + u_int cg; + int error, error1, reclaimed; + static struct timeval lastfail; + static int curfail; + + *vpp = NULL; + pip = VTOI(pvp); + ump = ITOUMP(pip); + fs = ump->um_fs; + + UFS_LOCK(ump); + reclaimed = 0; +retry: + if (fs->fs_cstotal.cs_nifree == 0) + goto noinodes; + + if ((mode & IFMT) == IFDIR) + ipref = ffs_dirpref(pip); + else + ipref = pip->i_number; + if (ipref >= fs->fs_ncg * fs->fs_ipg) + ipref = 0; + cg = ino_to_cg(fs, ipref); + /* + * Track number of dirs created one after another + * in a same cg without intervening by files. 
+ */ + if ((mode & IFMT) == IFDIR) { + if (fs->fs_contigdirs[cg] < 255) + fs->fs_contigdirs[cg]++; + } else { + if (fs->fs_contigdirs[cg] > 0) + fs->fs_contigdirs[cg]--; + } + ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, + (allocfcn_t *)ffs_nodealloccg); + if (ino == 0) + goto noinodes; + error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp); + if (error) { + error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, + FFSV_FORCEINSMQ); + ffs_vfree(pvp, ino, mode); + if (error1 == 0) { + ip = VTOI(*vpp); + if (ip->i_mode) + goto dup_alloc; + ip->i_flag |= IN_MODIFIED; + vput(*vpp); + } + return (error); + } + ip = VTOI(*vpp); + if (ip->i_mode) { +dup_alloc: + printf("mode = 0%o, inum = %ju, fs = %s\n", + ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); + panic("ffs_valloc: dup alloc"); + } + if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ + printf("free inode %s/%lu had %ld blocks\n", + fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); + DIP_SET(ip, i_blocks, 0); + } + ip->i_flags = 0; + DIP_SET(ip, i_flags, 0); + /* + * Set up a new generation number for this inode. + */ + while (ip->i_gen == 0 || ++ip->i_gen == 0) + ip->i_gen = arc4random(); + DIP_SET(ip, i_gen, ip->i_gen); + if (fs->fs_magic == FS_UFS2_MAGIC) { + vfs_timestamp(&ts); + ip->i_din2->di_birthtime = ts.tv_sec; + ip->i_din2->di_birthnsec = ts.tv_nsec; + } + ufs_prepare_reclaim(*vpp); + ip->i_flag = 0; + (*vpp)->v_vflag = 0; + (*vpp)->v_type = VNON; + if (fs->fs_magic == FS_UFS2_MAGIC) { + (*vpp)->v_op = &ffs_vnodeops2; + ip->i_flag |= IN_UFS2; + } else { + (*vpp)->v_op = &ffs_vnodeops1; + } + return (0); +noinodes: + if (reclaimed == 0) { + reclaimed = 1; + softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); + goto retry; + } + UFS_UNLOCK(ump); + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, pip->i_number, "out of inodes"); + uprintf("\n%s: create/symlink failed, no inodes free\n", + fs->fs_fsmnt); + } + return (ENOSPC); +} + +/* + * Find a cylinder group to place a directory. + * + * The policy implemented by this algorithm is to allocate a + * directory inode in the same cylinder group as its parent + * directory, but also to reserve space for its files inodes + * and data. Restrict the number of directories which may be + * allocated one after another in the same cylinder group + * without intervening allocation of files. + * + * If we allocate a first level directory then force allocation + * in another cylinder group. + */ +static ino_t +ffs_dirpref(pip) + struct inode *pip; +{ + struct fs *fs; + int cg, prefcg, dirsize, cgsize; + u_int avgifree, avgbfree, avgndir, curdirsize; + u_int minifree, minbfree, maxndir; + u_int mincg, minndir; + u_int maxcontigdirs; + + mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); + fs = ITOFS(pip); + + avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; + + /* + * Force allocation in another cg if creating a first level dir. 
+ */ + ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); + if (ITOV(pip)->v_vflag & VV_ROOT) { + prefcg = arc4random() % fs->fs_ncg; + mincg = prefcg; + minndir = fs->fs_ipg; + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); + } + + /* + * Count various limits which used for + * optimal allocation of a directory inode. + */ + maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); + minifree = avgifree - avgifree / 4; + if (minifree < 1) + minifree = 1; + minbfree = avgbfree - avgbfree / 4; + if (minbfree < 1) + minbfree = 1; + cgsize = fs->fs_fsize * fs->fs_fpg; + dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; + curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; + if (dirsize < curdirsize) + dirsize = curdirsize; + if (dirsize <= 0) + maxcontigdirs = 0; /* dirsize overflowed */ + else + maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); + if (fs->fs_avgfpdir > 0) + maxcontigdirs = min(maxcontigdirs, + fs->fs_ipg / fs->fs_avgfpdir); + if (maxcontigdirs == 0) + maxcontigdirs = 1; + + /* + * Limit number of dirs in one cg and reserve space for + * regular files, but only if we have no deficit in + * inodes or space. + * + * We are trying to find a suitable cylinder group nearby + * our preferred cylinder group to place a new directory. + * We scan from our preferred cylinder group forward looking + * for a cylinder group that meets our criterion. If we get + * to the final cylinder group and do not find anything, + * we start scanning forwards from the beginning of the + * filesystem. While it might seem sensible to start scanning + * backwards or even to alternate looking forward and backward, + * this approach fails badly when the filesystem is nearly full. + * Specifically, we first search all the areas that have no space + * and finally try the one preceding that. We repeat this on + * every request and in the case of the final block end up + * searching the entire filesystem. By jumping to the front + * of the filesystem, our future forward searches always look + * in new cylinder groups so finds every possible block after + * one pass over the filesystem. + */ + prefcg = ino_to_cg(fs, pip->i_number); + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + /* + * This is a backstop when we have deficit in space. + */ + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + return ((ino_t)(fs->fs_ipg * cg)); + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + break; + return ((ino_t)(fs->fs_ipg * cg)); +} + +/* + * Select the desired position for the next block in a file. 
The file is + * logically divided into sections. The first section is composed of the + * direct blocks and the next fs_maxbpg blocks. Each additional section + * contains fs_maxbpg blocks. + * + * If no blocks have been allocated in the first section, the policy is to + * request a block in the same cylinder group as the inode that describes + * the file. The first indirect is allocated immediately following the last + * direct block and the data blocks for the first indirect immediately + * follow it. + * + * If no blocks have been allocated in any other section, the indirect + * block(s) are allocated in the same cylinder group as its inode in an + * area reserved immediately following the inode blocks. The policy for + * the data blocks is to place them in a cylinder group with a greater than + * average number of free blocks. An appropriate cylinder group is found + * by using a rotor that sweeps the cylinder groups. When a new group of + * blocks is needed, the sweep begins in the cylinder group following the + * cylinder group from which the previous allocation was made. The sweep + * continues until a cylinder group with greater than the average number + * of free blocks is found. If the allocation is for the first block in an + * indirect block or the previous block is a hole, then the information on + * the previous allocation is unavailable; here a best guess is made based + * on the logical block number being allocated. + * + * If a section is already partially allocated, the policy is to + * allocate blocks contiguously within the section if possible. + */ +ufs2_daddr_t +ffs_blkpref_ufs1(ip, lbn, indx, bap) + struct inode *ip; + ufs_lbn_t lbn; + int indx; + ufs1_daddr_t *bap; +{ + struct fs *fs; + u_int cg, inocg; + u_int avgbfree, startcg; + ufs2_daddr_t pref; + + KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); + fs = ITOFS(ip); + /* + * Allocation of indirect blocks is indicated by passing negative + * values in indx: -1 for single indirect, -2 for double indirect, + * -3 for triple indirect. As noted below, we attempt to allocate + * the first indirect inline with the file data. For all later + * indirect blocks, the data is often allocated in other cylinder + * groups. However to speed random file access and to speed up + * fsck, the filesystem reserves the first fs_metaspace blocks + * (typically half of fs_minfree) of the data area of each cylinder + * group to hold these later indirect blocks. + */ + inocg = ino_to_cg(fs, ip->i_number); + if (indx < 0) { + /* + * Our preference for indirect blocks is the zone at the + * beginning of the inode's cylinder group data area that + * we try to reserve for indirect blocks. + */ + pref = cgmeta(fs, inocg); + /* + * If we are allocating the first indirect block, try to + * place it immediately following the last direct block. + */ + if (indx == -1 && lbn < NDADDR + NINDIR(fs) && + ip->i_din1->di_db[NDADDR - 1] != 0) + pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag; + return (pref); + } + /* + * If we are allocating the first data block in the first indirect + * block and the indirect has been allocated in the data block area, + * try to place it immediately following the indirect block. 
+ */ + if (lbn == NDADDR) { + pref = ip->i_din1->di_ib[0]; + if (pref != 0 && pref >= cgdata(fs, inocg) && + pref < cgbase(fs, inocg + 1)) + return (pref + fs->fs_frag); + } + /* + * If we are at the beginning of a file, or we have already allocated + * the maximum number of blocks per cylinder group, or we do not + * have a block allocated immediately preceding us, then we need + * to decide where to start allocating new blocks. + */ + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + /* + * If we are allocating a directory data block, we want + * to place it in the metadata area. + */ + if ((ip->i_mode & IFMT) == IFDIR) + return (cgmeta(fs, inocg)); + /* + * Until we fill all the direct and all the first indirect's + * blocks, we try to allocate in the data area of the inode's + * cylinder group. + */ + if (lbn < NDADDR + NINDIR(fs)) + return (cgdata(fs, inocg)); + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = inocg + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, bap[indx - 1]) + 1; + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (cgdata(fs, cg)); + } + for (cg = 0; cg <= startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (cgdata(fs, cg)); + } + return (0); + } + /* + * Otherwise, we just always try to lay things out contiguously. + */ + return (bap[indx - 1] + fs->fs_frag); +} + +/* + * Same as above, but for UFS2 + */ +ufs2_daddr_t +ffs_blkpref_ufs2(ip, lbn, indx, bap) + struct inode *ip; + ufs_lbn_t lbn; + int indx; + ufs2_daddr_t *bap; +{ + struct fs *fs; + u_int cg, inocg; + u_int avgbfree, startcg; + ufs2_daddr_t pref; + + KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); + fs = ITOFS(ip); + /* + * Allocation of indirect blocks is indicated by passing negative + * values in indx: -1 for single indirect, -2 for double indirect, + * -3 for triple indirect. As noted below, we attempt to allocate + * the first indirect inline with the file data. For all later + * indirect blocks, the data is often allocated in other cylinder + * groups. However to speed random file access and to speed up + * fsck, the filesystem reserves the first fs_metaspace blocks + * (typically half of fs_minfree) of the data area of each cylinder + * group to hold these later indirect blocks. + */ + inocg = ino_to_cg(fs, ip->i_number); + if (indx < 0) { + /* + * Our preference for indirect blocks is the zone at the + * beginning of the inode's cylinder group data area that + * we try to reserve for indirect blocks. + */ + pref = cgmeta(fs, inocg); + /* + * If we are allocating the first indirect block, try to + * place it immediately following the last direct block. + */ + if (indx == -1 && lbn < NDADDR + NINDIR(fs) && + ip->i_din2->di_db[NDADDR - 1] != 0) + pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag; + return (pref); + } + /* + * If we are allocating the first data block in the first indirect + * block and the indirect has been allocated in the data block area, + * try to place it immediately following the indirect block. 
+ */ + if (lbn == NDADDR) { + pref = ip->i_din2->di_ib[0]; + if (pref != 0 && pref >= cgdata(fs, inocg) && + pref < cgbase(fs, inocg + 1)) + return (pref + fs->fs_frag); + } + /* + * If we are at the beginning of a file, or we have already allocated + * the maximum number of blocks per cylinder group, or we do not + * have a block allocated immediately preceding us, then we need + * to decide where to start allocating new blocks. + */ + if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { + /* + * If we are allocating a directory data block, we want + * to place it in the metadata area. + */ + if ((ip->i_mode & IFMT) == IFDIR) + return (cgmeta(fs, inocg)); + /* + * Until we fill all the direct and all the first indirect's + * blocks, we try to allocate in the data area of the inode's + * cylinder group. + */ + if (lbn < NDADDR + NINDIR(fs)) + return (cgdata(fs, inocg)); + /* + * Find a cylinder with greater than average number of + * unused data blocks. + */ + if (indx == 0 || bap[indx - 1] == 0) + startcg = inocg + lbn / fs->fs_maxbpg; + else + startcg = dtog(fs, bap[indx - 1]) + 1; + startcg %= fs->fs_ncg; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + for (cg = startcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (cgdata(fs, cg)); + } + for (cg = 0; cg <= startcg; cg++) + if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + fs->fs_cgrotor = cg; + return (cgdata(fs, cg)); + } + return (0); + } + /* + * Otherwise, we just always try to lay things out contiguously. + */ + return (bap[indx - 1] + fs->fs_frag); +} + +/* + * Implement the cylinder overflow algorithm. + * + * The policy implemented by this algorithm is: + * 1) allocate the block in its requested cylinder group. + * 2) quadradically rehash on the cylinder group number. + * 3) brute force search for a free block. + * + * Must be called with the UFS lock held. Will release the lock on success + * and return with it held on failure. + */ +/*VARARGS5*/ +static ufs2_daddr_t +ffs_hashalloc(ip, cg, pref, size, rsize, allocator) + struct inode *ip; + u_int cg; + ufs2_daddr_t pref; + int size; /* Search size for data blocks, mode for inodes */ + int rsize; /* Real allocated size. */ + allocfcn_t *allocator; +{ + struct fs *fs; + ufs2_daddr_t result; + u_int i, icg = cg; + + mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); +#ifdef INVARIANTS + if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) + panic("ffs_hashalloc: allocation on suspended filesystem"); +#endif + fs = ITOFS(ip); + /* + * 1: preferred cylinder group + */ + result = (*allocator)(ip, cg, pref, size, rsize); + if (result) + return (result); + /* + * 2: quadratic rehash + */ + for (i = 1; i < fs->fs_ncg; i *= 2) { + cg += i; + if (cg >= fs->fs_ncg) + cg -= fs->fs_ncg; + result = (*allocator)(ip, cg, 0, size, rsize); + if (result) + return (result); + } + /* + * 3: brute force search + * Note that we start at i == 2, since 0 was checked initially, + * and 1 is always checked in the quadratic rehash. + */ + cg = (icg + 2) % fs->fs_ncg; + for (i = 2; i < fs->fs_ncg; i++) { + result = (*allocator)(ip, cg, 0, size, rsize); + if (result) + return (result); + cg++; + if (cg == fs->fs_ncg) + cg = 0; + } + return (0); +} + +/* + * Determine whether a fragment can be extended. + * + * Check to see if the necessary fragments are available, and + * if they are, allocate them. 
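+ */
+
+/*
+ * Editor's note: the sketch below is not part of the original file.
+ * It isolates the block-boundary test used by ffs_fragextend(): the
+ * old fragments plus the requested extension must stay within one
+ * filesystem block, which is detected by comparing the fragment
+ * offsets of the first and last fragment. frag_fits_example() and its
+ * plain integer parameters are hypothetical simplifications; "frag"
+ * stands for the number of fragments per block.
+ */
+static int
+frag_fits_example(long bprev, int frag, int newfrags)
+{
+	int first = (int)(bprev % frag);		  /* offset of bprev */
+	int last = (int)((bprev + newfrags - 1) % frag);  /* offset of end */
+
+	/* Crossing a block boundary makes the offset wrap around. */
+	return (first <= last);
+}
+
+/*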
+ */ +static ufs2_daddr_t +ffs_fragextend(ip, cg, bprev, osize, nsize) + struct inode *ip; + u_int cg; + ufs2_daddr_t bprev; + int osize, nsize; +{ + struct fs *fs; + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + int nffree; + long bno; + int frags, bbase; + int i, error; + u_int8_t *blksfree; + + ump = ITOUMP(ip); + fs = ump->um_fs; + if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) + return (0); + frags = numfrags(fs, nsize); + bbase = fragnum(fs, bprev); + if (bbase > fragnum(fs, (bprev + frags - 1))) { + /* cannot extend across a block boundary */ + return (0); + } + UFS_UNLOCK(ump); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) + goto fail; + bp->b_xflags |= BX_BKGRDWRITE; + cgp->cg_old_time = cgp->cg_time = time_second; + bno = dtogd(fs, bprev); + blksfree = cg_blksfree(cgp); + for (i = numfrags(fs, osize); i < frags; i++) + if (isclr(blksfree, bno + i)) + goto fail; + /* + * the current fragment can be extended + * deduct the count on fragment being extended into + * increase the count on the remaining fragment (if any) + * allocate the extended piece + */ + for (i = frags; i < fs->fs_frag - bbase; i++) + if (isclr(blksfree, bno + i)) + break; + cgp->cg_frsum[i - numfrags(fs, osize)]--; + if (i != frags) + cgp->cg_frsum[i - frags]++; + for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { + clrbit(blksfree, bno + i); + cgp->cg_cs.cs_nffree--; + nffree++; + } + UFS_LOCK(ump); + fs->fs_cstotal.cs_nffree -= nffree; + fs->fs_cs(fs, cg).cs_nffree -= nffree; + fs->fs_fmod = 1; + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + if (DOINGSOFTDEP(ITOV(ip))) + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, + frags, numfrags(fs, osize)); + bdwrite(bp); + return (bprev); + +fail: + brelse(bp); + UFS_LOCK(ump); + return (0); + +} + +/* + * Determine whether a block can be allocated. + * + * Check to see if a block of the appropriate size is available, + * and if it is, allocate it. 
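+ */
+
+/*
+ * Editor's note: the sketch below is not part of the original file.
+ * It models the cg_frsum[] lookup done by ffs_alloccg(): frsum[i]
+ * counts the free runs of exactly i fragments in the cylinder group,
+ * so the allocator takes the smallest run size that satisfies the
+ * request and otherwise falls back to carving up a whole block.
+ * pick_run_size_example() is a hypothetical helper.
+ */
+static int
+pick_run_size_example(const int *frsum, int needed, int frags_per_block)
+{
+	int allocsiz;
+
+	for (allocsiz = needed; allocsiz < frags_per_block; allocsiz++)
+		if (frsum[allocsiz] != 0)
+			return (allocsiz);	/* smallest adequate run */
+	return (frags_per_block);	/* no run fits; split a full block */
+}
+
+/*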
+ */ +static ufs2_daddr_t +ffs_alloccg(ip, cg, bpref, size, rsize) + struct inode *ip; + u_int cg; + ufs2_daddr_t bpref; + int size; + int rsize; +{ + struct fs *fs; + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + ufs1_daddr_t bno; + ufs2_daddr_t blkno; + int i, allocsiz, error, frags; + u_int8_t *blksfree; + + ump = ITOUMP(ip); + fs = ump->um_fs; + if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) + return (0); + UFS_UNLOCK(ump); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) + goto fail; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || + (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) + goto fail; + bp->b_xflags |= BX_BKGRDWRITE; + cgp->cg_old_time = cgp->cg_time = time_second; + if (size == fs->fs_bsize) { + UFS_LOCK(ump); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + bdwrite(bp); + return (blkno); + } + /* + * check to see if any fragments are already available + * allocsiz is the size which will be allocated, hacking + * it down to a smaller size if necessary + */ + blksfree = cg_blksfree(cgp); + frags = numfrags(fs, size); + for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) + if (cgp->cg_frsum[allocsiz] != 0) + break; + if (allocsiz == fs->fs_frag) { + /* + * no fragments were available, so a block will be + * allocated, and hacked up + */ + if (cgp->cg_cs.cs_nbfree == 0) + goto fail; + UFS_LOCK(ump); + blkno = ffs_alloccgblk(ip, bp, bpref, rsize); + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + bdwrite(bp); + return (blkno); + } + KASSERT(size == rsize, + ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); + bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); + if (bno < 0) + goto fail; + for (i = 0; i < frags; i++) + clrbit(blksfree, bno + i); + cgp->cg_cs.cs_nffree -= frags; + cgp->cg_frsum[allocsiz]--; + if (frags != allocsiz) + cgp->cg_frsum[allocsiz - frags]++; + UFS_LOCK(ump); + fs->fs_cstotal.cs_nffree -= frags; + fs->fs_cs(fs, cg).cs_nffree -= frags; + fs->fs_fmod = 1; + blkno = cgbase(fs, cg) + bno; + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + if (DOINGSOFTDEP(ITOV(ip))) + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); + bdwrite(bp); + return (blkno); + +fail: + brelse(bp); + UFS_LOCK(ump); + return (0); +} + +/* + * Allocate a block in a cylinder group. + * + * This algorithm implements the following policy: + * 1) allocate the requested block. + * 2) allocate a rotationally optimal block in the same cylinder. + * 3) allocate the next available block on the block rotor for the + * specified cylinder group. + * Note that this routine only allocates fs_bsize blocks; these + * blocks may be fragmented by the routine that allocates them. 
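+ */
+
+/*
+ * Editor's note: the sketch below is not part of the original file.
+ * It is a toy model of the policy ffs_alloccgblk() implements: hand
+ * out the preferred block if it is free, otherwise take the next free
+ * block found while sweeping from the cylinder group's rotor. The
+ * one-byte-per-block bitmap (1 = free) and pick_block_example() are
+ * hypothetical simplifications of the real packed cg maps and ignore
+ * the metadata/data zone mapping done by the kernel code.
+ */
+static int
+pick_block_example(const unsigned char *blkfree, int nblocks, int pref,
+    int rotor)
+{
+	int i, bno;
+
+	if (pref >= 0 && pref < nblocks && blkfree[pref])
+		return (pref);			/* requested block is free */
+	for (i = 0; i < nblocks; i++) {		/* sweep from the rotor */
+		bno = (rotor + i) % nblocks;
+		if (blkfree[bno])
+			return (bno);
+	}
+	return (-1);				/* cylinder group is full */
+}
+
+/*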
+ */ +static ufs2_daddr_t +ffs_alloccgblk(ip, bp, bpref, size) + struct inode *ip; + struct buf *bp; + ufs2_daddr_t bpref; + int size; +{ + struct fs *fs; + struct cg *cgp; + struct ufsmount *ump; + ufs1_daddr_t bno; + ufs2_daddr_t blkno; + u_int8_t *blksfree; + int i, cgbpref; + + ump = ITOUMP(ip); + fs = ump->um_fs; + mtx_assert(UFS_MTX(ump), MA_OWNED); + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + if (bpref == 0) { + bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; + } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { + /* map bpref to correct zone in this cg */ + if (bpref < cgdata(fs, cgbpref)) + bpref = cgmeta(fs, cgp->cg_cgx); + else + bpref = cgdata(fs, cgp->cg_cgx); + } + /* + * if the requested block is available, use it + */ + bno = dtogd(fs, blknum(fs, bpref)); + if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) + goto gotit; + /* + * Take the next available block in this cylinder group. + */ + bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); + if (bno < 0) + return (0); + /* Update cg_rotor only if allocated from the data zone */ + if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) + cgp->cg_rotor = bno; +gotit: + blkno = fragstoblks(fs, bno); + ffs_clrblock(fs, blksfree, (long)blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + cgp->cg_cs.cs_nbfree--; + fs->fs_cstotal.cs_nbfree--; + fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; + fs->fs_fmod = 1; + blkno = cgbase(fs, cgp->cg_cgx) + bno; + /* + * If the caller didn't want the whole block free the frags here. + */ + size = numfrags(fs, size); + if (size != fs->fs_frag) { + bno = dtogd(fs, blkno); + for (i = size; i < fs->fs_frag; i++) + setbit(blksfree, bno + i); + i = fs->fs_frag - size; + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; + fs->fs_fmod = 1; + cgp->cg_frsum[i]++; + } + /* XXX Fixme. */ + UFS_UNLOCK(ump); + if (DOINGSOFTDEP(ITOV(ip))) + softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, + size, 0); + UFS_LOCK(ump); + return (blkno); +} + +/* + * Determine whether a cluster can be allocated. + * + * We do not currently check for optimal rotational layout if there + * are multiple choices in the same cylinder group. Instead we just + * take the first one that we find following bpref. + */ +static ufs2_daddr_t +ffs_clusteralloc(ip, cg, bpref, len) + struct inode *ip; + u_int cg; + ufs2_daddr_t bpref; + int len; +{ + struct fs *fs; + struct cg *cgp; + struct buf *bp; + struct ufsmount *ump; + int i, run, bit, map, got; + ufs2_daddr_t bno; + u_char *mapp; + int32_t *lp; + u_int8_t *blksfree; + + ump = ITOUMP(ip); + fs = ump->um_fs; + if (fs->fs_maxcluster[cg] < len) + return (0); + UFS_UNLOCK(ump); + if (bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, + NOCRED, &bp)) + goto fail_lock; + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) + goto fail_lock; + bp->b_xflags |= BX_BKGRDWRITE; + /* + * Check to see if a cluster of the needed size (or bigger) is + * available in this cylinder group. + */ + lp = &cg_clustersum(cgp)[len]; + for (i = len; i <= fs->fs_contigsumsize; i++) + if (*lp++ > 0) + break; + if (i > fs->fs_contigsumsize) { + /* + * This is the first time looking for a cluster in this + * cylinder group. Update the cluster summary information + * to reflect the true maximum sized cluster so that + * future cluster allocation requests can avoid reading + * the cylinder group map only to find no clusters. 
+ */ + lp = &cg_clustersum(cgp)[len - 1]; + for (i = len - 1; i > 0; i--) + if (*lp-- > 0) + break; + UFS_LOCK(ump); + fs->fs_maxcluster[cg] = i; + goto fail; + } + /* + * Search the cluster map to find a big enough cluster. + * We take the first one that we find, even if it is larger + * than we need as we prefer to get one close to the previous + * block allocation. We do not search before the current + * preference point as we do not want to allocate a block + * that is allocated before the previous one (as we will + * then have to wait for another pass of the elevator + * algorithm before it will be read). We prefer to fail and + * be recalled to try an allocation in the next cylinder group. + */ + if (dtog(fs, bpref) != cg) + bpref = cgdata(fs, cg); + else + bpref = blknum(fs, bpref); + bpref = fragstoblks(fs, dtogd(fs, bpref)); + mapp = &cg_clustersfree(cgp)[bpref / NBBY]; + map = *mapp++; + bit = 1 << (bpref % NBBY); + for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { + if ((map & bit) == 0) { + run = 0; + } else { + run++; + if (run == len) + break; + } + if ((got & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + if (got >= cgp->cg_nclusterblks) + goto fail_lock; + /* + * Allocate the cluster that we have found. + */ + blksfree = cg_blksfree(cgp); + for (i = 1; i <= len; i++) + if (!ffs_isblock(fs, blksfree, got - run + i)) + panic("ffs_clusteralloc: map mismatch"); + bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); + if (dtog(fs, bno) != cg) + panic("ffs_clusteralloc: allocated out of group"); + len = blkstofrags(fs, len); + UFS_LOCK(ump); + for (i = 0; i < len; i += fs->fs_frag) + if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) + panic("ffs_clusteralloc: lost block"); + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + bdwrite(bp); + return (bno); + +fail_lock: + UFS_LOCK(ump); +fail: + brelse(bp); + return (0); +} + +static inline struct buf * +getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) +{ + struct fs *fs; + + fs = ITOFS(ip); + return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, + cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, + gbflags)); +} + +/* + * Synchronous inode initialization is needed only when barrier writes do not + * work as advertised, and will impose a heavy cost on file creation in a newly + * created filesystem. + */ +static int doasyncinodeinit = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, + &doasyncinodeinit, 0, + "Perform inode block initialization using asynchronous writes"); + +/* + * Determine whether an inode can be allocated. + * + * Check to see if an inode is available, and if it is, + * allocate it using the following policy: + * 1) allocate the requested inode. + * 2) allocate the next available inode after the requested + * inode in the specified cylinder group. 
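+ *
+ * For example, if ipref names an inode that is already in use, the code
+ * below falls back to scanning the cg_inosused() bitmap from cg_irotor
+ * with memcchr() for the first byte that is not 0xff, i.e. the first
+ * byte that still contains a free inode.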
+ */ +static ufs2_daddr_t +ffs_nodealloccg(ip, cg, ipref, mode, unused) + struct inode *ip; + u_int cg; + ufs2_daddr_t ipref; + int mode; + int unused; +{ + struct fs *fs; + struct cg *cgp; + struct buf *bp, *ibp; + struct ufsmount *ump; + u_int8_t *inosused, *loc; + struct ufs2_dinode *dp2; + int error, start, len, i; + u_int32_t old_initediblk; + + ump = ITOUMP(ip); + fs = ump->um_fs; +check_nifree: + if (fs->fs_cs(fs, cg).cs_nifree == 0) + return (0); + UFS_UNLOCK(ump); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) { + brelse(bp); + UFS_LOCK(ump); + return (0); + } + cgp = (struct cg *)bp->b_data; +restart: + if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { + brelse(bp); + UFS_LOCK(ump); + return (0); + } + bp->b_xflags |= BX_BKGRDWRITE; + inosused = cg_inosused(cgp); + if (ipref) { + ipref %= fs->fs_ipg; + if (isclr(inosused, ipref)) + goto gotit; + } + start = cgp->cg_irotor / NBBY; + len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); + loc = memcchr(&inosused[start], 0xff, len); + if (loc == NULL) { + len = start + 1; + start = 0; + loc = memcchr(&inosused[start], 0xff, len); + if (loc == NULL) { + printf("cg = %d, irotor = %ld, fs = %s\n", + cg, (long)cgp->cg_irotor, fs->fs_fsmnt); + panic("ffs_nodealloccg: map corrupted"); + /* NOTREACHED */ + } + } + ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; +gotit: + /* + * Check to see if we need to initialize more inodes. + */ + if (fs->fs_magic == FS_UFS2_MAGIC && + ipref + INOPB(fs) > cgp->cg_initediblk && + cgp->cg_initediblk < cgp->cg_niblk) { + old_initediblk = cgp->cg_initediblk; + + /* + * Free the cylinder group lock before writing the + * initialized inode block. Entering the + * babarrierwrite() with the cylinder group lock + * causes lock order violation between the lock and + * snaplk. + * + * Another thread can decide to initialize the same + * inode block, but whichever thread first gets the + * cylinder group lock after writing the newly + * allocated inode block will update it and the other + * will realize that it has lost and leave the + * cylinder group unchanged. + */ + ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); + brelse(bp); + if (ibp == NULL) { + /* + * The inode block buffer is already owned by + * another thread, which must initialize it. + * Wait on the buffer to allow another thread + * to finish the updates, with dropped cg + * buffer lock, then retry. + */ + ibp = getinobuf(ip, cg, old_initediblk, 0); + brelse(ibp); + UFS_LOCK(ump); + goto check_nifree; + } + bzero(ibp->b_data, (int)fs->fs_bsize); + dp2 = (struct ufs2_dinode *)(ibp->b_data); + for (i = 0; i < INOPB(fs); i++) { + while (dp2->di_gen == 0) + dp2->di_gen = arc4random(); + dp2++; + } + + /* + * Rather than adding a soft updates dependency to ensure + * that the new inode block is written before it is claimed + * by the cylinder group map, we just do a barrier write + * here. The barrier write will ensure that the inode block + * gets written before the updated cylinder group map can be + * written. The barrier write should only slow down bulk + * loading of newly created filesystems. + */ + if (doasyncinodeinit) + babarrierwrite(ibp); + else + bwrite(ibp); + + /* + * After the inode block is written, try to update the + * cg initediblk pointer. If another thread beat us + * to it, then leave it unchanged as the other thread + * has already set it correctly. 
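+ *
+ * Concretely: the cylinder group buffer is re-read below, cg_initediblk
+ * is advanced by INOPB(fs) only if it still equals old_initediblk, and
+ * the allocation is then retried from "restart" with the updated map.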
+ */ + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + UFS_LOCK(ump); + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + if (error != 0) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (cgp->cg_initediblk == old_initediblk) + cgp->cg_initediblk += INOPB(fs); + goto restart; + } + cgp->cg_old_time = cgp->cg_time = time_second; + cgp->cg_irotor = ipref; + UFS_LOCK(ump); + ACTIVECLEAR(fs, cg); + setbit(inosused, ipref); + cgp->cg_cs.cs_nifree--; + fs->fs_cstotal.cs_nifree--; + fs->fs_cs(fs, cg).cs_nifree--; + fs->fs_fmod = 1; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir++; + fs->fs_cstotal.cs_ndir++; + fs->fs_cs(fs, cg).cs_ndir++; + } + UFS_UNLOCK(ump); + if (DOINGSOFTDEP(ITOV(ip))) + softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); + bdwrite(bp); + return ((ino_t)(cg * fs->fs_ipg + ipref)); +} + +/* + * Free a block or fragment. + * + * The specified block or fragment is placed back in the + * free map. If a fragment is deallocated, a possible + * block reassembly is checked. + */ +static void +ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) + struct ufsmount *ump; + struct fs *fs; + struct vnode *devvp; + ufs2_daddr_t bno; + long size; + ino_t inum; + struct workhead *dephd; +{ + struct mount *mp; + struct cg *cgp; + struct buf *bp; + ufs1_daddr_t fragno, cgbno; + ufs2_daddr_t cgblkno; + int i, blk, frags, bbase; + u_int cg; + u_int8_t *blksfree; + struct cdev *dev; + + cg = dtog(fs, bno); + if (devvp->v_type == VREG) { + /* devvp is a snapshot */ + MPASS(devvp->v_mount->mnt_data == ump); + dev = ump->um_devvp->v_rdev; + cgblkno = fragstoblks(fs, cgtod(fs, cg)); + } else if (devvp->v_type == VCHR) { + /* devvp is a normal disk device */ + dev = devvp->v_rdev; + cgblkno = fsbtodb(fs, cgtod(fs, cg)); + ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg"); + } else + return; +#ifdef INVARIANTS + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || + fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { + printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", + devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, + size, fs->fs_fsmnt); + panic("ffs_blkfree_cg: bad size"); + } +#endif + if ((u_int)bno >= fs->fs_size) { + printf("bad block %jd, ino %lu\n", (intmax_t)bno, + (u_long)inum); + ffs_fserr(fs, inum, "bad block"); + return; + } + if (bread(devvp, cgblkno, (int)fs->fs_cgsize, NOCRED, &bp)) { + brelse(bp); + return; + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return; + } + bp->b_xflags |= BX_BKGRDWRITE; + cgp->cg_old_time = cgp->cg_time = time_second; + cgbno = dtogd(fs, bno); + blksfree = cg_blksfree(cgp); + UFS_LOCK(ump); + if (size == fs->fs_bsize) { + fragno = fragstoblks(fs, cgbno); + if (!ffs_isfreeblock(fs, blksfree, fragno)) { + if (devvp->v_type == VREG) { + UFS_UNLOCK(ump); + /* devvp is a snapshot */ + brelse(bp); + return; + } + printf("dev = %s, block = %jd, fs = %s\n", + devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); + panic("ffs_blkfree_cg: freeing free block"); + } + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + } else { + bbase = cgbno - fragnum(fs, cgbno); + /* + * decrement the counts associated with the old frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); + /* + * deallocate the fragment + */ + frags = numfrags(fs, size); + for (i = 0; i < frags; i++) { + if (isset(blksfree, 
cgbno + i)) { + printf("dev = %s, block = %jd, fs = %s\n", + devtoname(dev), (intmax_t)(bno + i), + fs->fs_fsmnt); + panic("ffs_blkfree_cg: freeing free frag"); + } + setbit(blksfree, cgbno + i); + } + cgp->cg_cs.cs_nffree += i; + fs->fs_cstotal.cs_nffree += i; + fs->fs_cs(fs, cg).cs_nffree += i; + /* + * add back in counts associated with the new frags + */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + /* + * if a complete block has been reassembled, account for it + */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + cgp->cg_cs.cs_nffree -= fs->fs_frag; + fs->fs_cstotal.cs_nffree -= fs->fs_frag; + fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, 1); + cgp->cg_cs.cs_nbfree++; + fs->fs_cstotal.cs_nbfree++; + fs->fs_cs(fs, cg).cs_nbfree++; + } + } + fs->fs_fmod = 1; + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + mp = UFSTOVFS(ump); + if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) + softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, + numfrags(fs, size), dephd); + bdwrite(bp); +} + +struct ffs_blkfree_trim_params { + struct task task; + struct ufsmount *ump; + struct vnode *devvp; + ufs2_daddr_t bno; + long size; + ino_t inum; + struct workhead *pdephd; + struct workhead dephd; +}; + +static void +ffs_blkfree_trim_task(ctx, pending) + void *ctx; + int pending; +{ + struct ffs_blkfree_trim_params *tp; + + tp = ctx; + ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size, + tp->inum, tp->pdephd); + vn_finished_secondary_write(UFSTOVFS(tp->ump)); + atomic_add_int(&tp->ump->um_trim_inflight, -1); + free(tp, M_TEMP); +} + +static void +ffs_blkfree_trim_completed(bip) + struct bio *bip; +{ + struct ffs_blkfree_trim_params *tp; + + tp = bip->bio_caller2; + g_destroy_bio(bip); + TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); + taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); +} + +void +ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd) + struct ufsmount *ump; + struct fs *fs; + struct vnode *devvp; + ufs2_daddr_t bno; + long size; + ino_t inum; + enum vtype vtype; + struct workhead *dephd; +{ + struct mount *mp; + struct bio *bip; + struct ffs_blkfree_trim_params *tp; + + /* + * Check to see if a snapshot wants to claim the block. + * Check that devvp is a normal disk device, not a snapshot, + * it has a snapshot(s) associated with it, and one of the + * snapshots wants to claim the block. + */ + if (devvp->v_type == VCHR && + (devvp->v_vflag & VV_COPYONWRITE) && + ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { + return; + } + /* + * Nothing to delay if TRIM is disabled, or the operation is + * performed on the snapshot. + */ + if (!ump->um_candelete || devvp->v_type == VREG) { + ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); + return; + } + + /* + * Postpone the set of the free bit in the cg bitmap until the + * BIO_DELETE is completed. Otherwise, due to disk queue + * reordering, TRIM might be issued after we reuse the block + * and write some new data into it. 
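+ *
+ * The deferral works by packaging the free into a ffs_blkfree_trim_params
+ * structure and issuing a BIO_DELETE; its completion handler
+ * (ffs_blkfree_trim_completed) queues ffs_blkfree_trim_task() on the
+ * mount's trim taskqueue, and only that task calls ffs_blkfree_cg() to
+ * mark the blocks free in the cylinder group map.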
+ */ + atomic_add_int(&ump->um_trim_inflight, 1); + tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK); + tp->ump = ump; + tp->devvp = devvp; + tp->bno = bno; + tp->size = size; + tp->inum = inum; + if (dephd != NULL) { + LIST_INIT(&tp->dephd); + LIST_SWAP(dephd, &tp->dephd, worklist, wk_list); + tp->pdephd = &tp->dephd; + } else + tp->pdephd = NULL; + + bip = g_alloc_bio(); + bip->bio_cmd = BIO_DELETE; + bip->bio_offset = dbtob(fsbtodb(fs, bno)); + bip->bio_done = ffs_blkfree_trim_completed; + bip->bio_length = size; + bip->bio_caller2 = tp; + + mp = UFSTOVFS(ump); + vn_start_secondary_write(NULL, &mp, 0); + g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private); +} + +#ifdef INVARIANTS +/* + * Verify allocation of a block or fragment. Returns true if block or + * fragment is allocated, false if it is free. + */ +static int +ffs_checkblk(ip, bno, size) + struct inode *ip; + ufs2_daddr_t bno; + long size; +{ + struct fs *fs; + struct cg *cgp; + struct buf *bp; + ufs1_daddr_t cgbno; + int i, error, frags, free; + u_int8_t *blksfree; + + fs = ITOFS(ip); + if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { + printf("bsize = %ld, size = %ld, fs = %s\n", + (long)fs->fs_bsize, size, fs->fs_fsmnt); + panic("ffs_checkblk: bad size"); + } + if ((u_int)bno >= fs->fs_size) + panic("ffs_checkblk: bad block %jd", (intmax_t)bno); + error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, dtog(fs, bno))), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) + panic("ffs_checkblk: cg bread failed"); + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) + panic("ffs_checkblk: cg magic mismatch"); + bp->b_xflags |= BX_BKGRDWRITE; + blksfree = cg_blksfree(cgp); + cgbno = dtogd(fs, bno); + if (size == fs->fs_bsize) { + free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); + } else { + frags = numfrags(fs, size); + for (free = 0, i = 0; i < frags; i++) + if (isset(blksfree, cgbno + i)) + free++; + if (free != 0 && free != frags) + panic("ffs_checkblk: partially free fragment"); + } + brelse(bp); + return (!free); +} +#endif /* INVARIANTS */ + +/* + * Free an inode. + */ +int +ffs_vfree(pvp, ino, mode) + struct vnode *pvp; + ino_t ino; + int mode; +{ + struct ufsmount *ump; + struct inode *ip; + + if (DOINGSOFTDEP(pvp)) { + softdep_freefile(pvp, ino, mode); + return (0); + } + ip = VTOI(pvp); + ump = VFSTOUFS(pvp->v_mount); + return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); +} + +/* + * Do the actual free operation. + * The specified inode is placed back in the free map. 
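+ *
+ * Note that "freeing" here only clears the inode's bit in cg_inosused()
+ * and adjusts the per-cg and filesystem-wide inode counts; releasing the
+ * inode's data blocks is the caller's responsibility.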
+ */ +int +ffs_freefile(ump, fs, devvp, ino, mode, wkhd) + struct ufsmount *ump; + struct fs *fs; + struct vnode *devvp; + ino_t ino; + int mode; + struct workhead *wkhd; +{ + struct cg *cgp; + struct buf *bp; + ufs2_daddr_t cgbno; + int error; + u_int cg; + u_int8_t *inosused; + struct cdev *dev; + + cg = ino_to_cg(fs, ino); + if (devvp->v_type == VREG) { + /* devvp is a snapshot */ + MPASS(devvp->v_mount->mnt_data == ump); + dev = ump->um_devvp->v_rdev; + cgbno = fragstoblks(fs, cgtod(fs, cg)); + } else if (devvp->v_type == VCHR) { + /* devvp is a normal disk device */ + dev = devvp->v_rdev; + cgbno = fsbtodb(fs, cgtod(fs, cg)); + } else { + bp = NULL; + return (0); + } + if (ino >= fs->fs_ipg * fs->fs_ncg) + panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", + devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); + if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (0); + } + bp->b_xflags |= BX_BKGRDWRITE; + cgp->cg_old_time = cgp->cg_time = time_second; + inosused = cg_inosused(cgp); + ino %= fs->fs_ipg; + if (isclr(inosused, ino)) { + printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), + (uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt); + if (fs->fs_ronly == 0) + panic("ffs_freefile: freeing free inode"); + } + clrbit(inosused, ino); + if (ino < cgp->cg_irotor) + cgp->cg_irotor = ino; + cgp->cg_cs.cs_nifree++; + UFS_LOCK(ump); + fs->fs_cstotal.cs_nifree++; + fs->fs_cs(fs, cg).cs_nifree++; + if ((mode & IFMT) == IFDIR) { + cgp->cg_cs.cs_ndir--; + fs->fs_cstotal.cs_ndir--; + fs->fs_cs(fs, cg).cs_ndir--; + } + fs->fs_fmod = 1; + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) + softdep_setup_inofree(UFSTOVFS(ump), bp, + ino + cg * fs->fs_ipg, wkhd); + bdwrite(bp); + return (0); +} + +/* + * Check to see if a file is free. + */ +int +ffs_checkfreefile(fs, devvp, ino) + struct fs *fs; + struct vnode *devvp; + ino_t ino; +{ + struct cg *cgp; + struct buf *bp; + ufs2_daddr_t cgbno; + int ret; + u_int cg; + u_int8_t *inosused; + + cg = ino_to_cg(fs, ino); + if (devvp->v_type == VREG) { + /* devvp is a snapshot */ + cgbno = fragstoblks(fs, cgtod(fs, cg)); + } else if (devvp->v_type == VCHR) { + /* devvp is a normal disk device */ + cgbno = fsbtodb(fs, cgtod(fs, cg)); + } else { + return (1); + } + if (ino >= fs->fs_ipg * fs->fs_ncg) + return (1); + if (bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp)) { + brelse(bp); + return (1); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (1); + } + inosused = cg_inosused(cgp); + ino %= fs->fs_ipg; + ret = isclr(inosused, ino); + brelse(bp); + return (ret); +} + +/* + * Find a block of the specified size in the specified cylinder group. + * + * It is a panic if a request is made to find a block if none are + * available. 
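+ *
+ * The search below works in two passes: scanc() with the fragtbl[]
+ * signatures finds a byte of the free map containing a run of at least
+ * allocsiz free fragments, and the bit loop that follows sifts through
+ * that block to return the exact fragment offset.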
+ */ +static ufs1_daddr_t +ffs_mapsearch(fs, cgp, bpref, allocsiz) + struct fs *fs; + struct cg *cgp; + ufs2_daddr_t bpref; + int allocsiz; +{ + ufs1_daddr_t bno; + int start, len, loc, i; + int blk, field, subfield, pos; + u_int8_t *blksfree; + + /* + * find the fragment by searching through the free block + * map for an appropriate bit pattern + */ + if (bpref) + start = dtogd(fs, bpref) / NBBY; + else + start = cgp->cg_frotor / NBBY; + blksfree = cg_blksfree(cgp); + len = howmany(fs->fs_fpg, NBBY) - start; + loc = scanc((u_int)len, (u_char *)&blksfree[start], + fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + len = start + 1; + start = 0; + loc = scanc((u_int)len, (u_char *)&blksfree[0], + fragtbl[fs->fs_frag], + (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); + if (loc == 0) { + printf("start = %d, len = %d, fs = %s\n", + start, len, fs->fs_fsmnt); + panic("ffs_alloccg: map corrupted"); + /* NOTREACHED */ + } + } + bno = (start + len - loc) * NBBY; + cgp->cg_frotor = bno; + /* + * found the byte in the map + * sift through the bits to find the selected frag + */ + for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { + blk = blkmap(fs, blksfree, bno); + blk <<= 1; + field = around[allocsiz]; + subfield = inside[allocsiz]; + for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { + if ((blk & field) == subfield) + return (bno + pos); + field <<= 1; + subfield <<= 1; + } + } + printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); + panic("ffs_alloccg: block not in map"); + return (-1); +} + +/* + * Fserr prints the name of a filesystem with an error diagnostic. + * + * The form of the error message is: + * fs: error message + */ +void +ffs_fserr(fs, inum, cp) + struct fs *fs; + ino_t inum; + char *cp; +{ + struct thread *td = curthread; /* XXX */ + struct proc *p = td->td_proc; + + log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", + p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, + fs->fs_fsmnt, cp); +} + +/* + * This function provides the capability for the fsck program to + * update an active filesystem. Fourteen operations are provided: + * + * adjrefcnt(inode, amt) - adjusts the reference count on the + * specified inode by the specified amount. Under normal + * operation the count should always go down. Decrementing + * the count to zero will cause the inode to be freed. + * adjblkcnt(inode, amt) - adjust the number of blocks used by the + * inode by the specified amount. + * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - + * adjust the superblock summary. + * freedirs(inode, count) - directory inodes [inode..inode + count - 1] + * are marked as free. Inodes should never have to be marked + * as in use. + * freefiles(inode, count) - file inodes [inode..inode + count - 1] + * are marked as free. Inodes should never have to be marked + * as in use. + * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] + * are marked as free. Blocks should never have to be marked + * as in use. + * setflags(flags, set/clear) - the fs_flags field has the specified + * flags set (second parameter +1) or cleared (second parameter -1). + * setcwd(dirinode) - set the current directory to dirinode in the + * filesystem associated with the snapshot. + * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." + * in the current directory is oldvalue then change it to newvalue. 
+ * unlink(nameptr, oldvalue) - Verify that the inode number associated + * with nameptr in the current directory is oldvalue then unlink it. + * + * The following functions may only be used on a quiescent filesystem + * by the soft updates journal. They are not safe to be run on an active + * filesystem. + * + * setinode(inode, dip) - the specified disk inode is replaced with the + * contents pointed to by dip. + * setbufoutput(fd, flags) - output associated with the specified file + * descriptor (which must reference the character device supporting + * the filesystem) switches from using physio to running through the + * buffer cache when flags is set to 1. The descriptor reverts to + * physio for output when flags is set to zero. + */ + +static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); + +SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, + 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust number of directories"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust number of free blocks"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust number of free inodes"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust number of free frags"); + +static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR, + sysctl_ffs_fsck, "Adjust number of free clusters"); + +static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, + sysctl_ffs_fsck, "Free Range of Directory Inodes"); + +static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, + sysctl_ffs_fsck, "Free Range of File Inodes"); + +static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, + sysctl_ffs_fsck, "Free Range of Blocks"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, + sysctl_ffs_fsck, "Change Filesystem Flags"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR, + sysctl_ffs_fsck, "Set Current Working Directory"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR, + sysctl_ffs_fsck, "Change Value of .. 
Entry"); + +static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, + sysctl_ffs_fsck, "Unlink a Duplicate Name"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, + sysctl_ffs_fsck, "Update an On-Disk Inode"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, + sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); + +#define DEBUG 1 +#ifdef DEBUG +static int fsckcmds = 0; +SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); +#endif /* DEBUG */ + +static int buffered_write(struct file *, struct uio *, struct ucred *, + int, struct thread *); + +static int +sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) +{ + struct thread *td = curthread; + struct fsck_cmd cmd; + struct ufsmount *ump; + struct vnode *vp, *dvp, *fdvp; + struct inode *ip, *dp; + struct mount *mp; + struct fs *fs; + ufs2_daddr_t blkno; + long blkcnt, blksize; + struct file *fp, *vfp; + cap_rights_t rights; + int filetype, error; + static struct fileops *origops, bufferedops; + + if (req->newlen > sizeof cmd) + return (EBADRPC); + if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) + return (error); + if (cmd.version != FFS_CMD_VERSION) + return (ERPCMISMATCH); + if ((error = getvnode(td, cmd.handle, + cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) + return (error); + vp = fp->f_data; + if (vp->v_type != VREG && vp->v_type != VDIR) { + fdrop(fp, td); + return (EINVAL); + } + vn_start_write(vp, &mp, V_WAIT); + if (mp == NULL || + strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { + vn_finished_write(mp); + fdrop(fp, td); + return (EINVAL); + } + ump = VFSTOUFS(mp); + if ((mp->mnt_flag & MNT_RDONLY) && + ump->um_fsckpid != td->td_proc->p_pid) { + vn_finished_write(mp); + fdrop(fp, td); + return (EROFS); + } + fs = ump->um_fs; + filetype = IFREG; + + switch (oidp->oid_number) { + + case FFS_SET_FLAGS: +#ifdef DEBUG + if (fsckcmds) + printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, + cmd.size > 0 ? "set" : "clear"); +#endif /* DEBUG */ + if (cmd.size > 0) + fs->fs_flags |= (long)cmd.value; + else + fs->fs_flags &= ~(long)cmd.value; + break; + + case FFS_ADJ_REFCNT: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust inode %jd link count by %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, + (intmax_t)cmd.size); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) + break; + ip = VTOI(vp); + ip->i_nlink += cmd.size; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_effnlink += cmd.size; + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + vput(vp); + break; + + case FFS_ADJ_BLKCNT: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust inode %jd block count by %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, + (intmax_t)cmd.size); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) + break; + ip = VTOI(vp); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); + vput(vp); + break; + + case FFS_DIR_FREE: + filetype = IFDIR; + /* fall through */ + + case FFS_FILE_FREE: +#ifdef DEBUG + if (fsckcmds) { + if (cmd.size == 1) + printf("%s: free %s inode %ju\n", + mp->mnt_stat.f_mntonname, + filetype == IFDIR ? "directory" : "file", + (uintmax_t)cmd.value); + else + printf("%s: free %s inodes %ju-%ju\n", + mp->mnt_stat.f_mntonname, + filetype == IFDIR ? 
"directory" : "file", + (uintmax_t)cmd.value, + (uintmax_t)(cmd.value + cmd.size - 1)); + } +#endif /* DEBUG */ + while (cmd.size > 0) { + if ((error = ffs_freefile(ump, fs, ump->um_devvp, + cmd.value, filetype, NULL))) + break; + cmd.size -= 1; + cmd.value += 1; + } + break; + + case FFS_BLK_FREE: +#ifdef DEBUG + if (fsckcmds) { + if (cmd.size == 1) + printf("%s: free block %jd\n", + mp->mnt_stat.f_mntonname, + (intmax_t)cmd.value); + else + printf("%s: free blocks %jd-%jd\n", + mp->mnt_stat.f_mntonname, + (intmax_t)cmd.value, + (intmax_t)cmd.value + cmd.size - 1); + } +#endif /* DEBUG */ + blkno = cmd.value; + blkcnt = cmd.size; + blksize = fs->fs_frag - (blkno % fs->fs_frag); + while (blkcnt > 0) { + if (blksize > blkcnt) + blksize = blkcnt; + ffs_blkfree(ump, fs, ump->um_devvp, blkno, + blksize * fs->fs_fsize, ROOTINO, VDIR, NULL); + blkno += blksize; + blkcnt -= blksize; + blksize = fs->fs_frag; + } + break; + + /* + * Adjust superblock summaries. fsck(8) is expected to + * submit deltas when necessary. + */ + case FFS_ADJ_NDIR: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust number of directories by %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + fs->fs_cstotal.cs_ndir += cmd.value; + break; + + case FFS_ADJ_NBFREE: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust number of free blocks by %+jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + fs->fs_cstotal.cs_nbfree += cmd.value; + break; + + case FFS_ADJ_NIFREE: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust number of free inodes by %+jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + fs->fs_cstotal.cs_nifree += cmd.value; + break; + + case FFS_ADJ_NFFREE: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust number of free frags by %+jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + fs->fs_cstotal.cs_nffree += cmd.value; + break; + + case FFS_ADJ_NUMCLUSTERS: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: adjust number of free clusters by %+jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + fs->fs_cstotal.cs_numclusters += cmd.value; + break; + + case FFS_SET_CWD: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: set current directory to inode %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) + break; + AUDIT_ARG_VNODE1(vp); + if ((error = change_dir(vp, td)) != 0) { + vput(vp); + break; + } + VOP_UNLOCK(vp, 0); + pwd_chdir(td, vp); + break; + + case FFS_SET_DOTDOT: +#ifdef DEBUG + if (fsckcmds) { + printf("%s: change .. in cwd from %jd to %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, + (intmax_t)cmd.size); + } +#endif /* DEBUG */ + /* + * First we have to get and lock the parent directory + * to which ".." points. + */ + error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); + if (error) + break; + /* + * Now we get and lock the child directory containing "..". 
+ */ + FILEDESC_SLOCK(td->td_proc->p_fd); + dvp = td->td_proc->p_fd->fd_cdir; + FILEDESC_SUNLOCK(td->td_proc->p_fd); + if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { + vput(fdvp); + break; + } + dp = VTOI(dvp); + dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ + error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, + DT_DIR, 0); + cache_purge(fdvp); + cache_purge(dvp); + vput(dvp); + vput(fdvp); + break; + + case FFS_UNLINK: +#ifdef DEBUG + if (fsckcmds) { + char buf[32]; + + if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) + strncpy(buf, "Name_too_long", 32); + printf("%s: unlink %s (inode %jd)\n", + mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); + } +#endif /* DEBUG */ + /* + * kern_unlinkat will do its own start/finish writes and + * they do not nest, so drop ours here. Setting mp == NULL + * indicates that vn_finished_write is not needed down below. + */ + vn_finished_write(mp); + mp = NULL; + error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, + UIO_USERSPACE, (ino_t)cmd.size); + break; + + case FFS_SET_INODE: + if (ump->um_fsckpid != td->td_proc->p_pid) { + error = EPERM; + break; + } +#ifdef DEBUG + if (fsckcmds) { + printf("%s: update inode %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) + break; + AUDIT_ARG_VNODE1(vp); + ip = VTOI(vp); + if (I_IS_UFS1(ip)) + error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, + sizeof(struct ufs1_dinode)); + else + error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, + sizeof(struct ufs2_dinode)); + if (error) { + vput(vp); + break; + } + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); + vput(vp); + break; + + case FFS_SET_BUFOUTPUT: + if (ump->um_fsckpid != td->td_proc->p_pid) { + error = EPERM; + break; + } + if (ITOUMP(VTOI(vp)) != ump) { + error = EINVAL; + break; + } +#ifdef DEBUG + if (fsckcmds) { + printf("%s: %s buffered output for descriptor %jd\n", + mp->mnt_stat.f_mntonname, + cmd.size == 1 ? "enable" : "disable", + (intmax_t)cmd.value); + } +#endif /* DEBUG */ + if ((error = getvnode(td, cmd.value, + cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0) + break; + if (vfp->f_vnode->v_type != VCHR) { + fdrop(vfp, td); + error = EINVAL; + break; + } + if (origops == NULL) { + origops = vfp->f_ops; + bcopy((void *)origops, (void *)&bufferedops, + sizeof(bufferedops)); + bufferedops.fo_write = buffered_write; + } + if (cmd.size == 1) + atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, + (uintptr_t)&bufferedops); + else + atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, + (uintptr_t)origops); + fdrop(vfp, td); + break; + + default: +#ifdef DEBUG + if (fsckcmds) { + printf("Invalid request %d from fsck\n", + oidp->oid_number); + } +#endif /* DEBUG */ + error = EINVAL; + break; + + } + fdrop(fp, td); + vn_finished_write(mp); + return (error); +} + +/* + * Function to switch a descriptor to use the buffer cache to stage + * its I/O. This is needed so that writes to the filesystem device + * will give snapshots a chance to copy modified blocks for which it + * needs to retain copies. + */ +static int +buffered_write(fp, uio, active_cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *active_cred; + int flags; + struct thread *td; +{ + struct vnode *devvp, *vp; + struct inode *ip; + struct buf *bp; + struct fs *fs; + struct filedesc *fdp; + int error; + daddr_t lbn; + + /* + * The devvp is associated with the /dev filesystem. 
To discover + * the filesystem with which the device is associated, we depend + * on the application setting the current directory to a location + * within the filesystem being written. Yes, this is an ugly hack. + */ + devvp = fp->f_vnode; + if (!vn_isdisk(devvp, NULL)) + return (EINVAL); + fdp = td->td_proc->p_fd; + FILEDESC_SLOCK(fdp); + vp = fdp->fd_cdir; + vref(vp); + FILEDESC_SUNLOCK(fdp); + vn_lock(vp, LK_SHARED | LK_RETRY); + /* + * Check that the current directory vnode indeed belongs to + * UFS before trying to dereference UFS-specific v_data fields. + */ + if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) { + vput(vp); + return (EINVAL); + } + ip = VTOI(vp); + if (ITODEVVP(ip) != devvp) { + vput(vp); + return (EINVAL); + } + fs = ITOFS(ip); + vput(vp); + foffset_lock_uio(fp, uio, flags); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); +#ifdef DEBUG + if (fsckcmds) { + printf("%s: buffered write for block %jd\n", + fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset)); + } +#endif /* DEBUG */ + /* + * All I/O must be contained within a filesystem block, start on + * a fragment boundary, and be a multiple of fragments in length. + */ + if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) || + fragoff(fs, uio->uio_offset) != 0 || + fragoff(fs, uio->uio_resid) != 0) { + error = EINVAL; + goto out; + } + lbn = numfrags(fs, uio->uio_offset); + bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0); + bp->b_flags |= B_RELBUF; + if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) { + brelse(bp); + goto out; + } + error = bwrite(bp); +out: + VOP_UNLOCK(devvp, 0); + foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF); + return (error); +} diff --git a/Dump/ufs/ffs/ffs_balloc.c b/Dump/ufs/ffs/ffs_balloc.c new file mode 100644 index 0000000..0aa2f40 --- /dev/null +++ b/Dump/ufs/ffs/ffs_balloc.c @@ -0,0 +1,1151 @@ +/*- + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_balloc.c 8.8 (Berkeley) 6/16/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_balloc.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Balloc defines the structure of filesystem storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + * This is the allocation strategy for UFS1. Below is + * the allocation strategy for UFS2. + */ +int +ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size, + struct ucred *cred, int flags, struct buf **bpp) +{ + struct inode *ip; + struct ufs1_dinode *dp; + ufs_lbn_t lbn, lastlbn; + struct fs *fs; + ufs1_daddr_t nb; + struct buf *bp, *nbp; + struct ufsmount *ump; + struct indir indirs[NIADDR + 2]; + int deallocated, osize, nsize, num, i, error; + ufs2_daddr_t newb; + ufs1_daddr_t *bap, pref; + ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; + ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; + int unwindidx = -1; + int saved_inbdflush; + static struct timeval lastfail; + static int curfail; + int gbflags, reclaimed; + + ip = VTOI(vp); + dp = ip->i_din1; + fs = ITOFS(ip); + ump = ITOUMP(ip); + lbn = lblkno(fs, startoffset); + size = blkoff(fs, startoffset) + size; + reclaimed = 0; + if (size > fs->fs_bsize) + panic("ffs_balloc_ufs1: blk too big"); + *bpp = NULL; + if (flags & IO_EXT) + return (EOPNOTSUPP); + if (lbn < 0) + return (EFBIG); + gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; + + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
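+ *
+ * For example, a file whose last allocated block is a 2KB fragment must
+ * have that fragment reallocated (via ffs_realloccg below) to a full
+ * fs_bsize block before any block beyond it is allocated, since only
+ * the final block of a file may be a fragment.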
+ */ + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + UFS_LOCK(ump); + error = ffs_realloccg(ip, nb, dp->di_db[nb], + ffs_blkpref_ufs1(ip, lastlbn, (int)nb, + &dp->di_db[0]), osize, (int)fs->fs_bsize, flags, + cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, nb, + dbtofsb(fs, bp->b_blkno), dp->di_db[nb], + fs->fs_bsize, osize, bp); + ip->i_size = smalllblktosize(fs, nb + 1); + dp->di_size = ip->i_size; + dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * The first NDADDR blocks are direct blocks + */ + if (lbn < NDADDR) { + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs1: BA_METAONLY for direct block"); + nb = dp->di_db[lbn]; + if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { + error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread(vp, lbn, osize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + } else { + UFS_LOCK(ump); + error = ffs_realloccg(ip, lbn, dp->di_db[lbn], + ffs_blkpref_ufs1(ip, lbn, (int)lbn, + &dp->di_db[0]), osize, nsize, flags, + cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, lbn, + dbtofsb(fs, bp->b_blkno), nb, + nsize, osize, bp); + } + } else { + if (ip->i_size < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + UFS_LOCK(ump); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, lbn, newb, 0, + nsize, 0, bp); + } + dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return(error); +#ifdef INVARIANTS + if (num < 1) + panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block"); +#endif + saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); + /* + * Fetch the first indirect block allocating if necessary. 
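+ *
+ * Every block allocated from here on is also recorded in allociblk[]
+ * and lbns[] so that, if allocation fails part way through, the fail
+ * path at the bottom of this routine can unwind and free them again.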
+ */ + --num; + nb = dp->di_ib[indirs[0].in_off]; + allocib = NULL; + allocblk = allociblk; + lbns_remfree = lbns; + if (nb == 0) { + UFS_LOCK(ump); + pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1, + (ufs1_daddr_t *)0); + if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags, cred, &newb)) != 0) { + curthread_pflags_restore(saved_inbdflush); + return (error); + } + pref = newb + fs->fs_frag; + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = indirs[1].in_lbn; + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags); + bp->b_blkno = fsbtodb(fs, nb); + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) { + softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, + newb, 0, fs->fs_bsize, 0, bp); + bdwrite(bp); + } else { + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (DOINGASYNC(vp)) + bdwrite(bp); + else if ((error = bwrite(bp)) != 0) + goto fail; + } + allocib = &dp->di_ib[indirs[0].in_off]; + *allocib = nb; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ +retry: + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + goto fail; + } + bap = (ufs1_daddr_t *)bp->b_data; + nb = bap[indirs[i].in_off]; + if (i == num) + break; + i += 1; + if (nb != 0) { + bqrelse(bp); + continue; + } + UFS_LOCK(ump); + /* + * If parent indirect has just been allocated, try to cluster + * immediately following it. + */ + if (pref == 0) + pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1, + (ufs1_daddr_t *)0); + if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | IO_BUFLOCKED, cred, &newb)) != 0) { + brelse(bp); + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { + UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, + FLUSH_BLOCKS_WAIT); + UFS_UNLOCK(ump); + goto retry; + } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } + goto fail; + } + pref = newb + fs->fs_frag; + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = indirs[i].in_lbn; + nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0); + nbp->b_blkno = fsbtodb(fs, nb); + vfs_bio_clrbuf(nbp); + if (DOINGSOFTDEP(vp)) { + softdep_setup_allocindir_meta(nbp, ip, bp, + indirs[i - 1].in_off, nb); + bdwrite(nbp); + } else { + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp); + goto fail; + } + } + bap[indirs[i - 1].in_off] = nb; + if (allocib == NULL && unwindidx < 0) + unwindidx = i - 1; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + } + /* + * If asked only for the indirect block, then return it. + */ + if (flags & BA_METAONLY) { + curthread_pflags_restore(saved_inbdflush); + *bpp = bp; + return (0); + } + /* + * Get the data block, allocating if necessary. + */ + if (nb == 0) { + UFS_LOCK(ump); + /* + * If allocating metadata at the front of the cylinder + * group and parent indirect block has just been allocated, + * then cluster next to it if it is the first indirect in + * the file. 
Otherwise it has been allocated in the metadata + * area, so we want to find our own place out in the data area. + */ + if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) + pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | IO_BUFLOCKED, cred, &newb); + if (error) { + brelse(bp); + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { + UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, + FLUSH_BLOCKS_WAIT); + UFS_UNLOCK(ump); + goto retry; + } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } + goto fail; + } + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = lbn; + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(nbp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocindir_page(ip, lbn, bp, + indirs[i].in_off, nb, 0, nbp); + bap[indirs[i].in_off] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + curthread_pflags_restore(saved_inbdflush); + *bpp = nbp; + return (0); + } + brelse(bp); + if (flags & BA_CLRBUF) { + int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; + if (seqcount != 0 && + (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && + !(vm_page_count_severe() || buf_dirty_count_severe())) { + error = cluster_read(vp, ip->i_size, lbn, + (int)fs->fs_bsize, NOCRED, + MAXBSIZE, seqcount, gbflags, &nbp); + } else { + error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED, + gbflags, &nbp); + } + if (error) { + brelse(nbp); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); + nbp->b_blkno = fsbtodb(fs, nb); + } + curthread_pflags_restore(saved_inbdflush); + *bpp = nbp; + return (0); +fail: + curthread_pflags_restore(saved_inbdflush); + /* + * If we have failed to allocate any blocks, simply return the error. + * This is the usual case and avoids the need to fsync the file. + */ + if (allocblk == allociblk && allocib == NULL && unwindidx == -1) + return (error); + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + * We have to fsync the file before we start to get rid of all + * of its dependencies so that we do not leave them dangling. + * We have to sync it at the end so that the soft updates code + * does not find any untracked changes. Although this is really + * slow, running out of disk space is not expected to be a common + * occurrence. The error return from fsync is ignored as we already + * have an error to return to the user. + * + * XXX Still have to journal the free below + */ + (void) ffs_syncvnode(vp, MNT_WAIT, 0); + for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; + blkp < allocblk; blkp++, lbns_remfree++) { + /* + * We shall not leave the freed blocks on the vnode + * buffer object lists. 
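+ * getblk() is called with GB_NOCREAT so it only returns a buffer that
+ * already exists for the failed block; any such buffer is marked
+ * B_INVAL | B_RELBUF | B_NOCACHE and released so it is discarded
+ * rather than written out.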
+ */ + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), + ("mismatch1 l %jd %jd b %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, + (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp))); + bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; + bp->b_flags &= ~(B_ASYNC | B_CACHE); + brelse(bp); + } + deallocated += fs->fs_bsize; + } + if (allocib != NULL) { + *allocib = 0; + } else if (unwindidx >= 0) { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp); + } else { + bap = (ufs1_daddr_t *)bp->b_data; + bap[indirs[unwindidx].in_off] = 0; + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + } + } + if (deallocated) { +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + dp->di_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + (void) ffs_syncvnode(vp, MNT_WAIT, 0); + /* + * After the buffers are invalidated and on-disk pointers are + * cleared, free the blocks. + */ + for (blkp = allociblk; blkp < allocblk; blkp++) { +#ifdef INVARIANTS + if (blkp == allociblk) + lbns_remfree = lbns; + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + panic("zombie1 %jd %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp)); + } + lbns_remfree++; +#endif + ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, + ip->i_number, vp->v_type, NULL); + } + return (error); +} + +/* + * Balloc defines the structure of file system storage + * by allocating the physical blocks on a device given + * the inode and the logical block number in a file. + * This is the allocation strategy for UFS2. Above is + * the allocation strategy for UFS1. + */ +int +ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size, + struct ucred *cred, int flags, struct buf **bpp) +{ + struct inode *ip; + struct ufs2_dinode *dp; + ufs_lbn_t lbn, lastlbn; + struct fs *fs; + struct buf *bp, *nbp; + struct ufsmount *ump; + struct indir indirs[NIADDR + 2]; + ufs2_daddr_t nb, newb, *bap, pref; + ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1]; + ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1]; + int deallocated, osize, nsize, num, i, error; + int unwindidx = -1; + int saved_inbdflush; + static struct timeval lastfail; + static int curfail; + int gbflags, reclaimed; + + ip = VTOI(vp); + dp = ip->i_din2; + fs = ITOFS(ip); + ump = ITOUMP(ip); + lbn = lblkno(fs, startoffset); + size = blkoff(fs, startoffset) + size; + reclaimed = 0; + if (size > fs->fs_bsize) + panic("ffs_balloc_ufs2: blk too big"); + *bpp = NULL; + if (lbn < 0) + return (EFBIG); + gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; + + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + + /* + * Check for allocating external data. + */ + if (flags & IO_EXT) { + if (lbn >= NXADDR) + return (EFBIG); + /* + * If the next write will extend the data into a new block, + * and the data is currently composed of a fragment + * this fragment has to be extended to be a full block. 
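+ *
+ * This mirrors the fragment-extension logic used for regular file data
+ * further below, but it operates on the external attribute area
+ * (di_extb[] / di_extsize) and uses negative logical block numbers
+ * (-1 - lbn) with BX_ALTDATA so these buffers stay distinct from the
+ * file's ordinary data buffers.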
+ */ + lastlbn = lblkno(fs, dp->di_extsize); + if (lastlbn < lbn) { + nb = lastlbn; + osize = sblksize(fs, dp->di_extsize, nb); + if (osize < fs->fs_bsize && osize > 0) { + UFS_LOCK(ump); + error = ffs_realloccg(ip, -1 - nb, + dp->di_extb[nb], + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + &dp->di_extb[0]), osize, + (int)fs->fs_bsize, flags, cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, nb, + dbtofsb(fs, bp->b_blkno), + dp->di_extb[nb], + fs->fs_bsize, osize, bp); + dp->di_extsize = smalllblktosize(fs, nb + 1); + dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno); + bp->b_xflags |= BX_ALTDATA; + ip->i_flag |= IN_CHANGE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * All blocks are direct blocks + */ + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs2: BA_METAONLY for ext block"); + nb = dp->di_extb[lbn]; + if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) { + error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, dp->di_extsize)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread_gb(vp, -1 - lbn, osize, NOCRED, + gbflags, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + bp->b_xflags |= BX_ALTDATA; + } else { + UFS_LOCK(ump); + error = ffs_realloccg(ip, -1 - lbn, + dp->di_extb[lbn], + ffs_blkpref_ufs2(ip, lbn, (int)lbn, + &dp->di_extb[0]), osize, nsize, flags, + cred, &bp); + if (error) + return (error); + bp->b_xflags |= BX_ALTDATA; + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, lbn, + dbtofsb(fs, bp->b_blkno), nb, + nsize, osize, bp); + } + } else { + if (dp->di_extsize < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + UFS_LOCK(ump); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]), + nsize, flags, cred, &newb); + if (error) + return (error); + bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags); + bp->b_blkno = fsbtodb(fs, newb); + bp->b_xflags |= BX_ALTDATA; + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocext(ip, lbn, newb, 0, + nsize, 0, bp); + } + dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE; + *bpp = bp; + return (0); + } + /* + * If the next write will extend the file into a new block, + * and the file is currently composed of a fragment + * this fragment has to be extended to be a full block. 
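+ *
+ * Same last-fragment extension as in ffs_balloc_ufs1() above, using the
+ * 64-bit ufs2_daddr_t block pointers of the UFS2 dinode.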
+ */ + lastlbn = lblkno(fs, ip->i_size); + if (lastlbn < NDADDR && lastlbn < lbn) { + nb = lastlbn; + osize = blksize(fs, ip, nb); + if (osize < fs->fs_bsize && osize > 0) { + UFS_LOCK(ump); + error = ffs_realloccg(ip, nb, dp->di_db[nb], + ffs_blkpref_ufs2(ip, lastlbn, (int)nb, + &dp->di_db[0]), osize, (int)fs->fs_bsize, + flags, cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, nb, + dbtofsb(fs, bp->b_blkno), + dp->di_db[nb], + fs->fs_bsize, osize, bp); + ip->i_size = smalllblktosize(fs, nb + 1); + dp->di_size = ip->i_size; + dp->di_db[nb] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (flags & IO_SYNC) + bwrite(bp); + else + bawrite(bp); + } + } + /* + * The first NDADDR blocks are direct blocks + */ + if (lbn < NDADDR) { + if (flags & BA_METAONLY) + panic("ffs_balloc_ufs2: BA_METAONLY for direct block"); + nb = dp->di_db[lbn]; + if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) { + error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED, + gbflags, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + *bpp = bp; + return (0); + } + if (nb != 0) { + /* + * Consider need to reallocate a fragment. + */ + osize = fragroundup(fs, blkoff(fs, ip->i_size)); + nsize = fragroundup(fs, size); + if (nsize <= osize) { + error = bread_gb(vp, lbn, osize, NOCRED, + gbflags, &bp); + if (error) { + brelse(bp); + return (error); + } + bp->b_blkno = fsbtodb(fs, nb); + } else { + UFS_LOCK(ump); + error = ffs_realloccg(ip, lbn, dp->di_db[lbn], + ffs_blkpref_ufs2(ip, lbn, (int)lbn, + &dp->di_db[0]), osize, nsize, flags, + cred, &bp); + if (error) + return (error); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, lbn, + dbtofsb(fs, bp->b_blkno), nb, + nsize, osize, bp); + } + } else { + if (ip->i_size < smalllblktosize(fs, lbn + 1)) + nsize = fragroundup(fs, size); + else + nsize = fs->fs_bsize; + UFS_LOCK(ump); + error = ffs_alloc(ip, lbn, + ffs_blkpref_ufs2(ip, lbn, (int)lbn, + &dp->di_db[0]), nsize, flags, cred, &newb); + if (error) + return (error); + bp = getblk(vp, lbn, nsize, 0, 0, gbflags); + bp->b_blkno = fsbtodb(fs, newb); + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocdirect(ip, lbn, newb, 0, + nsize, 0, bp); + } + dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + *bpp = bp; + return (0); + } + /* + * Determine the number of levels of indirection. + */ + pref = 0; + if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0) + return(error); +#ifdef INVARIANTS + if (num < 1) + panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block"); +#endif + saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH); + /* + * Fetch the first indirect block allocating if necessary. 
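+ * Roughly speaking, ufs_getlbns() has filled indirs[]: indirs[0].in_off
+ * names the di_ib[] slot, and each following entry gives the offset
+ * within the next indirect block on the path; num is the number of
+ * indirect levels.  As a sizing illustration, a 32K-block UFS2
+ * filesystem packs NINDIR(fs) = 32768 / sizeof(ufs2_daddr_t) = 4096
+ * pointers into each indirect block.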
+ */ + --num; + nb = dp->di_ib[indirs[0].in_off]; + allocib = NULL; + allocblk = allociblk; + lbns_remfree = lbns; + if (nb == 0) { + UFS_LOCK(ump); + pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1, + (ufs2_daddr_t *)0); + if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags, cred, &newb)) != 0) { + curthread_pflags_restore(saved_inbdflush); + return (error); + } + pref = newb + fs->fs_frag; + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = indirs[1].in_lbn; + bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, + GB_UNMAPPED); + bp->b_blkno = fsbtodb(fs, nb); + vfs_bio_clrbuf(bp); + if (DOINGSOFTDEP(vp)) { + softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off, + newb, 0, fs->fs_bsize, 0, bp); + bdwrite(bp); + } else { + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if (DOINGASYNC(vp)) + bdwrite(bp); + else if ((error = bwrite(bp)) != 0) + goto fail; + } + allocib = &dp->di_ib[indirs[0].in_off]; + *allocib = nb; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * Fetch through the indirect blocks, allocating as necessary. + */ +retry: + for (i = 1;;) { + error = bread(vp, + indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + brelse(bp); + goto fail; + } + bap = (ufs2_daddr_t *)bp->b_data; + nb = bap[indirs[i].in_off]; + if (i == num) + break; + i += 1; + if (nb != 0) { + bqrelse(bp); + continue; + } + UFS_LOCK(ump); + /* + * If parent indirect has just been allocated, try to cluster + * immediately following it. + */ + if (pref == 0) + pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1, + (ufs2_daddr_t *)0); + if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | IO_BUFLOCKED, cred, &newb)) != 0) { + brelse(bp); + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { + UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, + FLUSH_BLOCKS_WAIT); + UFS_UNLOCK(ump); + goto retry; + } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } + goto fail; + } + pref = newb + fs->fs_frag; + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = indirs[i].in_lbn; + nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, + GB_UNMAPPED); + nbp->b_blkno = fsbtodb(fs, nb); + vfs_bio_clrbuf(nbp); + if (DOINGSOFTDEP(vp)) { + softdep_setup_allocindir_meta(nbp, ip, bp, + indirs[i - 1].in_off, nb); + bdwrite(nbp); + } else { + /* + * Write synchronously so that indirect blocks + * never point at garbage. + */ + if ((error = bwrite(nbp)) != 0) { + brelse(bp); + goto fail; + } + } + bap[indirs[i - 1].in_off] = nb; + if (allocib == NULL && unwindidx < 0) + unwindidx = i - 1; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + } + /* + * If asked only for the indirect block, then return it. + */ + if (flags & BA_METAONLY) { + curthread_pflags_restore(saved_inbdflush); + *bpp = bp; + return (0); + } + /* + * Get the data block, allocating if necessary. 
+ */ + if (nb == 0) { + UFS_LOCK(ump); + /* + * If allocating metadata at the front of the cylinder + * group and parent indirect block has just been allocated, + * then cluster next to it if it is the first indirect in + * the file. Otherwise it has been allocated in the metadata + * area, so we want to find our own place out in the data area. + */ + if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0)) + pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off, + &bap[0]); + error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, + flags | IO_BUFLOCKED, cred, &newb); + if (error) { + brelse(bp); + if (DOINGSOFTDEP(vp) && ++reclaimed == 1) { + UFS_LOCK(ump); + softdep_request_cleanup(fs, vp, cred, + FLUSH_BLOCKS_WAIT); + UFS_UNLOCK(ump); + goto retry; + } + if (ppsratecheck(&lastfail, &curfail, 1)) { + ffs_fserr(fs, ip->i_number, "filesystem full"); + uprintf("\n%s: write failed, filesystem " + "is full\n", fs->fs_fsmnt); + } + goto fail; + } + nb = newb; + MPASS(allocblk < allociblk + nitems(allociblk)); + MPASS(lbns_remfree < lbns + nitems(lbns)); + *allocblk++ = nb; + *lbns_remfree++ = lbn; + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); + nbp->b_blkno = fsbtodb(fs, nb); + if (flags & BA_CLRBUF) + vfs_bio_clrbuf(nbp); + if (DOINGSOFTDEP(vp)) + softdep_setup_allocindir_page(ip, lbn, bp, + indirs[i].in_off, nb, 0, nbp); + bap[indirs[i].in_off] = nb; + /* + * If required, write synchronously, otherwise use + * delayed write. + */ + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + curthread_pflags_restore(saved_inbdflush); + *bpp = nbp; + return (0); + } + brelse(bp); + /* + * If requested clear invalid portions of the buffer. If we + * have to do a read-before-write (typical if BA_CLRBUF is set), + * try to do some read-ahead in the sequential case to reduce + * the number of I/O transactions. + */ + if (flags & BA_CLRBUF) { + int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT; + if (seqcount != 0 && + (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 && + !(vm_page_count_severe() || buf_dirty_count_severe())) { + error = cluster_read(vp, ip->i_size, lbn, + (int)fs->fs_bsize, NOCRED, + MAXBSIZE, seqcount, gbflags, &nbp); + } else { + error = bread_gb(vp, lbn, (int)fs->fs_bsize, + NOCRED, gbflags, &nbp); + } + if (error) { + brelse(nbp); + goto fail; + } + } else { + nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags); + nbp->b_blkno = fsbtodb(fs, nb); + } + curthread_pflags_restore(saved_inbdflush); + *bpp = nbp; + return (0); +fail: + curthread_pflags_restore(saved_inbdflush); + /* + * If we have failed to allocate any blocks, simply return the error. + * This is the usual case and avoids the need to fsync the file. + */ + if (allocblk == allociblk && allocib == NULL && unwindidx == -1) + return (error); + /* + * If we have failed part way through block allocation, we + * have to deallocate any indirect blocks that we have allocated. + * We have to fsync the file before we start to get rid of all + * of its dependencies so that we do not leave them dangling. + * We have to sync it at the end so that the soft updates code + * does not find any untracked changes. Although this is really + * slow, running out of disk space is not expected to be a common + * occurrence. The error return from fsync is ignored as we already + * have an error to return to the user. 
+ * + * XXX Still have to journal the free below + */ + (void) ffs_syncvnode(vp, MNT_WAIT, 0); + for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns; + blkp < allocblk; blkp++, lbns_remfree++) { + /* + * We shall not leave the freed blocks on the vnode + * buffer object lists. + */ + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + KASSERT(bp->b_blkno == fsbtodb(fs, *blkp), + ("mismatch2 l %jd %jd b %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree, + (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp))); + bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE; + bp->b_flags &= ~(B_ASYNC | B_CACHE); + brelse(bp); + } + deallocated += fs->fs_bsize; + } + if (allocib != NULL) { + *allocib = 0; + } else if (unwindidx >= 0) { + int r; + + r = bread(vp, indirs[unwindidx].in_lbn, + (int)fs->fs_bsize, NOCRED, &bp); + if (r) { + panic("Could not unwind indirect block, error %d", r); + brelse(bp); + } else { + bap = (ufs2_daddr_t *)bp->b_data; + bap[indirs[unwindidx].in_off] = 0; + if (flags & IO_SYNC) { + bwrite(bp); + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + } + } + if (deallocated) { +#ifdef QUOTA + /* + * Restore user's disk quota because allocation failed. + */ + (void) chkdq(ip, -btodb(deallocated), cred, FORCE); +#endif + dp->di_blocks -= btodb(deallocated); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + (void) ffs_syncvnode(vp, MNT_WAIT, 0); + /* + * After the buffers are invalidated and on-disk pointers are + * cleared, free the blocks. + */ + for (blkp = allociblk; blkp < allocblk; blkp++) { +#ifdef INVARIANTS + if (blkp == allociblk) + lbns_remfree = lbns; + bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, + GB_NOCREAT | GB_UNMAPPED); + if (bp != NULL) { + panic("zombie2 %jd %ju %ju", + (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno, + (uintmax_t)fsbtodb(fs, *blkp)); + } + lbns_remfree++; +#endif + ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize, + ip->i_number, vp->v_type, NULL); + } + return (error); +} diff --git a/Dump/ufs/ffs/ffs_extern.h b/Dump/ufs/ffs/ffs_extern.h new file mode 100644 index 0000000..f50b403 --- /dev/null +++ b/Dump/ufs/ffs/ffs_extern.h @@ -0,0 +1,200 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_extern.h 8.6 (Berkeley) 3/30/95 + * $FreeBSD: releng/11.2/sys/ufs/ffs/ffs_extern.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_FFS_EXTERN_H +#define _UFS_FFS_EXTERN_H + +#ifndef _KERNEL +#error "No user-serving parts inside" +#else + +struct buf; +struct cg; +struct fid; +struct fs; +struct inode; +struct malloc_type; +struct mount; +struct thread; +struct sockaddr; +struct statfs; +struct ucred; +struct vnode; +struct vop_fsync_args; +struct vop_reallocblks_args; +struct workhead; + +int ffs_alloc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, int, int, + struct ucred *, ufs2_daddr_t *); +int ffs_balloc_ufs1(struct vnode *a_vp, off_t a_startoffset, int a_size, + struct ucred *a_cred, int a_flags, struct buf **a_bpp); +int ffs_balloc_ufs2(struct vnode *a_vp, off_t a_startoffset, int a_size, + struct ucred *a_cred, int a_flags, struct buf **a_bpp); +int ffs_blkatoff(struct vnode *, off_t, char **, struct buf **); +void ffs_blkfree(struct ufsmount *, struct fs *, struct vnode *, + ufs2_daddr_t, long, ino_t, enum vtype, struct workhead *); +ufs2_daddr_t ffs_blkpref_ufs1(struct inode *, ufs_lbn_t, int, ufs1_daddr_t *); +ufs2_daddr_t ffs_blkpref_ufs2(struct inode *, ufs_lbn_t, int, ufs2_daddr_t *); +int ffs_checkfreefile(struct fs *, struct vnode *, ino_t); +void ffs_clrblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_clusteracct(struct fs *, struct cg *, ufs1_daddr_t, int); +void ffs_bdflush(struct bufobj *, struct buf *); +int ffs_copyonwrite(struct vnode *, struct buf *); +int ffs_flushfiles(struct mount *, int, struct thread *); +void ffs_fragacct(struct fs *, int, int32_t [], int); +int ffs_freefile(struct ufsmount *, struct fs *, struct vnode *, ino_t, + int, struct workhead *); +void ffs_fserr(struct fs *, ino_t, char *); +int ffs_isblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_isfreeblock(struct fs *, u_char *, ufs1_daddr_t); +void ffs_load_inode(struct buf *, struct inode *, struct fs *, ino_t); +void ffs_oldfscompat_write(struct fs *, struct ufsmount *); +int ffs_own_mount(const struct mount *mp); +int ffs_reallocblks(struct vop_reallocblks_args *); +int ffs_realloccg(struct inode *, ufs2_daddr_t, ufs2_daddr_t, + ufs2_daddr_t, int, int, int, struct ucred *, struct buf **); +int ffs_reload(struct mount *, struct thread *, int); +int ffs_sbupdate(struct ufsmount *, int, int); +void ffs_setblock(struct fs *, u_char *, ufs1_daddr_t); +int ffs_snapblkfree(struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, + enum vtype, struct workhead *); +void ffs_snapremove(struct vnode *vp); +int ffs_snapshot(struct mount *mp, char *snapfile); +void ffs_snapshot_mount(struct mount *mp); +void ffs_snapshot_unmount(struct mount *mp); +void process_deferred_inactive(struct mount *mp); +void ffs_sync_snap(struct mount *, int); +int ffs_syncvnode(struct vnode *vp, int waitfor, int flags); +int ffs_truncate(struct vnode *, off_t, int, struct ucred *); +int ffs_update(struct vnode *, int); +int ffs_valloc(struct vnode *, int, struct ucred *, 
struct vnode **); + +int ffs_vfree(struct vnode *, ino_t, int); +vfs_vget_t ffs_vget; +int ffs_vgetf(struct mount *, ino_t, int, struct vnode **, int); +void ffs_susp_initialize(void); +void ffs_susp_uninitialize(void); + +#define FFSV_FORCEINSMQ 0x0001 + +#define FFSR_FORCE 0x0001 +#define FFSR_UNSUSPEND 0x0002 + +extern struct vop_vector ffs_vnodeops1; +extern struct vop_vector ffs_fifoops1; +extern struct vop_vector ffs_vnodeops2; +extern struct vop_vector ffs_fifoops2; + +/* + * Soft update function prototypes. + */ + +int softdep_check_suspend(struct mount *, struct vnode *, + int, int, int, int); +void softdep_get_depcounts(struct mount *, int *, int *); +void softdep_initialize(void); +void softdep_uninitialize(void); +int softdep_mount(struct vnode *, struct mount *, struct fs *, + struct ucred *); +void softdep_unmount(struct mount *); +int softdep_move_dependencies(struct buf *, struct buf *); +int softdep_flushworklist(struct mount *, int *, struct thread *); +int softdep_flushfiles(struct mount *, int, struct thread *); +void softdep_update_inodeblock(struct inode *, struct buf *, int); +void softdep_load_inodeblock(struct inode *); +void softdep_freefile(struct vnode *, ino_t, int); +int softdep_request_cleanup(struct fs *, struct vnode *, + struct ucred *, int); +void softdep_setup_freeblocks(struct inode *, off_t, int); +void softdep_setup_inomapdep(struct buf *, struct inode *, ino_t, int); +void softdep_setup_blkmapdep(struct buf *, struct mount *, ufs2_daddr_t, + int, int); +void softdep_setup_allocdirect(struct inode *, ufs_lbn_t, ufs2_daddr_t, + ufs2_daddr_t, long, long, struct buf *); +void softdep_setup_allocext(struct inode *, ufs_lbn_t, ufs2_daddr_t, + ufs2_daddr_t, long, long, struct buf *); +void softdep_setup_allocindir_meta(struct buf *, struct inode *, + struct buf *, int, ufs2_daddr_t); +void softdep_setup_allocindir_page(struct inode *, ufs_lbn_t, + struct buf *, int, ufs2_daddr_t, ufs2_daddr_t, struct buf *); +void softdep_setup_blkfree(struct mount *, struct buf *, ufs2_daddr_t, int, + struct workhead *); +void softdep_setup_inofree(struct mount *, struct buf *, ino_t, + struct workhead *); +void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *); +void softdep_fsync_mountdev(struct vnode *); +int softdep_sync_metadata(struct vnode *); +int softdep_sync_buf(struct vnode *, struct buf *, int); +int softdep_fsync(struct vnode *); +int softdep_prealloc(struct vnode *, int); +int softdep_journal_lookup(struct mount *, struct vnode **); +void softdep_journal_freeblocks(struct inode *, struct ucred *, off_t, int); +void softdep_journal_fsync(struct inode *); +void softdep_buf_append(struct buf *, struct workhead *); +void softdep_inode_append(struct inode *, struct ucred *, struct workhead *); +void softdep_freework(struct workhead *); + + +/* + * Things to request flushing in softdep_request_cleanup() + */ +#define FLUSH_INODES 1 +#define FLUSH_INODES_WAIT 2 +#define FLUSH_BLOCKS 3 +#define FLUSH_BLOCKS_WAIT 4 +/* + * Flag to ffs_syncvnode() to request flushing of data only, + * but skip the ffs_update() on the inode itself. Used to avoid + * deadlock when flushing snapshot inodes while holding snaplk. + */ +#define NO_INO_UPDT 0x00000001 +/* + * Request data sync only from ffs_syncvnode(), not touching even more + * metadata than NO_INO_UPDT. 
+ */ +#define DATA_ONLY 0x00000002 + +int ffs_rdonly(struct inode *); + +TAILQ_HEAD(snaphead, inode); + +struct snapdata { + LIST_ENTRY(snapdata) sn_link; + struct snaphead sn_head; + daddr_t sn_listsize; + daddr_t *sn_blklist; + struct lock sn_lock; +}; + +#endif /* _KERNEL */ + +#endif /* !_UFS_FFS_EXTERN_H */ diff --git a/Dump/ufs/ffs/ffs_inode.c b/Dump/ufs/ffs/ffs_inode.c new file mode 100644 index 0000000..1652f51 --- /dev/null +++ b/Dump/ufs/ffs/ffs_inode.c @@ -0,0 +1,765 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_inode.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_quota.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +static int ffs_indirtrunc(struct inode *, ufs2_daddr_t, ufs2_daddr_t, + ufs2_daddr_t, int, ufs2_daddr_t *); + +/* + * Update the access, modified, and inode change times as specified by the + * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. Write the inode + * to disk if the IN_MODIFIED flag is set (it may be set initially, or by + * the timestamp update). The IN_LAZYMOD flag is set to force a write + * later if not now. The IN_LAZYACCESS is set instead of IN_MODIFIED if the fs + * is currently being suspended (or is suspended) and vnode has been accessed. + * If we write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to + * reflect the presumably successful write, and if waitfor is set, then wait + * for the write to complete. 
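+ * (Concretely, in the code below: with waitfor set on a non-async
+ * mount the inode block goes out via bwrite(); under page or buffer
+ * pressure it is pushed asynchronously with bawrite(); otherwise a
+ * delayed write is scheduled with bdwrite().)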
+ */ +int +ffs_update(vp, waitfor) + struct vnode *vp; + int waitfor; +{ + struct fs *fs; + struct buf *bp; + struct inode *ip; + int flags, error; + + ASSERT_VOP_ELOCKED(vp, "ffs_update"); + ufs_itimes(vp); + ip = VTOI(vp); + if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0) + return (0); + ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); + fs = ITOFS(ip); + if (fs->fs_ronly && ITOUMP(ip)->um_fsckpid == 0) + return (0); + /* + * If we are updating a snapshot and another process is currently + * writing the buffer containing the inode for this snapshot then + * a deadlock can occur when it tries to check the snapshot to see + * if that block needs to be copied. Thus when updating a snapshot + * we check to see if the buffer is already locked, and if it is + * we drop the snapshot lock until the buffer has been written + * and is available to us. We have to grab a reference to the + * snapshot vnode to prevent it from being removed while we are + * waiting for the buffer. + */ + flags = 0; + if (IS_SNAPSHOT(ip)) + flags = GB_LOCK_NOWAIT; +loop: + error = breadn_flags(ITODEVVP(ip), + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp); + if (error != 0) { + if (error != EBUSY) + return (error); + KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot")); + /* + * Wait for our inode block to become available. + * + * Hold a reference to the vnode to protect against + * ffs_snapgone(). Since we hold a reference, it can only + * get reclaimed (VI_DOOMED flag) in a forcible downgrade + * or unmount. For an unmount, the entire filesystem will be + * gone, so we cannot attempt to touch anything associated + * with it while the vnode is unlocked; all we can do is + * pause briefly and try again. If when we relock the vnode + * we discover that it has been reclaimed, updating it is no + * longer necessary and we can just return an error. + */ + vref(vp); + VOP_UNLOCK(vp, 0); + pause("ffsupd", 1); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vrele(vp); + if ((vp->v_iflag & VI_DOOMED) != 0) + return (ENOENT); + goto loop; + } + if (DOINGSOFTDEP(vp)) + softdep_update_inodeblock(ip, bp, waitfor); + else if (ip->i_effnlink != ip->i_nlink) + panic("ffs_update: bad link cnt"); + if (I_IS_UFS1(ip)) { + *((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; + /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ + random_harvest_queue(&(ip->i_din1), sizeof(ip->i_din1), 1, RANDOM_FS_ATIME); + } else { + *((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; + /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ + random_harvest_queue(&(ip->i_din2), sizeof(ip->i_din2), 1, RANDOM_FS_ATIME); + } + if (waitfor && !DOINGASYNC(vp)) + error = bwrite(bp); + else if (vm_page_count_severe() || buf_dirty_count_severe()) { + bawrite(bp); + error = 0; + } else { + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + error = 0; + } + return (error); +} + +#define SINGLE 0 /* index of single indirect block */ +#define DOUBLE 1 /* index of double indirect block */ +#define TRIPLE 2 /* index of triple indirect block */ +/* + * Truncate the inode ip to at most length size, freeing the + * disk blocks. 
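+ * The flags argument selects what is truncated: IO_EXT for the
+ * external (extended attribute) data, IO_NORMAL for the regular data,
+ * or both.  Blocks released are counted in DEV_BSIZE units via btodb().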
+ */ +int +ffs_truncate(vp, length, flags, cred) + struct vnode *vp; + off_t length; + int flags; + struct ucred *cred; +{ + struct inode *ip; + ufs2_daddr_t bn, lbn, lastblock, lastiblock[NIADDR], indir_lbn[NIADDR]; + ufs2_daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; + ufs2_daddr_t count, blocksreleased = 0, datablocks, blkno; + struct bufobj *bo; + struct fs *fs; + struct buf *bp; + struct ufsmount *ump; + int softdeptrunc, journaltrunc; + int needextclean, extblocks; + int offset, size, level, nblocks; + int i, error, allerror, indiroff; + off_t osize; + + ip = VTOI(vp); + ump = VFSTOUFS(vp->v_mount); + fs = ump->um_fs; + bo = &vp->v_bufobj; + + ASSERT_VOP_LOCKED(vp, "ffs_truncate"); + + if (length < 0) + return (EINVAL); + if (length > fs->fs_maxfilesize) + return (EFBIG); +#ifdef QUOTA + error = getinoquota(ip); + if (error) + return (error); +#endif + /* + * Historically clients did not have to specify which data + * they were truncating. So, if not specified, we assume + * traditional behavior, e.g., just the normal data. + */ + if ((flags & (IO_EXT | IO_NORMAL)) == 0) + flags |= IO_NORMAL; + if (!DOINGSOFTDEP(vp) && !DOINGASYNC(vp)) + flags |= IO_SYNC; + /* + * If we are truncating the extended-attributes, and cannot + * do it with soft updates, then do it slowly here. If we are + * truncating both the extended attributes and the file contents + * (e.g., the file is being unlinked), then pick it off with + * soft updates below. + */ + allerror = 0; + needextclean = 0; + softdeptrunc = 0; + journaltrunc = DOINGSUJ(vp); + if (journaltrunc == 0 && DOINGSOFTDEP(vp) && length == 0) + softdeptrunc = !softdep_slowdown(vp); + extblocks = 0; + datablocks = DIP(ip, i_blocks); + if (fs->fs_magic == FS_UFS2_MAGIC && ip->i_din2->di_extsize > 0) { + extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); + datablocks -= extblocks; + } + if ((flags & IO_EXT) && extblocks > 0) { + if (length != 0) + panic("ffs_truncate: partial trunc of extdata"); + if (softdeptrunc || journaltrunc) { + if ((flags & IO_NORMAL) == 0) + goto extclean; + needextclean = 1; + } else { + if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + return (error); +#ifdef QUOTA + (void) chkdq(ip, -extblocks, NOCRED, 0); +#endif + vinvalbuf(vp, V_ALT, 0, 0); + vn_pages_remove(vp, + OFF_TO_IDX(lblktosize(fs, -extblocks)), 0); + osize = ip->i_din2->di_extsize; + ip->i_din2->di_blocks -= extblocks; + ip->i_din2->di_extsize = 0; + for (i = 0; i < NXADDR; i++) { + oldblks[i] = ip->i_din2->di_extb[i]; + ip->i_din2->di_extb[i] = 0; + } + ip->i_flag |= IN_CHANGE; + if ((error = ffs_update(vp, !DOINGASYNC(vp)))) + return (error); + for (i = 0; i < NXADDR; i++) { + if (oldblks[i] == 0) + continue; + ffs_blkfree(ump, fs, ITODEVVP(ip), oldblks[i], + sblksize(fs, osize, i), ip->i_number, + vp->v_type, NULL); + } + } + } + if ((flags & IO_NORMAL) == 0) + return (0); + if (vp->v_type == VLNK && + (ip->i_size < vp->v_mount->mnt_maxsymlinklen || + datablocks == 0)) { +#ifdef INVARIANTS + if (length != 0) + panic("ffs_truncate: partial truncate of symlink"); +#endif + bzero(SHORTLINK(ip), (u_int)ip->i_size); + ip->i_size = 0; + DIP_SET(ip, i_size, 0); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (needextclean) + goto extclean; + return (ffs_update(vp, !DOINGASYNC(vp))); + } + if (ip->i_size == length) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (needextclean) + goto extclean; + return (ffs_update(vp, 0)); + } + if (fs->fs_ronly) + panic("ffs_truncate: read-only filesystem"); + if (IS_SNAPSHOT(ip)) + ffs_snapremove(vp); + 
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + osize = ip->i_size; + /* + * Lengthen the size of the file. We must ensure that the + * last byte of the file is allocated. Since the smallest + * value of osize is 0, length will be at least 1. + */ + if (osize < length) { + vnode_pager_setsize(vp, length); + flags |= BA_CLRBUF; + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) { + vnode_pager_setsize(vp, osize); + return (error); + } + ip->i_size = length; + DIP_SET(ip, i_size, length); + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) + bwrite(bp); + else if (DOINGASYNC(vp)) + bdwrite(bp); + else + bawrite(bp); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (ffs_update(vp, !DOINGASYNC(vp))); + } + /* + * Lookup block number for a given offset. Zero length files + * have no blocks, so return a blkno of -1. + */ + lbn = lblkno(fs, length - 1); + if (length == 0) { + blkno = -1; + } else if (lbn < NDADDR) { + blkno = DIP(ip, i_db[lbn]); + } else { + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, + cred, BA_METAONLY, &bp); + if (error) + return (error); + indiroff = (lbn - NDADDR) % NINDIR(fs); + if (I_IS_UFS1(ip)) + blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; + else + blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; + /* + * If the block number is non-zero, then the indirect block + * must have been previously allocated and need not be written. + * If the block number is zero, then we may have allocated + * the indirect block and hence need to write it out. + */ + if (blkno != 0) + brelse(bp); + else if (DOINGSOFTDEP(vp) || DOINGASYNC(vp)) + bdwrite(bp); + else + bwrite(bp); + } + /* + * If the block number at the new end of the file is zero, + * then we must allocate it to ensure that the last block of + * the file is allocated. Soft updates does not handle this + * case, so here we have to clean up the soft updates data + * structures describing the allocation past the truncation + * point. Finding and deallocating those structures is a lot of + * work. Since partial truncation with a hole at the end occurs + * rarely, we solve the problem by syncing the file so that it + * will have no soft updates data structures left. + */ + if (blkno == 0 && (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + return (error); + if (blkno != 0 && DOINGSOFTDEP(vp)) { + if (softdeptrunc == 0 && journaltrunc == 0) { + /* + * If soft updates cannot handle this truncation, + * clean up soft dependency data structures and + * fall through to the synchronous truncation. + */ + if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + return (error); + } else { + flags = IO_NORMAL | (needextclean ? IO_EXT: 0); + if (journaltrunc) + softdep_journal_freeblocks(ip, cred, length, + flags); + else + softdep_setup_freeblocks(ip, length, flags); + ASSERT_VOP_LOCKED(vp, "ffs_truncate1"); + if (journaltrunc == 0) { + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = ffs_update(vp, 0); + } + return (error); + } + } + /* + * Shorten the size of the file. If the last block of the + * shortened file is unallocated, we must allocate it. + * Additionally, if the file is not being truncated to a + * block boundary, the contents of the partial block + * following the end of the file must be zero'ed in + * case it ever becomes accessible again because of + * subsequent file growth. Directories however are not + * zero'ed as they should grow back initialized to empty. 
+ */ + offset = blkoff(fs, length); + if (blkno != 0 && offset == 0) { + ip->i_size = length; + DIP_SET(ip, i_size, length); + } else { + lbn = lblkno(fs, length); + flags |= BA_CLRBUF; + error = UFS_BALLOC(vp, length - 1, 1, cred, flags, &bp); + if (error) + return (error); + /* + * When we are doing soft updates and the UFS_BALLOC + * above fills in a direct block hole with a full sized + * block that will be truncated down to a fragment below, + * we must flush out the block dependency with an FSYNC + * so that we do not get a soft updates inconsistency + * when we create the fragment below. + */ + if (DOINGSOFTDEP(vp) && lbn < NDADDR && + fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize && + (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + return (error); + ip->i_size = length; + DIP_SET(ip, i_size, length); + size = blksize(fs, ip, lbn); + if (vp->v_type != VDIR && offset != 0) + bzero((char *)bp->b_data + offset, + (u_int)(size - offset)); + /* Kirk's code has reallocbuf(bp, size, 1) here */ + allocbuf(bp, size); + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + if (flags & IO_SYNC) + bwrite(bp); + else if (DOINGASYNC(vp)) + bdwrite(bp); + else + bawrite(bp); + } + /* + * Calculate index into inode's block list of + * last direct and indirect blocks (if any) + * which we want to keep. Lastblock is -1 when + * the file is truncated to 0. + */ + lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastiblock[SINGLE] = lastblock - NDADDR; + lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); + lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); + nblocks = btodb(fs->fs_bsize); + /* + * Update file and block pointers on disk before we start freeing + * blocks. If we crash before free'ing blocks below, the blocks + * will be returned to the free list. lastiblock values are also + * normalized to -1 for calls to ffs_indirtrunc below. + */ + for (level = TRIPLE; level >= SINGLE; level--) { + oldblks[NDADDR + level] = DIP(ip, i_ib[level]); + if (lastiblock[level] < 0) { + DIP_SET(ip, i_ib[level], 0); + lastiblock[level] = -1; + } + } + for (i = 0; i < NDADDR; i++) { + oldblks[i] = DIP(ip, i_db[i]); + if (i > lastblock) + DIP_SET(ip, i_db[i], 0); + } + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allerror = ffs_update(vp, !DOINGASYNC(vp)); + + /* + * Having written the new inode to disk, save its new configuration + * and put back the old block pointers long enough to process them. + * Note that we save the new block configuration so we can check it + * when we are done. + */ + for (i = 0; i < NDADDR; i++) { + newblks[i] = DIP(ip, i_db[i]); + DIP_SET(ip, i_db[i], oldblks[i]); + } + for (i = 0; i < NIADDR; i++) { + newblks[NDADDR + i] = DIP(ip, i_ib[i]); + DIP_SET(ip, i_ib[i], oldblks[NDADDR + i]); + } + ip->i_size = osize; + DIP_SET(ip, i_size, osize); + + error = vtruncbuf(vp, cred, length, fs->fs_bsize); + if (error && (allerror == 0)) + allerror = error; + + /* + * Indirect blocks first. 
+ */ + indir_lbn[SINGLE] = -NDADDR; + indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; + indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; + for (level = TRIPLE; level >= SINGLE; level--) { + bn = DIP(ip, i_ib[level]); + if (bn != 0) { + error = ffs_indirtrunc(ip, indir_lbn[level], + fsbtodb(fs, bn), lastiblock[level], level, &count); + if (error) + allerror = error; + blocksreleased += count; + if (lastiblock[level] < 0) { + DIP_SET(ip, i_ib[level], 0); + ffs_blkfree(ump, fs, ump->um_devvp, bn, + fs->fs_bsize, ip->i_number, + vp->v_type, NULL); + blocksreleased += nblocks; + } + } + if (lastiblock[level] >= 0) + goto done; + } + + /* + * All whole direct blocks or frags. + */ + for (i = NDADDR - 1; i > lastblock; i--) { + long bsize; + + bn = DIP(ip, i_db[i]); + if (bn == 0) + continue; + DIP_SET(ip, i_db[i], 0); + bsize = blksize(fs, ip, i); + ffs_blkfree(ump, fs, ump->um_devvp, bn, bsize, ip->i_number, + vp->v_type, NULL); + blocksreleased += btodb(bsize); + } + if (lastblock < 0) + goto done; + + /* + * Finally, look for a change in size of the + * last direct block; release any frags. + */ + bn = DIP(ip, i_db[lastblock]); + if (bn != 0) { + long oldspace, newspace; + + /* + * Calculate amount of space we're giving + * back as old block size minus new block size. + */ + oldspace = blksize(fs, ip, lastblock); + ip->i_size = length; + DIP_SET(ip, i_size, length); + newspace = blksize(fs, ip, lastblock); + if (newspace == 0) + panic("ffs_truncate: newspace"); + if (oldspace - newspace > 0) { + /* + * Block number of space to be free'd is + * the old block # plus the number of frags + * required for the storage we're keeping. + */ + bn += numfrags(fs, newspace); + ffs_blkfree(ump, fs, ump->um_devvp, bn, + oldspace - newspace, ip->i_number, vp->v_type, NULL); + blocksreleased += btodb(oldspace - newspace); + } + } +done: +#ifdef INVARIANTS + for (level = SINGLE; level <= TRIPLE; level++) + if (newblks[NDADDR + level] != DIP(ip, i_ib[level])) + panic("ffs_truncate1"); + for (i = 0; i < NDADDR; i++) + if (newblks[i] != DIP(ip, i_db[i])) + panic("ffs_truncate2"); + BO_LOCK(bo); + if (length == 0 && + (fs->fs_magic != FS_UFS2_MAGIC || ip->i_din2->di_extsize == 0) && + (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) + panic("ffs_truncate3"); + BO_UNLOCK(bo); +#endif /* INVARIANTS */ + /* + * Put back the real size. + */ + ip->i_size = length; + DIP_SET(ip, i_size, length); + if (DIP(ip, i_blocks) >= blocksreleased) + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - blocksreleased); + else /* sanity */ + DIP_SET(ip, i_blocks, 0); + ip->i_flag |= IN_CHANGE; +#ifdef QUOTA + (void) chkdq(ip, -blocksreleased, NOCRED, 0); +#endif + return (allerror); + +extclean: + if (journaltrunc) + softdep_journal_freeblocks(ip, cred, length, IO_EXT); + else + softdep_setup_freeblocks(ip, length, IO_EXT); + return (ffs_update(vp, (flags & IO_SYNC) != 0 || !DOINGASYNC(vp))); +} + +/* + * Release blocks associated with the inode ip and stored in the indirect + * block bn. Blocks are free'd in LIFO order up to (but not including) + * lastbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. 
+ */ +static int +ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) + struct inode *ip; + ufs2_daddr_t lbn, lastbn; + ufs2_daddr_t dbn; + int level; + ufs2_daddr_t *countp; +{ + struct buf *bp; + struct fs *fs; + struct vnode *vp; + caddr_t copy = NULL; + int i, nblocks, error = 0, allerror = 0; + ufs2_daddr_t nb, nlbn, last; + ufs2_daddr_t blkcount, factor, blocksreleased = 0; + ufs1_daddr_t *bap1 = NULL; + ufs2_daddr_t *bap2 = NULL; +#define BAP(ip, i) (I_IS_UFS1(ip) ? bap1[i] : bap2[i]) + + fs = ITOFS(ip); + + /* + * Calculate index in current block of last + * block to be kept. -1 indicates the entire + * block so we need not calculate the index. + */ + factor = lbn_offset(fs, level); + last = lastbn; + if (lastbn > 0) + last /= factor; + nblocks = btodb(fs->fs_bsize); + /* + * Get buffer of block pointers, zero those entries corresponding + * to blocks to be free'd, and update on disk copy first. Since + * double(triple) indirect before single(double) indirect, calls + * to bmap on these blocks will fail. However, we already have + * the on disk address, so we have to set the b_blkno field + * explicitly instead of letting bread do everything for us. + */ + vp = ITOV(ip); + bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0, 0); + if ((bp->b_flags & B_CACHE) == 0) { +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; /* pay for read */ + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + if (bp->b_bcount > bp->b_bufsize) + panic("ffs_indirtrunc: bad buffer size"); + bp->b_blkno = dbn; + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); + error = bufwait(bp); + } + if (error) { + brelse(bp); + *countp = 0; + return (error); + } + + if (I_IS_UFS1(ip)) + bap1 = (ufs1_daddr_t *)bp->b_data; + else + bap2 = (ufs2_daddr_t *)bp->b_data; + if (lastbn != -1) { + copy = malloc(fs->fs_bsize, M_TEMP, M_WAITOK); + bcopy((caddr_t)bp->b_data, copy, (u_int)fs->fs_bsize); + for (i = last + 1; i < NINDIR(fs); i++) + if (I_IS_UFS1(ip)) + bap1[i] = 0; + else + bap2[i] = 0; + if (DOINGASYNC(vp)) { + bdwrite(bp); + } else { + error = bwrite(bp); + if (error) + allerror = error; + } + if (I_IS_UFS1(ip)) + bap1 = (ufs1_daddr_t *)copy; + else + bap2 = (ufs2_daddr_t *)copy; + } + + /* + * Recursively free totally unused blocks. + */ + for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; + i--, nlbn += factor) { + nb = BAP(ip, i); + if (nb == 0) + continue; + if (level > SINGLE) { + if ((error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + (ufs2_daddr_t)-1, level - 1, &blkcount)) != 0) + allerror = error; + blocksreleased += blkcount; + } + ffs_blkfree(ITOUMP(ip), fs, ITODEVVP(ip), nb, fs->fs_bsize, + ip->i_number, vp->v_type, NULL); + blocksreleased += nblocks; + } + + /* + * Recursively free last partial block. 
+ */ + if (level > SINGLE && lastbn >= 0) { + last = lastbn % factor; + nb = BAP(ip, i); + if (nb != 0) { + error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), + last, level - 1, &blkcount); + if (error) + allerror = error; + blocksreleased += blkcount; + } + } + if (copy != NULL) { + free(copy, M_TEMP); + } else { + bp->b_flags |= B_INVAL | B_NOCACHE; + brelse(bp); + } + + *countp = blocksreleased; + return (allerror); +} + +int +ffs_rdonly(struct inode *ip) +{ + + return (ITOFS(ip)->fs_ronly != 0); +} + diff --git a/Dump/ufs/ffs/ffs_rawread.c b/Dump/ufs/ffs/ffs_rawread.c new file mode 100644 index 0000000..4cb577a --- /dev/null +++ b/Dump/ufs/ffs/ffs_rawread.c @@ -0,0 +1,474 @@ +/*- + * Copyright (c) 2000-2003 Tor Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_rawread.c 318266 2017-05-14 11:51:30Z kib $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int ffs_rawread_readahead(struct vnode *vp, + caddr_t udata, + off_t offset, + size_t len, + struct thread *td, + struct buf *bp); +static int ffs_rawread_main(struct vnode *vp, + struct uio *uio); + +static int ffs_rawread_sync(struct vnode *vp); + +int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); + +SYSCTL_DECL(_vfs_ffs); + +static int ffsrawbufcnt = 4; +SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, + "Buffers available for raw reads"); + +static int allowrawread = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, + "Flag to enable raw reads"); + +static int rawreadahead = 1; +SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, + "Flag to enable readahead for long raw reads"); + +static void +ffs_rawread_setup(void *arg __unused) +{ + + ffsrawbufcnt = (nswbuf > 100 ) ? 
(nswbuf - (nswbuf >> 4)) : nswbuf - 8; +} +SYSINIT(ffs_raw, SI_SUB_VM_CONF, SI_ORDER_ANY, ffs_rawread_setup, NULL); + +static int +ffs_rawread_sync(struct vnode *vp) +{ + int error; + int upgraded; + struct bufobj *bo; + struct mount *mp; + vm_object_t obj; + + /* Check for dirty mmap, pending writes and dirty buffers */ + bo = &vp->v_bufobj; + BO_LOCK(bo); + VI_LOCK(vp); + if (bo->bo_numoutput > 0 || + bo->bo_dirty.bv_cnt > 0 || + ((obj = vp->v_object) != NULL && + (obj->flags & OBJ_MIGHTBEDIRTY) != 0)) { + VI_UNLOCK(vp); + BO_UNLOCK(bo); + + if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) + upgraded = 1; + else + upgraded = 0; + VOP_UNLOCK(vp, 0); + (void) vn_start_write(vp, &mp, V_WAIT); + VOP_LOCK(vp, LK_EXCLUSIVE); + } else if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + upgraded = 1; + /* Upgrade to exclusive lock, this might block */ + VOP_LOCK(vp, LK_UPGRADE); + } else + upgraded = 0; + + + VI_LOCK(vp); + /* Check if vnode was reclaimed while unlocked. */ + if ((vp->v_iflag & VI_DOOMED) != 0) { + VI_UNLOCK(vp); + if (upgraded != 0) + VOP_LOCK(vp, LK_DOWNGRADE); + vn_finished_write(mp); + return (EIO); + } + /* Attempt to msync mmap() regions to clean dirty mmap */ + if ((obj = vp->v_object) != NULL && + (obj->flags & OBJ_MIGHTBEDIRTY) != 0) { + VI_UNLOCK(vp); + VM_OBJECT_WLOCK(obj); + vm_object_page_clean(obj, 0, 0, OBJPC_SYNC); + VM_OBJECT_WUNLOCK(obj); + } else + VI_UNLOCK(vp); + + /* Wait for pending writes to complete */ + BO_LOCK(bo); + error = bufobj_wwait(&vp->v_bufobj, 0, 0); + if (error != 0) { + /* XXX: can't happen with a zero timeout ??? */ + BO_UNLOCK(bo); + if (upgraded != 0) + VOP_LOCK(vp, LK_DOWNGRADE); + vn_finished_write(mp); + return (error); + } + /* Flush dirty buffers */ + if (bo->bo_dirty.bv_cnt > 0) { + BO_UNLOCK(bo); + if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) { + if (upgraded != 0) + VOP_LOCK(vp, LK_DOWNGRADE); + vn_finished_write(mp); + return (error); + } + BO_LOCK(bo); + if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) + panic("ffs_rawread_sync: dirty bufs"); + } + BO_UNLOCK(bo); + if (upgraded != 0) + VOP_LOCK(vp, LK_DOWNGRADE); + vn_finished_write(mp); + } else { + VI_UNLOCK(vp); + BO_UNLOCK(bo); + } + return 0; +} + + +static int +ffs_rawread_readahead(struct vnode *vp, + caddr_t udata, + off_t offset, + size_t len, + struct thread *td, + struct buf *bp) +{ + int error; + u_int iolen; + off_t blockno; + int blockoff; + int bsize; + struct vnode *dp; + int bforwards; + struct inode *ip; + ufs2_daddr_t blkno; + + bsize = vp->v_mount->mnt_stat.f_iosize; + + ip = VTOI(vp); + dp = ITODEVVP(ip); + + iolen = ((vm_offset_t) udata) & PAGE_MASK; + bp->b_bcount = len; + if (bp->b_bcount + iolen > bp->b_kvasize) { + bp->b_bcount = bp->b_kvasize; + if (iolen != 0) + bp->b_bcount -= PAGE_SIZE; + } + bp->b_flags = 0; /* XXX necessary ? 
*/ + bp->b_iocmd = BIO_READ; + bp->b_iodone = bdone; + bp->b_data = udata; + blockno = offset / bsize; + blockoff = (offset % bsize) / DEV_BSIZE; + if ((daddr_t) blockno != blockno) { + return EINVAL; /* blockno overflow */ + } + + bp->b_lblkno = bp->b_blkno = blockno; + + error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, NULL, &bforwards, NULL); + if (error != 0) + return error; + if (blkno == -1) { + + /* Fill holes with NULs to preserve semantics */ + + if (bp->b_bcount + blockoff * DEV_BSIZE > bsize) + bp->b_bcount = bsize - blockoff * DEV_BSIZE; + bp->b_bufsize = bp->b_bcount; + + if (vmapbuf(bp, 1) < 0) + return EFAULT; + + maybe_yield(); + bzero(bp->b_data, bp->b_bufsize); + + /* Mark operation completed (similar to bufdone()) */ + + bp->b_resid = 0; + bp->b_flags |= B_DONE; + return 0; + } + bp->b_blkno = blkno + blockoff; + bp->b_offset = bp->b_iooffset = (blkno + blockoff) * DEV_BSIZE; + + if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards)) + bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; + bp->b_bufsize = bp->b_bcount; + + if (vmapbuf(bp, 1) < 0) + return EFAULT; + + BO_STRATEGY(&dp->v_bufobj, bp); + return 0; +} + + +static int +ffs_rawread_main(struct vnode *vp, + struct uio *uio) +{ + int error, nerror; + struct buf *bp, *nbp, *tbp; + u_int iolen; + caddr_t udata; + long resid; + off_t offset; + struct thread *td; + + td = uio->uio_td ? uio->uio_td : curthread; + udata = uio->uio_iov->iov_base; + resid = uio->uio_resid; + offset = uio->uio_offset; + + /* + * keep the process from being swapped + */ + PHOLD(td->td_proc); + + error = 0; + nerror = 0; + + bp = NULL; + nbp = NULL; + + while (resid > 0) { + + if (bp == NULL) { /* Setup first read */ + /* XXX: Leave some bufs for swap */ + bp = getpbuf(&ffsrawbufcnt); + pbgetvp(vp, bp); + error = ffs_rawread_readahead(vp, udata, offset, + resid, td, bp); + if (error != 0) + break; + + if (resid > bp->b_bufsize) { /* Setup fist readahead */ + /* XXX: Leave bufs for swap */ + if (rawreadahead != 0) + nbp = trypbuf(&ffsrawbufcnt); + else + nbp = NULL; + if (nbp != NULL) { + pbgetvp(vp, nbp); + + nerror = ffs_rawread_readahead(vp, + udata + + bp->b_bufsize, + offset + + bp->b_bufsize, + resid - + bp->b_bufsize, + td, + nbp); + if (nerror) { + pbrelvp(nbp); + relpbuf(nbp, &ffsrawbufcnt); + nbp = NULL; + } + } + } + } + + bwait(bp, PRIBIO, "rawrd"); + vunmapbuf(bp); + + iolen = bp->b_bcount - bp->b_resid; + if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { + nerror = 0; /* Ignore possible beyond EOF error */ + break; /* EOF */ + } + + if ((bp->b_ioflags & BIO_ERROR) != 0) { + error = bp->b_error; + break; + } + resid -= iolen; + udata += iolen; + offset += iolen; + if (iolen < bp->b_bufsize) { + /* Incomplete read. 
Try to read remaining part */ + error = ffs_rawread_readahead(vp, + udata, + offset, + bp->b_bufsize - iolen, + td, + bp); + if (error != 0) + break; + } else if (nbp != NULL) { /* Complete read with readahead */ + + tbp = bp; + bp = nbp; + nbp = tbp; + + if (resid <= bp->b_bufsize) { /* No more readaheads */ + pbrelvp(nbp); + relpbuf(nbp, &ffsrawbufcnt); + nbp = NULL; + } else { /* Setup next readahead */ + nerror = ffs_rawread_readahead(vp, + udata + + bp->b_bufsize, + offset + + bp->b_bufsize, + resid - + bp->b_bufsize, + td, + nbp); + if (nerror != 0) { + pbrelvp(nbp); + relpbuf(nbp, &ffsrawbufcnt); + nbp = NULL; + } + } + } else if (nerror != 0) {/* Deferred Readahead error */ + break; + } else if (resid > 0) { /* More to read, no readahead */ + error = ffs_rawread_readahead(vp, udata, offset, + resid, td, bp); + if (error != 0) + break; + } + } + + if (bp != NULL) { + pbrelvp(bp); + relpbuf(bp, &ffsrawbufcnt); + } + if (nbp != NULL) { /* Run down readahead buffer */ + bwait(nbp, PRIBIO, "rawrd"); + vunmapbuf(nbp); + pbrelvp(nbp); + relpbuf(nbp, &ffsrawbufcnt); + } + + if (error == 0) + error = nerror; + PRELE(td->td_proc); + uio->uio_iov->iov_base = udata; + uio->uio_resid = resid; + uio->uio_offset = offset; + return error; +} + + +int +ffs_rawread(struct vnode *vp, + struct uio *uio, + int *workdone) +{ + if (allowrawread != 0 && + uio->uio_iovcnt == 1 && + uio->uio_segflg == UIO_USERSPACE && + uio->uio_resid == uio->uio_iov->iov_len && + (((uio->uio_td != NULL) ? uio->uio_td : curthread)->td_pflags & + TDP_DEADLKTREAT) == 0) { + int secsize; /* Media sector size */ + off_t filebytes; /* Bytes left of file */ + int blockbytes; /* Bytes left of file in full blocks */ + int partialbytes; /* Bytes in last partial block */ + int skipbytes; /* Bytes not to read in ffs_rawread */ + struct inode *ip; + int error; + + + /* Only handle sector aligned reads */ + ip = VTOI(vp); + secsize = ITODEVVP(ip)->v_bufobj.bo_bsize; + if ((uio->uio_offset & (secsize - 1)) == 0 && + (uio->uio_resid & (secsize - 1)) == 0) { + + /* Sync dirty pages and buffers if needed */ + error = ffs_rawread_sync(vp); + if (error != 0) + return error; + + /* Check for end of file */ + if (ip->i_size > uio->uio_offset) { + filebytes = ip->i_size - uio->uio_offset; + + /* No special eof handling needed ? */ + if (uio->uio_resid <= filebytes) { + *workdone = 1; + return ffs_rawread_main(vp, uio); + } + + partialbytes = ((unsigned int) ip->i_size) % + ITOFS(ip)->fs_bsize; + blockbytes = (int) filebytes - partialbytes; + if (blockbytes > 0) { + skipbytes = uio->uio_resid - + blockbytes; + uio->uio_resid = blockbytes; + error = ffs_rawread_main(vp, uio); + uio->uio_resid += skipbytes; + if (error != 0) + return error; + /* Read remaining part using buffer */ + } + } + } + } + *workdone = 0; + return 0; +} diff --git a/Dump/ufs/ffs/ffs_snapshot.c b/Dump/ufs/ffs/ffs_snapshot.c new file mode 100644 index 0000000..f30dfca --- /dev/null +++ b/Dump/ufs/ffs/ffs_snapshot.c @@ -0,0 +1,2699 @@ +/*- + * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. + * + * Further information about snapshots can be obtained from: + * + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_snapshot.c 322130 2017-08-07 02:17:15Z mckusick $"); + +#include "opt_quota.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include + +#define KERNCRED thread0.td_ucred +#define DEBUG 1 + +#include "opt_ffs.h" + +#ifdef NO_FFS_SNAPSHOT +int +ffs_snapshot(mp, snapfile) + struct mount *mp; + char *snapfile; +{ + return (EINVAL); +} + +int +ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) + struct fs *fs; + struct vnode *devvp; + ufs2_daddr_t bno; + long size; + ino_t inum; + enum vtype vtype; + struct workhead *wkhd; +{ + return (EINVAL); +} + +void +ffs_snapremove(vp) + struct vnode *vp; +{ +} + +void +ffs_snapshot_mount(mp) + struct mount *mp; +{ +} + +void +ffs_snapshot_unmount(mp) + struct mount *mp; +{ +} + +void +ffs_snapgone(ip) + struct inode *ip; +{ +} + +int +ffs_copyonwrite(devvp, bp) + struct vnode *devvp; + struct buf *bp; +{ + return (EINVAL); +} + +void +ffs_sync_snap(mp, waitfor) + struct mount *mp; + int waitfor; +{ +} + +#else +FEATURE(ffs_snapshot, "FFS snapshot support"); + +LIST_HEAD(, snapdata) snapfree; +static struct mtx snapfree_lock; +MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); + +static int cgaccount(int, struct vnode *, struct buf *, int); +static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, + int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, + ufs_lbn_t, int), int, int); +static int indiracct_ufs1(struct vnode *, struct vnode *, int, + ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, + int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, + ufs_lbn_t, int), int); +static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, + int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, + ufs_lbn_t, int), int, 
int); +static int indiracct_ufs2(struct vnode *, struct vnode *, int, + ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, + int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, + ufs_lbn_t, int), int); +static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, + struct fs *, ufs_lbn_t, int); +static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); +static void try_free_snapdata(struct vnode *devvp); +static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); +static int ffs_bp_snapblk(struct vnode *, struct buf *); + +/* + * To ensure the consistency of snapshots across crashes, we must + * synchronously write out copied blocks before allowing the + * originals to be modified. Because of the rather severe speed + * penalty that this imposes, the code normally only ensures + * persistence for the filesystem metadata contained within a + * snapshot. Setting the following flag allows this crash + * persistence to be enabled for file contents. + */ +int dopersistence = 0; + +#ifdef DEBUG +#include +SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); +static int snapdebug = 0; +SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); +int collectsnapstats = 0; +SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, + 0, ""); +#endif /* DEBUG */ + +/* + * Create a snapshot file and initialize it for the filesystem. + */ +int +ffs_snapshot(mp, snapfile) + struct mount *mp; + char *snapfile; +{ + ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; + int error, cg, snaploc; + int i, size, len, loc; + ufs2_daddr_t blockno; + uint64_t flag; + struct timespec starttime = {0, 0}, endtime; + char saved_nice = 0; + long redo = 0, snaplistsize = 0; + int32_t *lp; + void *space; + struct fs *copy_fs = NULL, *fs; + struct thread *td = curthread; + struct inode *ip, *xp; + struct buf *bp, *nbp, *ibp; + struct nameidata nd; + struct mount *wrtmp; + struct vattr vat; + struct vnode *vp, *xvp, *mvp, *devvp; + struct uio auio; + struct iovec aiov; + struct snapdata *sn; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + sn = NULL; + /* + * At the moment, journaled soft updates cannot support + * taking snapshots. + */ + if (MOUNTEDSUJ(mp)) { + vfs_mount_error(mp, "%s: Snapshots are not yet supported when " + "running with journaled soft updates", fs->fs_fsmnt); + return (EOPNOTSUPP); + } + MNT_ILOCK(mp); + flag = mp->mnt_flag; + MNT_IUNLOCK(mp); + /* + * Need to serialize access to snapshot code per filesystem. + */ + /* + * Assign a snapshot slot in the superblock. + */ + UFS_LOCK(ump); + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == 0) + break; + UFS_UNLOCK(ump); + if (snaploc == FSMAXSNAP) + return (ENOSPC); + /* + * Create the snapshot file. 
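+ * The file is created exclusively (VA_EXCLUSIVE) with mode S_IRUSR, and + * the resulting vnode is marked VV_SYSTEM below so that it is treated + * as a system file.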
+ */ +restart: + NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, + snapfile, td); + if ((error = namei(&nd)) != 0) + return (error); + if (nd.ni_vp != NULL) { + vput(nd.ni_vp); + error = EEXIST; + } + if (nd.ni_dvp->v_mount != mp) + error = EXDEV; + if (error) { + NDFREE(&nd, NDF_ONLY_PNBUF); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + return (error); + } + VATTR_NULL(&vat); + vat.va_type = VREG; + vat.va_mode = S_IRUSR; + vat.va_vaflags |= VA_EXCLUSIVE; + if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) + wrtmp = NULL; + if (wrtmp != mp) + panic("ffs_snapshot: mount mismatch"); + vfs_rel(wrtmp); + if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vput(nd.ni_dvp); + if ((error = vn_start_write(NULL, &wrtmp, + V_XSLEEP | PCATCH)) != 0) + return (error); + goto restart; + } + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); + VOP_UNLOCK(nd.ni_dvp, 0); + if (error) { + NDFREE(&nd, NDF_ONLY_PNBUF); + vn_finished_write(wrtmp); + vrele(nd.ni_dvp); + return (error); + } + vp = nd.ni_vp; + vp->v_vflag |= VV_SYSTEM; + ip = VTOI(vp); + devvp = ITODEVVP(ip); + /* + * Allocate and copy the last block contents so as to be able + * to set size to that of the filesystem. + */ + numblks = howmany(fs->fs_size, fs->fs_frag); + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), + fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); + if (error) + goto out; + ip->i_size = lblktosize(fs, (off_t)numblks); + DIP_SET(ip, i_size, ip->i_size); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = readblock(vp, bp, numblks - 1); + bawrite(bp); + if (error != 0) + goto out; + /* + * Preallocate critical data structures so that we can copy + * them in without further allocation after we suspend all + * operations on the filesystem. We would like to just release + * the allocated buffers without writing them since they will + * be filled in below once we are ready to go, but this upsets + * the soft update code, so we go ahead and write the new buffers. + * + * Allocate all indirect blocks and mark all of them as not + * needing to be copied. + */ + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); + if (error) + goto out; + bawrite(ibp); + } + /* + * Allocate copies for the superblock and its summary information. + */ + error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, + 0, &nbp); + if (error) + goto out; + bawrite(nbp); + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) { + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + } + /* + * Allocate all cylinder group blocks. + */ + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + bawrite(nbp); + if (cg % 10 == 0) + ffs_syncvnode(vp, MNT_WAIT, 0); + } + /* + * Copy all the cylinder group maps. Although the + * filesystem is still active, we hope that only a few + * cylinder groups will change between now and when we + * suspend operations. Thus, we will be able to quickly + * touch up the few cylinder groups that changed during + * the suspension period. 
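+ * The fs_active bitmap allocated below records which of these copies + * are still valid; a cylinder group that is modified again has its bit + * cleared and is recopied in the second pass.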
+ */ + len = howmany(fs->fs_ncg, NBBY); + space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); + UFS_LOCK(ump); + fs->fs_active = space; + UFS_UNLOCK(ump); + for (cg = 0; cg < fs->fs_ncg; cg++) { + error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out; + error = cgaccount(cg, vp, nbp, 1); + bawrite(nbp); + if (cg % 10 == 0) + ffs_syncvnode(vp, MNT_WAIT, 0); + if (error) + goto out; + } + /* + * Change inode to snapshot type file. + */ + ip->i_flags |= SF_SNAPSHOT; + DIP_SET(ip, i_flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * Ensure that the snapshot is completely on disk. + * Since we have marked it as a snapshot it is safe to + * unlock it as no process will be allowed to write to it. + */ + if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) + goto out; + VOP_UNLOCK(vp, 0); + /* + * All allocations are done, so we can now snapshot the system. + * + * Recind nice scheduling while running with the filesystem suspended. + */ + if (td->td_proc->p_nice > 0) { + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + saved_nice = p->p_nice; + sched_nice(p, 0); + PROC_UNLOCK(p); + } + /* + * Suspend operation on filesystem. + */ + for (;;) { + vn_finished_write(wrtmp); + if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { + vn_start_write(NULL, &wrtmp, V_WAIT); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + goto out; + } + if (mp->mnt_kern_flag & MNTK_SUSPENDED) + break; + vn_start_write(NULL, &wrtmp, V_WAIT); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (ip->i_effnlink == 0) { + error = ENOENT; /* Snapshot file unlinked */ + goto out1; + } + if (collectsnapstats) + nanotime(&starttime); + + /* The last block might have changed. Copy it again to be sure. */ + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), + fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); + if (error != 0) + goto out1; + error = readblock(vp, bp, numblks - 1); + bp->b_flags |= B_VALIDSUSPWRT; + bawrite(bp); + if (error != 0) + goto out1; + /* + * First, copy all the cylinder group maps that have changed. + */ + for (cg = 0; cg < fs->fs_ncg; cg++) { + if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) + continue; + redo++; + error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), + fs->fs_bsize, KERNCRED, 0, &nbp); + if (error) + goto out1; + error = cgaccount(cg, vp, nbp, 2); + bawrite(nbp); + if (error) + goto out1; + } + /* + * Grab a copy of the superblock and its summary information. + * We delay writing it until the suspension is released below. + */ + copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); + bcopy(fs, copy_fs, fs->fs_sbsize); + if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) + copy_fs->fs_clean = 1; + size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; + if (fs->fs_sbsize < size) + bzero(&((char *)copy_fs)[fs->fs_sbsize], + size - fs->fs_sbsize); + size = blkroundup(fs, fs->fs_cssize); + if (fs->fs_contigsumsize > 0) + size += fs->fs_ncg * sizeof(int32_t); + space = malloc((u_long)size, M_UFSMNT, M_WAITOK); + copy_fs->fs_csp = space; + bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); + space = (char *)space + fs->fs_cssize; + loc = howmany(fs->fs_cssize, fs->fs_fsize); + i = fs->fs_frag - loc % fs->fs_frag; + len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; + if (len > 0) { + if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), + len, KERNCRED, &bp)) != 0) { + brelse(bp); + free(copy_fs->fs_csp, M_UFSMNT); + free(copy_fs, M_UFSMNT); + copy_fs = NULL; + goto out1; + } + bcopy(bp->b_data, space, (u_int)len); + space = (char *)space + len; + bp->b_flags |= B_INVAL | B_NOCACHE; + brelse(bp); + } + if (fs->fs_contigsumsize > 0) { + copy_fs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + } + /* + * We must check for active files that have been unlinked + * (e.g., with a zero link count). We have to expunge all + * trace of these files from the snapshot so that they are + * not reclaimed prematurely by fsck or unnecessarily dumped. + * We turn off the MNTK_SUSPENDED flag to avoid a panic from + * spec_strategy about writing on a suspended filesystem. + * Note that we skip unlinked snapshot files as they will + * be handled separately below. + * + * We also calculate the needed size for the snapshot list. + */ + snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; + MNT_ILOCK(mp); + mp->mnt_kern_flag &= ~MNTK_SUSPENDED; + MNT_IUNLOCK(mp); +loop: + MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { + if ((xvp->v_usecount == 0 && + (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || + xvp->v_type == VNON || + IS_SNAPSHOT(VTOI(xvp))) { + VI_UNLOCK(xvp); + continue; + } + /* + * We can skip parent directory vnode because it must have + * this snapshot file in it. + */ + if (xvp == nd.ni_dvp) { + VI_UNLOCK(xvp); + continue; + } + vholdl(xvp); + if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + vdrop(xvp); + goto loop; + } + VI_LOCK(xvp); + if (xvp->v_usecount == 0 && + (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { + VI_UNLOCK(xvp); + VOP_UNLOCK(xvp, 0); + vdrop(xvp); + continue; + } + VI_UNLOCK(xvp); + if (snapdebug) + vn_printf(xvp, "ffs_snapshot: busy vnode "); + if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && + vat.va_nlink > 0) { + VOP_UNLOCK(xvp, 0); + vdrop(xvp); + continue; + } + xp = VTOI(xvp); + if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { + VOP_UNLOCK(xvp, 0); + vdrop(xvp); + continue; + } + /* + * If there is a fragment, clear it here. + */ + blkno = 0; + loc = howmany(xp->i_size, fs->fs_bsize) - 1; + if (loc < NDADDR) { + len = fragroundup(fs, blkoff(fs, xp->i_size)); + if (len != 0 && len < fs->fs_bsize) { + ffs_blkfree(ump, copy_fs, vp, + DIP(xp, i_db[loc]), len, xp->i_number, + xvp->v_type, NULL); + blkno = DIP(xp, i_db[loc]); + DIP_SET(xp, i_db[loc], 0); + } + } + snaplistsize += 1; + if (I_IS_UFS1(xp)) + error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, + BLK_NOCOPY, 1); + else + error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, + BLK_NOCOPY, 1); + if (blkno) + DIP_SET(xp, i_db[loc], blkno); + if (!error) + error = ffs_freefile(ump, copy_fs, vp, xp->i_number, + xp->i_mode, NULL); + VOP_UNLOCK(xvp, 0); + vdrop(xvp); + if (error) { + free(copy_fs->fs_csp, M_UFSMNT); + free(copy_fs, M_UFSMNT); + copy_fs = NULL; + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto out1; + } + } + /* + * Erase the journal file from the snapshot. 
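+ * Its blocks are expunged with BLK_NOCOPY so that the snapshot neither + * claims nor copies them.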
+ */ + if (fs->fs_flags & FS_SUJ) { + error = softdep_journal_lookup(mp, &xvp); + if (error) { + free(copy_fs->fs_csp, M_UFSMNT); + free(copy_fs, M_UFSMNT); + copy_fs = NULL; + goto out1; + } + xp = VTOI(xvp); + if (I_IS_UFS1(xp)) + error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, + BLK_NOCOPY, 0); + else + error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, + BLK_NOCOPY, 0); + vput(xvp); + } + /* + * Acquire a lock on the snapdata structure, creating it if necessary. + */ + sn = ffs_snapdata_acquire(devvp); + /* + * Change vnode to use shared snapshot lock instead of the original + * private lock. + */ + vp->v_vnlock = &sn->sn_lock; + lockmgr(&vp->v_lock, LK_RELEASE, NULL); + xp = TAILQ_FIRST(&sn->sn_head); + /* + * If this is the first snapshot on this filesystem, then we need + * to allocate the space for the list of preallocated snapshot blocks. + * This list will be refined below, but this preliminary one will + * keep us out of deadlock until the full one is ready. + */ + if (xp == NULL) { + snapblklist = malloc(snaplistsize * sizeof(daddr_t), + M_UFSMNT, M_WAITOK); + blkp = &snapblklist[1]; + *blkp++ = lblkno(fs, fs->fs_sblockloc); + blkno = fragstoblks(fs, fs->fs_csaddr); + for (cg = 0; cg < fs->fs_ncg; cg++) { + if (fragstoblks(fs, cgtod(fs, cg) > blkno)) + break; + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + } + len = howmany(fs->fs_cssize, fs->fs_bsize); + for (loc = 0; loc < len; loc++) + *blkp++ = blkno + loc; + for (; cg < fs->fs_ncg; cg++) + *blkp++ = fragstoblks(fs, cgtod(fs, cg)); + snapblklist[0] = blkp - snapblklist; + VI_LOCK(devvp); + if (sn->sn_blklist != NULL) + panic("ffs_snapshot: non-empty list"); + sn->sn_blklist = snapblklist; + sn->sn_listsize = blkp - snapblklist; + VI_UNLOCK(devvp); + } + /* + * Record snapshot inode. Since this is the newest snapshot, + * it must be placed at the end of the list. + */ + VI_LOCK(devvp); + fs->fs_snapinum[snaploc] = ip->i_number; + if (ip->i_nextsnap.tqe_prev != 0) + panic("ffs_snapshot: %ju already on list", + (uintmax_t)ip->i_number); + TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); + devvp->v_vflag |= VV_COPYONWRITE; + VI_UNLOCK(devvp); + ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); +out1: + KASSERT((sn != NULL && copy_fs != NULL && error == 0) || + (sn == NULL && copy_fs == NULL && error != 0), + ("email phk@ and mckusick@")); + /* + * Resume operation on filesystem. + */ + vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); + if (collectsnapstats && starttime.tv_sec > 0) { + nanotime(&endtime); + timespecsub(&endtime, &starttime); + printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", + vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, + endtime.tv_nsec / 1000000, redo, fs->fs_ncg); + } + if (copy_fs == NULL) + goto out; + /* + * Copy allocation information from all the snapshots in + * this snapshot and then expunge them from its view. + */ + TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { + if (xp == ip) + break; + if (I_IS_UFS1(xp)) + error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, + BLK_SNAP, 0); + else + error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, + BLK_SNAP, 0); + if (error == 0 && xp->i_effnlink == 0) { + error = ffs_freefile(ump, + copy_fs, + vp, + xp->i_number, + xp->i_mode, NULL); + } + if (error) { + fs->fs_snapinum[snaploc] = 0; + goto done; + } + } + /* + * Allocate space for the full list of preallocated snapshot blocks. 
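+ * The completed list is written to the end of the snapshot file below + * and is consulted by ffs_copyonwrite() to skip blocks that never need + * to be copied.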
+ */ + snapblklist = malloc(snaplistsize * sizeof(daddr_t), + M_UFSMNT, M_WAITOK); + ip->i_snapblklist = &snapblklist[1]; + /* + * Expunge the blocks used by the snapshots from the set of + * blocks marked as used in the snapshot bitmaps. Also, collect + * the list of allocated blocks in i_snapblklist. + */ + if (I_IS_UFS1(ip)) + error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, + BLK_SNAP, 0); + else + error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, + BLK_SNAP, 0); + if (error) { + fs->fs_snapinum[snaploc] = 0; + free(snapblklist, M_UFSMNT); + goto done; + } + if (snaplistsize < ip->i_snapblklist - snapblklist) + panic("ffs_snapshot: list too small"); + snaplistsize = ip->i_snapblklist - snapblklist; + snapblklist[0] = snaplistsize; + ip->i_snapblklist = 0; + /* + * Write out the list of allocated blocks to the end of the snapshot. + */ + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)snapblklist; + aiov.iov_len = snaplistsize * sizeof(daddr_t); + auio.uio_resid = aiov.iov_len; + auio.uio_offset = ip->i_size; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { + fs->fs_snapinum[snaploc] = 0; + free(snapblklist, M_UFSMNT); + goto done; + } + /* + * Write the superblock and its summary information + * to the snapshot. + */ + blkno = fragstoblks(fs, fs->fs_csaddr); + len = howmany(fs->fs_cssize, fs->fs_bsize); + space = copy_fs->fs_csp; + for (loc = 0; loc < len; loc++) { + error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); + if (error) { + brelse(nbp); + fs->fs_snapinum[snaploc] = 0; + free(snapblklist, M_UFSMNT); + goto done; + } + bcopy(space, nbp->b_data, fs->fs_bsize); + space = (char *)space + fs->fs_bsize; + bawrite(nbp); + } + error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, + KERNCRED, &nbp); + if (error) { + brelse(nbp); + } else { + loc = blkoff(fs, fs->fs_sblockloc); + bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); + bawrite(nbp); + } + /* + * As this is the newest list, it is the most inclusive, so + * should replace the previous list. + */ + VI_LOCK(devvp); + space = sn->sn_blklist; + sn->sn_blklist = snapblklist; + sn->sn_listsize = snaplistsize; + VI_UNLOCK(devvp); + if (space != NULL) + free(space, M_UFSMNT); + /* + * Preallocate all the direct blocks in the snapshot inode so + * that we never have to write the inode itself to commit an + * update to the contents of the snapshot. Note that once + * created, the size of the snapshot will never change, so + * there will never be a need to write the inode except to + * update the non-integrity-critical time fields and + * allocated-block count. 
+ */ + for (blockno = 0; blockno < NDADDR; blockno++) { + if (DIP(ip, i_db[blockno]) != 0) + continue; + error = UFS_BALLOC(vp, lblktosize(fs, blockno), + fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); + if (error) + break; + error = readblock(vp, bp, blockno); + bawrite(bp); + if (error != 0) + break; + } +done: + free(copy_fs->fs_csp, M_UFSMNT); + free(copy_fs, M_UFSMNT); + copy_fs = NULL; +out: + NDFREE(&nd, NDF_ONLY_PNBUF); + if (saved_nice > 0) { + struct proc *p; + + p = td->td_proc; + PROC_LOCK(p); + sched_nice(td->td_proc, saved_nice); + PROC_UNLOCK(td->td_proc); + } + UFS_LOCK(ump); + if (fs->fs_active != 0) { + free(fs->fs_active, M_DEVBUF); + fs->fs_active = 0; + } + UFS_UNLOCK(ump); + MNT_ILOCK(mp); + mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); + MNT_IUNLOCK(mp); + if (error) + (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); + (void) ffs_syncvnode(vp, MNT_WAIT, 0); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0); + vrele(nd.ni_dvp); + vn_finished_write(wrtmp); + process_deferred_inactive(mp); + return (error); +} + +/* + * Copy a cylinder group map. All the unallocated blocks are marked + * BLK_NOCOPY so that the snapshot knows that it need not copy them + * if they are later written. If passno is one, then this is a first + * pass, so only setting needs to be done. If passno is 2, then this + * is a revision to a previous pass which must be undone as the + * replacement pass is done. + */ +static int +cgaccount(cg, vp, nbp, passno) + int cg; + struct vnode *vp; + struct buf *nbp; + int passno; +{ + struct buf *bp, *ibp; + struct inode *ip; + struct cg *cgp; + struct fs *fs; + ufs2_daddr_t base, numblks; + int error, len, loc, indiroff; + + ip = VTOI(vp); + fs = ITOFS(ip); + error = bread(ITODEVVP(ip), fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, KERNCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (EIO); + } + UFS_LOCK(ITOUMP(ip)); + ACTIVESET(fs, cg); + /* + * Recomputation of summary information might not have been performed + * at mount time. Sync up summary information for current cylinder + * group while data is in memory to ensure that result of background + * fsck is slightly more consistent. 
+ */ + fs->fs_cs(fs, cg) = cgp->cg_cs; + UFS_UNLOCK(ITOUMP(ip)); + bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); + if (fs->fs_cgsize < fs->fs_bsize) + bzero(&nbp->b_data[fs->fs_cgsize], + fs->fs_bsize - fs->fs_cgsize); + cgp = (struct cg *)nbp->b_data; + bqrelse(bp); + if (passno == 2) + nbp->b_flags |= B_VALIDSUSPWRT; + numblks = howmany(fs->fs_size, fs->fs_frag); + len = howmany(fs->fs_fpg, fs->fs_frag); + base = cgbase(fs, cg) / fs->fs_frag; + if (base + len >= numblks) + len = numblks - base - 1; + loc = 0; + if (base < NDADDR) { + for ( ; loc < NDADDR; loc++) { + if (ffs_isblock(fs, cg_blksfree(cgp), loc)) + DIP_SET(ip, i_db[loc], BLK_NOCOPY); + else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) + DIP_SET(ip, i_db[loc], 0); + else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) + panic("ffs_snapshot: lost direct block"); + } + } + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + if (error) { + return (error); + } + indiroff = (base + loc - NDADDR) % NINDIR(fs); + for ( ; loc < len; loc++, indiroff++) { + if (indiroff >= NINDIR(fs)) { + if (passno == 2) + ibp->b_flags |= B_VALIDSUSPWRT; + bawrite(ibp); + error = UFS_BALLOC(vp, + lblktosize(fs, (off_t)(base + loc)), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + if (error) { + return (error); + } + indiroff = 0; + } + if (I_IS_UFS1(ip)) { + if (ffs_isblock(fs, cg_blksfree(cgp), loc)) + ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = + BLK_NOCOPY; + else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) + [indiroff] == BLK_NOCOPY) + ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; + else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) + [indiroff] == BLK_NOCOPY) + panic("ffs_snapshot: lost indirect block"); + continue; + } + if (ffs_isblock(fs, cg_blksfree(cgp), loc)) + ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; + else if (passno == 2 && + ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) + ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; + else if (passno == 1 && + ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) + panic("ffs_snapshot: lost indirect block"); + } + if (passno == 2) + ibp->b_flags |= B_VALIDSUSPWRT; + bdwrite(ibp); + return (0); +} + +/* + * Before expunging a snapshot inode, note all the + * blocks that it claims with BLK_SNAP so that fsck will + * be able to account for those blocks properly and so + * that this snapshot knows that it need not copy them + * if the other snapshot holding them is freed. This code + * is reproduced once each for UFS1 and UFS2. + */ +static int +expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) + struct vnode *snapvp; + struct inode *cancelip; + struct fs *fs; + int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, + struct fs *, ufs_lbn_t, int); + int expungetype; + int clearmode; +{ + int i, error, indiroff; + ufs_lbn_t lbn, rlbn; + ufs2_daddr_t len, blkno, numblks, blksperindir; + struct ufs1_dinode *dip; + struct thread *td = curthread; + struct buf *bp; + + /* + * Prepare to expunge the inode. If its inode block has not + * yet been copied, then allocate and fill the copy. 
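+ * A block number of zero means the inode block has not been copied + * yet; in that case it is allocated in the snapshot and filled with + * readblock().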
+ */ + lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); + blkno = 0; + if (lbn < NDADDR) { + blkno = VTOI(snapvp)->i_din1->di_db[lbn]; + } else { + if (DOINGSOFTDEP(snapvp)) + softdep_prealloc(snapvp, MNT_WAIT); + td->td_pflags |= TDP_COWINPROGRESS; + error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + return (error); + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; + bqrelse(bp); + } + if (blkno != 0) { + if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) + return (error); + } else { + error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &bp); + if (error) + return (error); + if ((error = readblock(snapvp, bp, lbn)) != 0) + return (error); + } + /* + * Set a snapshot inode to be a zero length file, regular files + * or unlinked snapshots to be completely unallocated. + */ + dip = (struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (clearmode || cancelip->i_effnlink == 0) + dip->di_mode = 0; + dip->di_size = 0; + dip->di_blocks = 0; + dip->di_flags &= ~SF_SNAPSHOT; + bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); + bdwrite(bp); + /* + * Now go through and expunge all the blocks in the file + * using the function requested. + */ + numblks = howmany(cancelip->i_size, fs->fs_bsize); + if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], + &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) + return (error); + if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], + &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) + return (error); + blksperindir = 1; + lbn = -NDADDR; + len = numblks - NDADDR; + rlbn = NDADDR; + for (i = 0; len > 0 && i < NIADDR; i++) { + error = indiracct_ufs1(snapvp, ITOV(cancelip), i, + cancelip->i_din1->di_ib[i], lbn, rlbn, len, + blksperindir, fs, acctfunc, expungetype); + if (error) + return (error); + blksperindir *= NINDIR(fs); + lbn -= blksperindir + 1; + len -= blksperindir; + rlbn += blksperindir; + } + return (0); +} + +/* + * Descend an indirect block chain for vnode cancelvp accounting for all + * its indirect blocks in snapvp. + */ +static int +indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, + blksperindir, fs, acctfunc, expungetype) + struct vnode *snapvp; + struct vnode *cancelvp; + int level; + ufs1_daddr_t blkno; + ufs_lbn_t lbn; + ufs_lbn_t rlbn; + ufs_lbn_t remblks; + ufs_lbn_t blksperindir; + struct fs *fs; + int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, + struct fs *, ufs_lbn_t, int); + int expungetype; +{ + int error, num, i; + ufs_lbn_t subblksperindir; + struct indir indirs[NIADDR + 2]; + ufs1_daddr_t last, *bap; + struct buf *bp; + + if (blkno == 0) { + if (expungetype == BLK_NOCOPY) + return (0); + panic("indiracct_ufs1: missing indir"); + } + if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) + return (error); + if (lbn != indirs[num - 1 - level].in_lbn || num < 2) + panic("indiracct_ufs1: botched params"); + /* + * We have to expand bread here since it will deadlock looking + * up the block number for any blocks that are not in the cache. 
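+ * Instead, getblk() obtains the buffer, the physical block number is + * filled in by hand, and readblock() fetches the contents only when + * the buffer is not already valid.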
+ */ + bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); + bp->b_blkno = fsbtodb(fs, blkno); + if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && + (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { + brelse(bp); + return (error); + } + /* + * Account for the block pointers in this indirect block. + */ + last = howmany(remblks, blksperindir); + if (last > NINDIR(fs)) + last = NINDIR(fs); + bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); + bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); + bqrelse(bp); + error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, + level == 0 ? rlbn : -1, expungetype); + if (error || level == 0) + goto out; + /* + * Account for the block pointers in each of the indirect blocks + * in the levels below us. + */ + subblksperindir = blksperindir / NINDIR(fs); + for (lbn++, level--, i = 0; i < last; i++) { + error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, + rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); + if (error) + goto out; + rlbn += blksperindir; + lbn -= blksperindir; + remblks -= blksperindir; + } +out: + free(bap, M_DEVBUF); + return (error); +} + +/* + * Do both snap accounting and map accounting. + */ +static int +fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) + struct vnode *vp; + ufs1_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int exptype; /* BLK_SNAP or BLK_NOCOPY */ +{ + int error; + + if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) + return (error); + return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); +} + +/* + * Identify a set of blocks allocated in a snapshot inode. + */ +static int +snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) + struct vnode *vp; + ufs1_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int expungetype; /* BLK_SNAP or BLK_NOCOPY */ +{ + struct inode *ip = VTOI(vp); + ufs1_daddr_t blkno, *blkp; + ufs_lbn_t lbn; + struct buf *ibp; + int error; + + for ( ; oldblkp < lastblkp; oldblkp++) { + blkno = *oldblkp; + if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) + continue; + lbn = fragstoblks(fs, blkno); + if (lbn < NDADDR) { + blkp = &ip->i_din1->di_db[lbn]; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + if (error) + return (error); + blkp = &((ufs1_daddr_t *)(ibp->b_data)) + [(lbn - NDADDR) % NINDIR(fs)]; + } + /* + * If we are expunging a snapshot vnode and we + * find a block marked BLK_NOCOPY, then it is + * one that has been allocated to this snapshot after + * we took our current snapshot and can be ignored. + */ + if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { + if (lbn >= NDADDR) + brelse(ibp); + } else { + if (*blkp != 0) + panic("snapacct_ufs1: bad block"); + *blkp = expungetype; + if (lbn >= NDADDR) + bdwrite(ibp); + } + } + return (0); +} + +/* + * Account for a set of blocks allocated in a snapshot inode. 
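+ * Pointers still marked BLK_SNAP are freed by their logical block + * number, since claimed blocks always have a block number equal to + * their logical block number; ordinary pointers are freed directly.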
+ */ +static int +mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) + struct vnode *vp; + ufs1_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int expungetype; +{ + ufs1_daddr_t blkno; + struct inode *ip; + ino_t inum; + int acctit; + + ip = VTOI(vp); + inum = ip->i_number; + if (lblkno == -1) + acctit = 0; + else + acctit = 1; + for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { + blkno = *oldblkp; + if (blkno == 0 || blkno == BLK_NOCOPY) + continue; + if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) + *ip->i_snapblklist++ = lblkno; + if (blkno == BLK_SNAP) + blkno = blkstofrags(fs, lblkno); + ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, + vp->v_type, NULL); + } + return (0); +} + +/* + * Before expunging a snapshot inode, note all the + * blocks that it claims with BLK_SNAP so that fsck will + * be able to account for those blocks properly and so + * that this snapshot knows that it need not copy them + * if the other snapshot holding them is freed. This code + * is reproduced once each for UFS1 and UFS2. + */ +static int +expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) + struct vnode *snapvp; + struct inode *cancelip; + struct fs *fs; + int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, + struct fs *, ufs_lbn_t, int); + int expungetype; + int clearmode; +{ + int i, error, indiroff; + ufs_lbn_t lbn, rlbn; + ufs2_daddr_t len, blkno, numblks, blksperindir; + struct ufs2_dinode *dip; + struct thread *td = curthread; + struct buf *bp; + + /* + * Prepare to expunge the inode. If its inode block has not + * yet been copied, then allocate and fill the copy. + */ + lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); + blkno = 0; + if (lbn < NDADDR) { + blkno = VTOI(snapvp)->i_din2->di_db[lbn]; + } else { + if (DOINGSOFTDEP(snapvp)) + softdep_prealloc(snapvp, MNT_WAIT); + td->td_pflags |= TDP_COWINPROGRESS; + error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + return (error); + indiroff = (lbn - NDADDR) % NINDIR(fs); + blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; + bqrelse(bp); + } + if (blkno != 0) { + if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) + return (error); + } else { + error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &bp); + if (error) + return (error); + if ((error = readblock(snapvp, bp, lbn)) != 0) + return (error); + } + /* + * Set a snapshot inode to be a zero length file, regular files + * to be completely unallocated. + */ + dip = (struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, cancelip->i_number); + if (clearmode || cancelip->i_effnlink == 0) + dip->di_mode = 0; + dip->di_size = 0; + dip->di_blocks = 0; + dip->di_flags &= ~SF_SNAPSHOT; + bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); + bdwrite(bp); + /* + * Now go through and expunge all the blocks in the file + * using the function requested. 
+ */ + numblks = howmany(cancelip->i_size, fs->fs_bsize); + if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], + &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) + return (error); + if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], + &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) + return (error); + blksperindir = 1; + lbn = -NDADDR; + len = numblks - NDADDR; + rlbn = NDADDR; + for (i = 0; len > 0 && i < NIADDR; i++) { + error = indiracct_ufs2(snapvp, ITOV(cancelip), i, + cancelip->i_din2->di_ib[i], lbn, rlbn, len, + blksperindir, fs, acctfunc, expungetype); + if (error) + return (error); + blksperindir *= NINDIR(fs); + lbn -= blksperindir + 1; + len -= blksperindir; + rlbn += blksperindir; + } + return (0); +} + +/* + * Descend an indirect block chain for vnode cancelvp accounting for all + * its indirect blocks in snapvp. + */ +static int +indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, + blksperindir, fs, acctfunc, expungetype) + struct vnode *snapvp; + struct vnode *cancelvp; + int level; + ufs2_daddr_t blkno; + ufs_lbn_t lbn; + ufs_lbn_t rlbn; + ufs_lbn_t remblks; + ufs_lbn_t blksperindir; + struct fs *fs; + int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, + struct fs *, ufs_lbn_t, int); + int expungetype; +{ + int error, num, i; + ufs_lbn_t subblksperindir; + struct indir indirs[NIADDR + 2]; + ufs2_daddr_t last, *bap; + struct buf *bp; + + if (blkno == 0) { + if (expungetype == BLK_NOCOPY) + return (0); + panic("indiracct_ufs2: missing indir"); + } + if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) + return (error); + if (lbn != indirs[num - 1 - level].in_lbn || num < 2) + panic("indiracct_ufs2: botched params"); + /* + * We have to expand bread here since it will deadlock looking + * up the block number for any blocks that are not in the cache. + */ + bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); + bp->b_blkno = fsbtodb(fs, blkno); + if ((bp->b_flags & B_CACHE) == 0 && + (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { + brelse(bp); + return (error); + } + /* + * Account for the block pointers in this indirect block. + */ + last = howmany(remblks, blksperindir); + if (last > NINDIR(fs)) + last = NINDIR(fs); + bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); + bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); + bqrelse(bp); + error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, + level == 0 ? rlbn : -1, expungetype); + if (error || level == 0) + goto out; + /* + * Account for the block pointers in each of the indirect blocks + * in the levels below us. + */ + subblksperindir = blksperindir / NINDIR(fs); + for (lbn++, level--, i = 0; i < last; i++) { + error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, + rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); + if (error) + goto out; + rlbn += blksperindir; + lbn -= blksperindir; + remblks -= blksperindir; + } +out: + free(bap, M_DEVBUF); + return (error); +} + +/* + * Do both snap accounting and map accounting. + */ +static int +fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) + struct vnode *vp; + ufs2_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int exptype; /* BLK_SNAP or BLK_NOCOPY */ +{ + int error; + + if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) + return (error); + return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); +} + +/* + * Identify a set of blocks allocated in a snapshot inode. 
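+ * Each block found is recorded in this snapshot's own block map with + * the given expunge type (BLK_SNAP or BLK_NOCOPY).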
+ */ +static int +snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) + struct vnode *vp; + ufs2_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int expungetype; /* BLK_SNAP or BLK_NOCOPY */ +{ + struct inode *ip = VTOI(vp); + ufs2_daddr_t blkno, *blkp; + ufs_lbn_t lbn; + struct buf *ibp; + int error; + + for ( ; oldblkp < lastblkp; oldblkp++) { + blkno = *oldblkp; + if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) + continue; + lbn = fragstoblks(fs, blkno); + if (lbn < NDADDR) { + blkp = &ip->i_din2->di_db[lbn]; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else { + error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + if (error) + return (error); + blkp = &((ufs2_daddr_t *)(ibp->b_data)) + [(lbn - NDADDR) % NINDIR(fs)]; + } + /* + * If we are expunging a snapshot vnode and we + * find a block marked BLK_NOCOPY, then it is + * one that has been allocated to this snapshot after + * we took our current snapshot and can be ignored. + */ + if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { + if (lbn >= NDADDR) + brelse(ibp); + } else { + if (*blkp != 0) + panic("snapacct_ufs2: bad block"); + *blkp = expungetype; + if (lbn >= NDADDR) + bdwrite(ibp); + } + } + return (0); +} + +/* + * Account for a set of blocks allocated in a snapshot inode. + */ +static int +mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) + struct vnode *vp; + ufs2_daddr_t *oldblkp, *lastblkp; + struct fs *fs; + ufs_lbn_t lblkno; + int expungetype; +{ + ufs2_daddr_t blkno; + struct inode *ip; + ino_t inum; + int acctit; + + ip = VTOI(vp); + inum = ip->i_number; + if (lblkno == -1) + acctit = 0; + else + acctit = 1; + for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { + blkno = *oldblkp; + if (blkno == 0 || blkno == BLK_NOCOPY) + continue; + if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) + *ip->i_snapblklist++ = lblkno; + if (blkno == BLK_SNAP) + blkno = blkstofrags(fs, lblkno); + ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, + vp->v_type, NULL); + } + return (0); +} + +/* + * Decrement extra reference on snapshot when last name is removed. + * It will not be freed until the last open reference goes away. + */ +void +ffs_snapgone(ip) + struct inode *ip; +{ + struct inode *xp; + struct fs *fs; + int snaploc; + struct snapdata *sn; + struct ufsmount *ump; + + /* + * Find snapshot in incore list. + */ + xp = NULL; + sn = ITODEVVP(ip)->v_rdev->si_snapdata; + if (sn != NULL) + TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) + if (xp == ip) + break; + if (xp != NULL) + vrele(ITOV(ip)); + else if (snapdebug) + printf("ffs_snapgone: lost snapshot vnode %ju\n", + (uintmax_t)ip->i_number); + /* + * Delete snapshot inode from superblock. Keep list dense. + */ + ump = ITOUMP(ip); + fs = ump->um_fs; + UFS_LOCK(ump); + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) + if (fs->fs_snapinum[snaploc] == ip->i_number) + break; + if (snaploc < FSMAXSNAP) { + for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; + } + fs->fs_snapinum[snaploc - 1] = 0; + } + UFS_UNLOCK(ump); +} + +/* + * Prepare a snapshot file for being removed. 
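+ * The vnode is switched from the shared snapshot lock back to its + * private lock, all BLK_NOCOPY and BLK_SNAP pointers are cleared, and + * blocks this snapshot had claimed are offered to the remaining + * snapshots via ffs_snapblkfree().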
+ */ +void +ffs_snapremove(vp) + struct vnode *vp; +{ + struct inode *ip; + struct vnode *devvp; + struct buf *ibp; + struct fs *fs; + ufs2_daddr_t numblks, blkno, dblk; + int error, i, last, loc; + struct snapdata *sn; + + ip = VTOI(vp); + fs = ITOFS(ip); + devvp = ITODEVVP(ip); + /* + * If active, delete from incore list (this snapshot may + * already have been in the process of being deleted, so + * would not have been active). + * + * Clear copy-on-write flag if last snapshot. + */ + VI_LOCK(devvp); + if (ip->i_nextsnap.tqe_prev != 0) { + sn = devvp->v_rdev->si_snapdata; + TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); + ip->i_nextsnap.tqe_prev = 0; + VI_UNLOCK(devvp); + lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); + for (i = 0; i < sn->sn_lock.lk_recurse; i++) + lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); + KASSERT(vp->v_vnlock == &sn->sn_lock, + ("ffs_snapremove: lost lock mutation")); + vp->v_vnlock = &vp->v_lock; + VI_LOCK(devvp); + while (sn->sn_lock.lk_recurse > 0) + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); + try_free_snapdata(devvp); + } else + VI_UNLOCK(devvp); + /* + * Clear all BLK_NOCOPY fields. Pass any block claims to other + * snapshots that want them (see ffs_snapblkfree below). + */ + for (blkno = 1; blkno < NDADDR; blkno++) { + dblk = DIP(ip, i_db[blkno]); + if (dblk == 0) + continue; + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + DIP_SET(ip, i_db[blkno], 0); + else if ((dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, + ip->i_number, vp->v_type, NULL))) { + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - + btodb(fs->fs_bsize)); + DIP_SET(ip, i_db[blkno], 0); + } + } + numblks = howmany(ip->i_size, fs->fs_bsize); + for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + if (error) + continue; + if (fs->fs_size - blkno > NINDIR(fs)) + last = NINDIR(fs); + else + last = fs->fs_size - blkno; + for (loc = 0; loc < last; loc++) { + if (I_IS_UFS1(ip)) { + dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; + if (dblk == 0) + continue; + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; + else if ((dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, + fs->fs_bsize, ip->i_number, vp->v_type, + NULL))) { + ip->i_din1->di_blocks -= + btodb(fs->fs_bsize); + ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; + } + continue; + } + dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; + if (dblk == 0) + continue; + if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) + ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; + else if ((dblk == blkstofrags(fs, blkno) && + ffs_snapblkfree(fs, ITODEVVP(ip), dblk, + fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { + ip->i_din2->di_blocks -= btodb(fs->fs_bsize); + ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; + } + } + bawrite(ibp); + } + /* + * Clear snapshot flag and drop reference. + */ + ip->i_flags &= ~SF_SNAPSHOT; + DIP_SET(ip, i_flags, ip->i_flags); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * The dirtied indirects must be written out before + * softdep_setup_freeblocks() is called. Otherwise indir_trunc() + * may find indirect pointers using the magic BLK_* values. + */ + if (DOINGSOFTDEP(vp)) + ffs_syncvnode(vp, MNT_WAIT, 0); +#ifdef QUOTA + /* + * Reenable disk quotas for ex-snapshot file. + */ + if (!getinoquota(ip)) + (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); +#endif +} + +/* + * Notification that a block is being freed. 
Return zero if the free + * should be allowed to proceed. Return non-zero if the snapshot file + * wants to claim the block. The block will be claimed if it is an + * uncopied part of one of the snapshots. It will be freed if it is + * either a BLK_NOCOPY or has already been copied in all of the snapshots. + * If a fragment is being freed, then all snapshots that care about + * it must make a copy since a snapshot file can only claim full sized + * blocks. Note that if more than one snapshot file maps the block, + * we can pick one at random to claim it. Since none of the snapshots + * can change, we are assurred that they will all see the same unmodified + * image. When deleting a snapshot file (see ffs_snapremove above), we + * must push any of these claimed blocks to one of the other snapshots + * that maps it. These claimed blocks are easily identified as they will + * have a block number equal to their logical block number within the + * snapshot. A copied block can never have this property because they + * must always have been allocated from a BLK_NOCOPY location. + */ +int +ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) + struct fs *fs; + struct vnode *devvp; + ufs2_daddr_t bno; + long size; + ino_t inum; + enum vtype vtype; + struct workhead *wkhd; +{ + struct buf *ibp, *cbp, *savedcbp = NULL; + struct thread *td = curthread; + struct inode *ip; + struct vnode *vp = NULL; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int indiroff = 0, error = 0, claimedblk = 0; + struct snapdata *sn; + + lbn = fragstoblks(fs, bno); +retry: + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL) { + VI_UNLOCK(devvp); + return (0); + } + if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, + VI_MTX(devvp)) != 0) + goto retry; + TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { + vp = ITOV(ip); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + /* + * Lookup block being written. + */ + if (lbn < NDADDR) { + blkno = DIP(ip, i_db[lbn]); + } else { + td->td_pflags |= TDP_COWINPROGRESS; + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + break; + indiroff = (lbn - NDADDR) % NINDIR(fs); + if (I_IS_UFS1(ip)) + blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; + else + blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; + } + /* + * Check to see if block needs to be copied. + */ + if (blkno == 0) { + /* + * A block that we map is being freed. If it has not + * been claimed yet, we will claim or copy it (below). + */ + claimedblk = 1; + } else if (blkno == BLK_SNAP) { + /* + * No previous snapshot claimed the block, + * so it will be freed and become a BLK_NOCOPY + * (don't care) for us. + */ + if (claimedblk) + panic("snapblkfree: inconsistent block type"); + if (lbn < NDADDR) { + DIP_SET(ip, i_db[lbn], BLK_NOCOPY); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else if (I_IS_UFS1(ip)) { + ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = + BLK_NOCOPY; + bdwrite(ibp); + } else { + ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = + BLK_NOCOPY; + bdwrite(ibp); + } + continue; + } else /* BLK_NOCOPY or default */ { + /* + * If the snapshot has already copied the block + * (default), or does not care about the block, + * it is not needed. + */ + if (lbn >= NDADDR) + bqrelse(ibp); + continue; + } + /* + * If this is a full size block, we will just grab it + * and assign it to the snapshot inode. Otherwise we + * will proceed to copy it. 
See explanation for this + * routine as to why only a single snapshot needs to + * claim this block. + */ + if (size == fs->fs_bsize) { +#ifdef DEBUG + if (snapdebug) + printf("%s %ju lbn %jd from inum %ju\n", + "Grabonremove: snapino", + (uintmax_t)ip->i_number, + (intmax_t)lbn, (uintmax_t)inum); +#endif + /* + * If journaling is tracking this write we must add + * the work to the inode or indirect being written. + */ + if (wkhd != NULL) { + if (lbn < NDADDR) + softdep_inode_append(ip, + curthread->td_ucred, wkhd); + else + softdep_buf_append(ibp, wkhd); + } + if (lbn < NDADDR) { + DIP_SET(ip, i_db[lbn], bno); + } else if (I_IS_UFS1(ip)) { + ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; + bdwrite(ibp); + } else { + ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; + bdwrite(ibp); + } + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + lockmgr(vp->v_vnlock, LK_RELEASE, NULL); + return (1); + } + if (lbn >= NDADDR) + bqrelse(ibp); + /* + * Allocate the block into which to do the copy. Note that this + * allocation will never require any additional allocations for + * the snapshot inode. + */ + td->td_pflags |= TDP_COWINPROGRESS; + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &cbp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + break; +#ifdef DEBUG + if (snapdebug) + printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", + "Copyonremove: snapino ", (uintmax_t)ip->i_number, + (intmax_t)lbn, "for inum", (uintmax_t)inum, size, + (intmax_t)cbp->b_blkno); +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. At a minimum we ensure the + * integrity of the filesystem metadata, but use the + * dopersistence sysctl-setable flag to decide on the + * persistence needed for file content data. + */ + if (savedcbp != NULL) { + bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); + bawrite(cbp); + if ((vtype == VDIR || dopersistence) && + ip->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + continue; + } + /* + * Otherwise, read the old block contents into the buffer. + */ + if ((error = readblock(vp, cbp, lbn)) != 0) { + bzero(cbp->b_data, fs->fs_bsize); + bawrite(cbp); + if ((vtype == VDIR || dopersistence) && + ip->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + break; + } + savedcbp = cbp; + } + /* + * Note that we need to synchronously write snapshots that + * have not been unlinked, and hence will be visible after + * a crash, to ensure their integrity. At a minimum we + * ensure the integrity of the filesystem metadata, but + * use the dopersistence sysctl-setable flag to decide on + * the persistence needed for file content data. + */ + if (savedcbp) { + vp = savedcbp->b_vp; + bawrite(savedcbp); + if ((vtype == VDIR || dopersistence) && + VTOI(vp)->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + } + /* + * If we have been unable to allocate a block in which to do + * the copy, then return non-zero so that the fragment will + * not be freed. Although space will be lost, the snapshot + * will stay consistent. + */ + if (error != 0 && wkhd != NULL) + softdep_freework(wkhd); + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); + return (error); +} + +/* + * Associate snapshot files when mounting. 
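+ * Each inode recorded in fs_snapinum is loaded, relinked onto the + * active snapshot list, and the preallocated block list is read back + * from the newest snapshot.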
+ */ +void +ffs_snapshot_mount(mp) + struct mount *mp; +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct vnode *devvp = ump->um_devvp; + struct fs *fs = ump->um_fs; + struct thread *td = curthread; + struct snapdata *sn; + struct vnode *vp; + struct vnode *lastvp; + struct inode *ip; + struct uio auio; + struct iovec aiov; + void *snapblklist; + char *reason; + daddr_t snaplistsize; + int error, snaploc, loc; + + /* + * XXX The following needs to be set before ffs_truncate or + * VOP_READ can be called. + */ + mp->mnt_stat.f_iosize = fs->fs_bsize; + /* + * Process each snapshot listed in the superblock. + */ + vp = NULL; + lastvp = NULL; + sn = NULL; + for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { + if (fs->fs_snapinum[snaploc] == 0) + break; + if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], + LK_EXCLUSIVE, &vp)) != 0){ + printf("ffs_snapshot_mount: vget failed %d\n", error); + continue; + } + ip = VTOI(vp); + if (!IS_SNAPSHOT(ip) || ip->i_size == + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { + if (!IS_SNAPSHOT(ip)) { + reason = "non-snapshot"; + } else { + reason = "old format snapshot"; + (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); + (void)ffs_syncvnode(vp, MNT_WAIT, 0); + } + printf("ffs_snapshot_mount: %s inode %d\n", + reason, fs->fs_snapinum[snaploc]); + vput(vp); + vp = NULL; + for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { + if (fs->fs_snapinum[loc] == 0) + break; + fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; + } + fs->fs_snapinum[loc - 1] = 0; + snaploc--; + continue; + } + /* + * Acquire a lock on the snapdata structure, creating it if + * necessary. + */ + sn = ffs_snapdata_acquire(devvp); + /* + * Change vnode to use shared snapshot lock instead of the + * original private lock. + */ + vp->v_vnlock = &sn->sn_lock; + lockmgr(&vp->v_lock, LK_RELEASE, NULL); + /* + * Link it onto the active snapshot list. + */ + VI_LOCK(devvp); + if (ip->i_nextsnap.tqe_prev != 0) + panic("ffs_snapshot_mount: %ju already on list", + (uintmax_t)ip->i_number); + else + TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); + vp->v_vflag |= VV_SYSTEM; + VI_UNLOCK(devvp); + VOP_UNLOCK(vp, 0); + lastvp = vp; + } + vp = lastvp; + /* + * No usable snapshots found. + */ + if (sn == NULL || vp == NULL) + return; + /* + * Allocate the space for the block hints list. We always want to + * use the list from the newest snapshot. 
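+ * The list, with its length stored as the first element, sits at the + * very end of the snapshot file, just past the last filesystem block.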
+ */ + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (void *)&snaplistsize; + aiov.iov_len = sizeof(snaplistsize); + auio.uio_resid = aiov.iov_len; + auio.uio_offset = + lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = td; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { + printf("ffs_snapshot_mount: read_1 failed %d\n", error); + VOP_UNLOCK(vp, 0); + return; + } + snapblklist = malloc(snaplistsize * sizeof(daddr_t), + M_UFSMNT, M_WAITOK); + auio.uio_iovcnt = 1; + aiov.iov_base = snapblklist; + aiov.iov_len = snaplistsize * sizeof (daddr_t); + auio.uio_resid = aiov.iov_len; + auio.uio_offset -= sizeof(snaplistsize); + if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { + printf("ffs_snapshot_mount: read_2 failed %d\n", error); + VOP_UNLOCK(vp, 0); + free(snapblklist, M_UFSMNT); + return; + } + VOP_UNLOCK(vp, 0); + VI_LOCK(devvp); + ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); + sn->sn_listsize = snaplistsize; + sn->sn_blklist = (daddr_t *)snapblklist; + devvp->v_vflag |= VV_COPYONWRITE; + VI_UNLOCK(devvp); +} + +/* + * Disassociate snapshot files when unmounting. + */ +void +ffs_snapshot_unmount(mp) + struct mount *mp; +{ + struct vnode *devvp = VFSTOUFS(mp)->um_devvp; + struct snapdata *sn; + struct inode *xp; + struct vnode *vp; + + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { + vp = ITOV(xp); + TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); + xp->i_nextsnap.tqe_prev = 0; + lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, + VI_MTX(devvp)); + lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); + KASSERT(vp->v_vnlock == &sn->sn_lock, + ("ffs_snapshot_unmount: lost lock mutation")); + vp->v_vnlock = &vp->v_lock; + lockmgr(&vp->v_lock, LK_RELEASE, NULL); + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); + if (xp->i_effnlink > 0) + vrele(vp); + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + } + try_free_snapdata(devvp); + ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); +} + +/* + * Check the buffer block to be belong to device buffer that shall be + * locked after snaplk. devvp shall be locked on entry, and will be + * leaved locked upon exit. 
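+ * Returns non-zero when the buffer's block is found in the snapshot + * block list, zero otherwise.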
+ */ +static int +ffs_bp_snapblk(devvp, bp) + struct vnode *devvp; + struct buf *bp; +{ + struct snapdata *sn; + struct fs *fs; + ufs2_daddr_t lbn, *snapblklist; + int lower, upper, mid; + + ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); + KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) + return (0); + fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); + lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); + snapblklist = sn->sn_blklist; + upper = sn->sn_listsize - 1; + lower = 1; + while (lower <= upper) { + mid = (lower + upper) / 2; + if (snapblklist[mid] == lbn) + break; + if (snapblklist[mid] < lbn) + lower = mid + 1; + else + upper = mid - 1; + } + if (lower <= upper) + return (1); + return (0); +} + +void +ffs_bdflush(bo, bp) + struct bufobj *bo; + struct buf *bp; +{ + struct thread *td; + struct vnode *vp, *devvp; + struct buf *nbp; + int bp_bdskip; + + if (bo->bo_dirty.bv_cnt <= dirtybufthresh) + return; + + td = curthread; + vp = bp->b_vp; + devvp = bo->__bo_vnode; + KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); + + VI_LOCK(devvp); + bp_bdskip = ffs_bp_snapblk(devvp, bp); + if (bp_bdskip) + bdwriteskip++; + VI_UNLOCK(devvp); + if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { + (void) VOP_FSYNC(vp, MNT_NOWAIT, td); + altbufferflushes++; + } else { + BO_LOCK(bo); + /* + * Try to find a buffer to flush. + */ + TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { + if ((nbp->b_vflags & BV_BKGRDINPROG) || + BUF_LOCK(nbp, + LK_EXCLUSIVE | LK_NOWAIT, NULL)) + continue; + if (bp == nbp) + panic("bdwrite: found ourselves"); + BO_UNLOCK(bo); + /* + * Don't countdeps with the bo lock + * held. + */ + if (buf_countdeps(nbp, 0)) { + BO_LOCK(bo); + BUF_UNLOCK(nbp); + continue; + } + if (bp_bdskip) { + VI_LOCK(devvp); + if (!ffs_bp_snapblk(vp, nbp)) { + VI_UNLOCK(devvp); + BO_LOCK(bo); + BUF_UNLOCK(nbp); + continue; + } + VI_UNLOCK(devvp); + } + if (nbp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(nbp); + } else { + bremfree(nbp); + bawrite(nbp); + } + dirtybufferflushes++; + break; + } + if (nbp == NULL) + BO_UNLOCK(bo); + } +} + +/* + * Check for need to copy block that is about to be written, + * copying the block if necessary. + */ +int +ffs_copyonwrite(devvp, bp) + struct vnode *devvp; + struct buf *bp; +{ + struct snapdata *sn; + struct buf *ibp, *cbp, *savedcbp = NULL; + struct thread *td = curthread; + struct fs *fs; + struct inode *ip; + struct vnode *vp = NULL; + ufs2_daddr_t lbn, blkno, *snapblklist; + int lower, upper, mid, indiroff, error = 0; + int launched_async_io, prev_norunningbuf; + long saved_runningbufspace; + + if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) + return (0); /* Update on a snapshot file */ + if (td->td_pflags & TDP_COWINPROGRESS) + panic("ffs_copyonwrite: recursive call"); + /* + * First check to see if it is in the preallocated list. + * By doing this check we avoid several potential deadlocks. 
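+ * The list is sorted by logical block number, so a binary search under + * the vnode interlock suffices and the snapshot lock is not taken in + * the common case.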
+ */ + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL || + TAILQ_EMPTY(&sn->sn_head)) { + VI_UNLOCK(devvp); + return (0); /* No snapshot */ + } + ip = TAILQ_FIRST(&sn->sn_head); + fs = ITOFS(ip); + lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); + snapblklist = sn->sn_blklist; + upper = sn->sn_listsize - 1; + lower = 1; + while (lower <= upper) { + mid = (lower + upper) / 2; + if (snapblklist[mid] == lbn) + break; + if (snapblklist[mid] < lbn) + lower = mid + 1; + else + upper = mid - 1; + } + if (lower <= upper) { + VI_UNLOCK(devvp); + return (0); + } + launched_async_io = 0; + prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; + /* + * Since I/O on bp isn't yet in progress and it may be blocked + * for a long time waiting on snaplk, back it out of + * runningbufspace, possibly waking other threads waiting for space. + */ + saved_runningbufspace = bp->b_runningbufspace; + if (saved_runningbufspace != 0) + runningbufwakeup(bp); + /* + * Not in the precomputed list, so check the snapshots. + */ + while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, + VI_MTX(devvp)) != 0) { + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL || + TAILQ_EMPTY(&sn->sn_head)) { + VI_UNLOCK(devvp); + if (saved_runningbufspace != 0) { + bp->b_runningbufspace = saved_runningbufspace; + atomic_add_long(&runningbufspace, + bp->b_runningbufspace); + } + return (0); /* Snapshot gone */ + } + } + TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { + vp = ITOV(ip); + if (DOINGSOFTDEP(vp)) + softdep_prealloc(vp, MNT_WAIT); + /* + * We ensure that everything of our own that needs to be + * copied will be done at the time that ffs_snapshot is + * called. Thus we can skip the check here which can + * deadlock in doing the lookup in UFS_BALLOC. + */ + if (bp->b_vp == vp) + continue; + /* + * Check to see if block needs to be copied. We do not have + * to hold the snapshot lock while doing this lookup as it + * will never require any additional allocations for the + * snapshot inode. + */ + if (lbn < NDADDR) { + blkno = DIP(ip, i_db[lbn]); + } else { + td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + break; + indiroff = (lbn - NDADDR) % NINDIR(fs); + if (I_IS_UFS1(ip)) + blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; + else + blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; + bqrelse(ibp); + } +#ifdef INVARIANTS + if (blkno == BLK_SNAP && bp->b_lblkno >= 0) + panic("ffs_copyonwrite: bad copy block"); +#endif + if (blkno != 0) + continue; + /* + * Allocate the block into which to do the copy. Since + * multiple processes may all try to copy the same block, + * we have to recheck our need to do a copy if we sleep + * waiting for the lock. + * + * Because all snapshots on a filesystem share a single + * lock, we ensure that we will never be in competition + * with another process to allocate a block. 
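+ *
+ * The TDP_COWINPROGRESS flag set around UFS_BALLOC() below also lets
+ * the recursion check at the top of this function catch any attempt to
+ * re-enter the copy-on-write path from inside the allocation.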
+ */ + td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; + error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), + fs->fs_bsize, KERNCRED, 0, &cbp); + td->td_pflags &= ~TDP_COWINPROGRESS; + if (error) + break; +#ifdef DEBUG + if (snapdebug) { + printf("Copyonwrite: snapino %ju lbn %jd for ", + (uintmax_t)ip->i_number, (intmax_t)lbn); + if (bp->b_vp == devvp) + printf("fs metadata"); + else + printf("inum %ju", + (uintmax_t)VTOI(bp->b_vp)->i_number); + printf(" lblkno %jd to blkno %jd\n", + (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); + } +#endif + /* + * If we have already read the old block contents, then + * simply copy them to the new block. Note that we need + * to synchronously write snapshots that have not been + * unlinked, and hence will be visible after a crash, + * to ensure their integrity. At a minimum we ensure the + * integrity of the filesystem metadata, but use the + * dopersistence sysctl-setable flag to decide on the + * persistence needed for file content data. + */ + if (savedcbp != NULL) { + bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); + bawrite(cbp); + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && ip->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + else + launched_async_io = 1; + continue; + } + /* + * Otherwise, read the old block contents into the buffer. + */ + if ((error = readblock(vp, cbp, lbn)) != 0) { + bzero(cbp->b_data, fs->fs_bsize); + bawrite(cbp); + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && ip->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + else + launched_async_io = 1; + break; + } + savedcbp = cbp; + } + /* + * Note that we need to synchronously write snapshots that + * have not been unlinked, and hence will be visible after + * a crash, to ensure their integrity. At a minimum we + * ensure the integrity of the filesystem metadata, but + * use the dopersistence sysctl-setable flag to decide on + * the persistence needed for file content data. + */ + if (savedcbp) { + vp = savedcbp->b_vp; + bawrite(savedcbp); + if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || + dopersistence) && VTOI(vp)->i_effnlink > 0) + (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); + else + launched_async_io = 1; + } + lockmgr(vp->v_vnlock, LK_RELEASE, NULL); + td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | + prev_norunningbuf; + if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) + waitrunningbufspace(); + /* + * I/O on bp will now be started, so count it in runningbufspace. + */ + if (saved_runningbufspace != 0) { + bp->b_runningbufspace = saved_runningbufspace; + atomic_add_long(&runningbufspace, bp->b_runningbufspace); + } + return (error); +} + +/* + * sync snapshots to force freework records waiting on snapshots to claim + * blocks to free. + */ +void +ffs_sync_snap(mp, waitfor) + struct mount *mp; + int waitfor; +{ + struct snapdata *sn; + struct vnode *devvp; + struct vnode *vp; + struct inode *ip; + + devvp = VFSTOUFS(mp)->um_devvp; + if ((devvp->v_vflag & VV_COPYONWRITE) == 0) + return; + for (;;) { + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL) { + VI_UNLOCK(devvp); + return; + } + if (lockmgr(&sn->sn_lock, + LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, + VI_MTX(devvp)) == 0) + break; + } + TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { + vp = ITOV(ip); + ffs_syncvnode(vp, waitfor, NO_INO_UPDT); + } + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); +} + +/* + * Read the specified block into the given buffer. 
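+ * The read is issued directly to the underlying GEOM consumer as a
+ * BIO_READ request and waited on with biowait(), rather than going
+ * through bread() on the device vnode.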
+ * Much of this boiler-plate comes from bwrite(). + */ +static int +readblock(vp, bp, lbn) + struct vnode *vp; + struct buf *bp; + ufs2_daddr_t lbn; +{ + struct inode *ip = VTOI(vp); + struct bio *bip; + struct fs *fs; + + ip = VTOI(vp); + fs = ITOFS(ip); + + bip = g_alloc_bio(); + bip->bio_cmd = BIO_READ; + bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); + bip->bio_data = bp->b_data; + bip->bio_length = bp->b_bcount; + bip->bio_done = NULL; + + g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); + bp->b_error = biowait(bip, "snaprdb"); + g_destroy_bio(bip); + return (bp->b_error); +} + +#endif + +/* + * Process file deletes that were deferred by ufs_inactive() due to + * the file system being suspended. Transfer IN_LAZYACCESS into + * IN_MODIFIED for vnodes that were accessed during suspension. + */ +void +process_deferred_inactive(struct mount *mp) +{ + struct vnode *vp, *mvp; + struct inode *ip; + struct thread *td; + int error; + + td = curthread; + (void) vn_start_secondary_write(NULL, &mp, V_WAIT); + loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + /* + * IN_LAZYACCESS is checked here without holding any + * vnode lock, but this flag is set only while holding + * vnode interlock. + */ + if (vp->v_type == VNON || + ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && + ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { + VI_UNLOCK(vp); + continue; + } + vholdl(vp); + error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); + if (error != 0) { + vdrop(vp); + if (error == ENOENT) + continue; /* vnode recycled */ + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + ip = VTOI(vp); + if ((ip->i_flag & IN_LAZYACCESS) != 0) { + ip->i_flag &= ~IN_LAZYACCESS; + ip->i_flag |= IN_MODIFIED; + } + VI_LOCK(vp); + if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { + VI_UNLOCK(vp); + VOP_UNLOCK(vp, 0); + vdrop(vp); + continue; + } + vinactive(vp, td); + VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, + ("process_deferred_inactive: got VI_OWEINACT")); + VI_UNLOCK(vp); + VOP_UNLOCK(vp, 0); + vdrop(vp); + } + vn_finished_secondary_write(mp); +} + +#ifndef NO_FFS_SNAPSHOT + +static struct snapdata * +ffs_snapdata_alloc(void) +{ + struct snapdata *sn; + + /* + * Fetch a snapdata from the free list if there is one available. + */ + mtx_lock(&snapfree_lock); + sn = LIST_FIRST(&snapfree); + if (sn != NULL) + LIST_REMOVE(sn, sn_link); + mtx_unlock(&snapfree_lock); + if (sn != NULL) + return (sn); + /* + * If there were no free snapdatas allocate one. + */ + sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); + TAILQ_INIT(&sn->sn_head); + lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, + LK_CANRECURSE | LK_NOSHARE); + return (sn); +} + +/* + * The snapdata is never freed because we can not be certain that + * there are no threads sleeping on the snap lock. Persisting + * them permanently avoids costly synchronization in ffs_lock(). 
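+ * Consequently ffs_snapdata_free() below only returns the structure to
+ * the snapfree list, from which ffs_snapdata_alloc() above recycles
+ * entries before falling back to malloc().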
+ */ +static void +ffs_snapdata_free(struct snapdata *sn) +{ + mtx_lock(&snapfree_lock); + LIST_INSERT_HEAD(&snapfree, sn, sn_link); + mtx_unlock(&snapfree_lock); +} + +/* Try to free snapdata associated with devvp */ +static void +try_free_snapdata(struct vnode *devvp) +{ + struct snapdata *sn; + ufs2_daddr_t *snapblklist; + + ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); + sn = devvp->v_rdev->si_snapdata; + + if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || + (devvp->v_vflag & VV_COPYONWRITE) == 0) { + VI_UNLOCK(devvp); + return; + } + + devvp->v_rdev->si_snapdata = NULL; + devvp->v_vflag &= ~VV_COPYONWRITE; + lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); + snapblklist = sn->sn_blklist; + sn->sn_blklist = NULL; + sn->sn_listsize = 0; + lockmgr(&sn->sn_lock, LK_RELEASE, NULL); + if (snapblklist != NULL) + free(snapblklist, M_UFSMNT); + ffs_snapdata_free(sn); +} + +static struct snapdata * +ffs_snapdata_acquire(struct vnode *devvp) +{ + struct snapdata *nsn, *sn; + int error; + + /* + * Allocate a free snapdata. This is done before acquiring the + * devvp lock to avoid allocation while the devvp interlock is + * held. + */ + nsn = ffs_snapdata_alloc(); + + for (;;) { + VI_LOCK(devvp); + sn = devvp->v_rdev->si_snapdata; + if (sn == NULL) { + /* + * This is the first snapshot on this + * filesystem and we use our pre-allocated + * snapdata. Publish sn with the sn_lock + * owned by us, to avoid the race. + */ + error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | + LK_NOWAIT, NULL); + if (error != 0) + panic("leaked sn, lockmgr error %d", error); + sn = devvp->v_rdev->si_snapdata = nsn; + VI_UNLOCK(devvp); + nsn = NULL; + break; + } + + /* + * There is a snapshots which already exists on this + * filesystem, grab a reference to the common lock. + */ + error = lockmgr(&sn->sn_lock, LK_INTERLOCK | + LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); + if (error == 0) + break; + } + + /* + * Free any unused snapdata. + */ + if (nsn != NULL) + ffs_snapdata_free(nsn); + + return (sn); +} + +#endif diff --git a/Dump/ufs/ffs/ffs_softdep.c b/Dump/ufs/ffs/ffs_softdep.c new file mode 100644 index 0000000..c154435 --- /dev/null +++ b/Dump/ufs/ffs/ffs_softdep.c @@ -0,0 +1,14469 @@ +/*- + * Copyright 1998, 2000 Marshall Kirk McKusick. + * Copyright 2009, 2010 Jeffrey W. Roberson + * All rights reserved. + * + * The soft updates code is derived from the appendix of a University + * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, + * "Soft Updates: A Solution to the Metadata Update Problem in File + * Systems", CSE-TR-254-95, August 1995). + * + * Further information about soft updates can be obtained from: + * + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_softdep.c 330446 2018-03-05 06:59:30Z eadler $"); + +#include "opt_ffs.h" +#include "opt_quota.h" +#include "opt_ddb.h" + +/* + * For now we want the safety net that the DEBUG flag provides. + */ +#ifndef DEBUG +#define DEBUG +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#define KTR_SUJ 0 /* Define to KTR_SPARE. */ + +#ifndef SOFTUPDATES + +int +softdep_flushfiles(oldmnt, flags, td) + struct mount *oldmnt; + int flags; + struct thread *td; +{ + + panic("softdep_flushfiles called"); +} + +int +softdep_mount(devvp, mp, fs, cred) + struct vnode *devvp; + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + + return (0); +} + +void +softdep_initialize() +{ + + return; +} + +void +softdep_uninitialize() +{ + + return; +} + +void +softdep_unmount(mp) + struct mount *mp; +{ + + panic("softdep_unmount called"); +} + +void +softdep_setup_sbupdate(ump, fs, bp) + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; +{ + + panic("softdep_setup_sbupdate called"); +} + +void +softdep_setup_inomapdep(bp, ip, newinum, mode) + struct buf *bp; + struct inode *ip; + ino_t newinum; + int mode; +{ + + panic("softdep_setup_inomapdep called"); +} + +void +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) + struct buf *bp; + struct mount *mp; + ufs2_daddr_t newblkno; + int frags; + int oldfrags; +{ + + panic("softdep_setup_blkmapdep called"); +} + +void +softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; + ufs_lbn_t lbn; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + long newsize; + long oldsize; + struct buf *bp; +{ + + panic("softdep_setup_allocdirect called"); +} + +void +softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; + ufs_lbn_t lbn; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + long newsize; + long oldsize; + struct buf *bp; +{ + + panic("softdep_setup_allocext called"); +} + +void +softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) + struct inode *ip; + ufs_lbn_t lbn; + struct buf *bp; + int ptrno; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + struct buf *nbp; +{ + + panic("softdep_setup_allocindir_page called"); +} + +void +softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) + struct buf *nbp; + struct inode *ip; + struct buf *bp; + int ptrno; + 
ufs2_daddr_t newblkno; +{ + + panic("softdep_setup_allocindir_meta called"); +} + +void +softdep_journal_freeblocks(ip, cred, length, flags) + struct inode *ip; + struct ucred *cred; + off_t length; + int flags; +{ + + panic("softdep_journal_freeblocks called"); +} + +void +softdep_journal_fsync(ip) + struct inode *ip; +{ + + panic("softdep_journal_fsync called"); +} + +void +softdep_setup_freeblocks(ip, length, flags) + struct inode *ip; + off_t length; + int flags; +{ + + panic("softdep_setup_freeblocks called"); +} + +void +softdep_freefile(pvp, ino, mode) + struct vnode *pvp; + ino_t ino; + int mode; +{ + + panic("softdep_freefile called"); +} + +int +softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) + struct buf *bp; + struct inode *dp; + off_t diroffset; + ino_t newinum; + struct buf *newdirbp; + int isnewblk; +{ + + panic("softdep_setup_directory_add called"); +} + +void +softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) + struct buf *bp; + struct inode *dp; + caddr_t base; + caddr_t oldloc; + caddr_t newloc; + int entrysize; +{ + + panic("softdep_change_directoryentry_offset called"); +} + +void +softdep_setup_remove(bp, dp, ip, isrmdir) + struct buf *bp; + struct inode *dp; + struct inode *ip; + int isrmdir; +{ + + panic("softdep_setup_remove called"); +} + +void +softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) + struct buf *bp; + struct inode *dp; + struct inode *ip; + ino_t newinum; + int isrmdir; +{ + + panic("softdep_setup_directory_change called"); +} + +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + panic("%s called", __FUNCTION__); +} + +int +softdep_prealloc(vp, waitok) + struct vnode *vp; + int waitok; +{ + + panic("%s called", __FUNCTION__); +} + +int +softdep_journal_lookup(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + + return (ENOENT); +} + +void +softdep_change_linkcnt(ip) + struct inode *ip; +{ + + panic("softdep_change_linkcnt called"); +} + +void +softdep_load_inodeblock(ip) + struct inode *ip; +{ + + 
panic("softdep_load_inodeblock called"); +} + +void +softdep_update_inodeblock(ip, bp, waitfor) + struct inode *ip; + struct buf *bp; + int waitfor; +{ + + panic("softdep_update_inodeblock called"); +} + +int +softdep_fsync(vp) + struct vnode *vp; /* the "in_core" copy of the inode */ +{ + + return (0); +} + +void +softdep_fsync_mountdev(vp) + struct vnode *vp; +{ + + return; +} + +int +softdep_flushworklist(oldmnt, countp, td) + struct mount *oldmnt; + int *countp; + struct thread *td; +{ + + *countp = 0; + return (0); +} + +int +softdep_sync_metadata(struct vnode *vp) +{ + + panic("softdep_sync_metadata called"); +} + +int +softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) +{ + + panic("softdep_sync_buf called"); +} + +int +softdep_slowdown(vp) + struct vnode *vp; +{ + + panic("softdep_slowdown called"); +} + +int +softdep_request_cleanup(fs, vp, cred, resource) + struct fs *fs; + struct vnode *vp; + struct ucred *cred; + int resource; +{ + + return (0); +} + +int +softdep_check_suspend(struct mount *mp, + struct vnode *devvp, + int softdep_depcnt, + int softdep_accdepcnt, + int secondary_writes, + int secondary_accwrites) +{ + struct bufobj *bo; + int error; + + (void) softdep_depcnt, + (void) softdep_accdepcnt; + + bo = &devvp->v_bufobj; + ASSERT_BO_WLOCKED(bo); + + MNT_ILOCK(mp); + while (mp->mnt_secondary_writes != 0) { + BO_UNLOCK(bo); + msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), + (PUSER - 1) | PDROP, "secwr", 0); + BO_LOCK(bo); + MNT_ILOCK(mp); + } + + /* + * Reasons for needing more work before suspend: + * - Dirty buffers on devvp. + * - Secondary writes occurred after start of vnode sync loop + */ + error = 0; + if (bo->bo_numoutput > 0 || + bo->bo_dirty.bv_cnt > 0 || + secondary_writes != 0 || + mp->mnt_secondary_writes != 0 || + secondary_accwrites != mp->mnt_secondary_accwrites) + error = EAGAIN; + BO_UNLOCK(bo); + return (error); +} + +void +softdep_get_depcounts(struct mount *mp, + int *softdepactivep, + int *softdepactiveaccp) +{ + (void) mp; + *softdepactivep = 0; + *softdepactiveaccp = 0; +} + +void +softdep_buf_append(bp, wkhd) + struct buf *bp; + struct workhead *wkhd; +{ + + panic("softdep_buf_appendwork called"); +} + +void +softdep_inode_append(ip, cred, wkhd) + struct inode *ip; + struct ucred *cred; + struct workhead *wkhd; +{ + + panic("softdep_inode_appendwork called"); +} + +void +softdep_freework(wkhd) + struct workhead *wkhd; +{ + + panic("softdep_freework called"); +} + +#else + +FEATURE(softupdates, "FFS soft-updates support"); + +static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, + "soft updates stats"); +static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, + "total dependencies allocated"); +static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0, + "high use dependencies allocated"); +static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, + "current dependencies allocated"); +static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, + "current dependencies written"); + +unsigned long dep_current[D_LAST + 1]; +unsigned long dep_highuse[D_LAST + 1]; +unsigned long dep_total[D_LAST + 1]; +unsigned long dep_write[D_LAST + 1]; + +#define SOFTDEP_TYPE(type, str, long) \ + static MALLOC_DEFINE(M_ ## type, #str, long); \ + SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ + &dep_total[D_ ## type], 0, ""); \ + SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ + &dep_current[D_ ## type], 0, ""); \ + SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, 
CTLFLAG_RD, \ + &dep_highuse[D_ ## type], 0, ""); \ + SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ + &dep_write[D_ ## type], 0, ""); + +SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); +SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); +SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, + "Block or frag allocated from cyl group map"); +SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); +SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); +SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); +SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); +SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); +SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); +SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); +SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); +SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); +SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); +SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); +SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); +SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); +SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); +SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); +SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); +SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); +SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); +SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); +SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); +SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); +SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); +SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); +SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); + +static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); + +static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); +static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); +static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); + +#define M_SOFTDEP_FLAGS (M_WAITOK) + +/* + * translate from workitem type to memory type + * MUST match the defines above, such that memtype[D_XXX] == M_XXX + */ +static struct malloc_type *memtype[] = { + M_PAGEDEP, + M_INODEDEP, + M_BMSAFEMAP, + M_NEWBLK, + M_ALLOCDIRECT, + M_INDIRDEP, + M_ALLOCINDIR, + M_FREEFRAG, + M_FREEBLKS, + M_FREEFILE, + M_DIRADD, + M_MKDIR, + M_DIRREM, + M_NEWDIRBLK, + M_FREEWORK, + M_FREEDEP, + M_JADDREF, + M_JREMREF, + M_JMVREF, + M_JNEWBLK, + M_JFREEBLK, + M_JFREEFRAG, + M_JSEG, + M_JSEGDEP, + M_SBDEP, + M_JTRUNC, + M_JFSYNC, + M_SENTINEL +}; + +#define DtoM(type) (memtype[type]) + +/* + * Names of malloc types. + */ +#define TYPENAME(type) \ + ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") +/* + * End system adaptation definitions. + */ + +#define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) +#define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) + +/* + * Internal function prototypes. 
+ */ +static void check_clear_deps(struct mount *); +static void softdep_error(char *, int); +static int softdep_process_worklist(struct mount *, int); +static int softdep_waitidle(struct mount *, int); +static void drain_output(struct vnode *); +static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); +static int check_inodedep_free(struct inodedep *); +static void clear_remove(struct mount *); +static void clear_inodedeps(struct mount *); +static void unlinked_inodedep(struct mount *, struct inodedep *); +static void clear_unlinked_inodedep(struct inodedep *); +static struct inodedep *first_unlinked_inodedep(struct ufsmount *); +static int flush_pagedep_deps(struct vnode *, struct mount *, + struct diraddhd *); +static int free_pagedep(struct pagedep *); +static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); +static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); +static int flush_deplist(struct allocdirectlst *, int, int *); +static int sync_cgs(struct mount *, int); +static int handle_written_filepage(struct pagedep *, struct buf *, int); +static int handle_written_sbdep(struct sbdep *, struct buf *); +static void initiate_write_sbdep(struct sbdep *); +static void diradd_inode_written(struct diradd *, struct inodedep *); +static int handle_written_indirdep(struct indirdep *, struct buf *, + struct buf**, int); +static int handle_written_inodeblock(struct inodedep *, struct buf *, int); +static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, + uint8_t *); +static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); +static void handle_written_jaddref(struct jaddref *); +static void handle_written_jremref(struct jremref *); +static void handle_written_jseg(struct jseg *, struct buf *); +static void handle_written_jnewblk(struct jnewblk *); +static void handle_written_jblkdep(struct jblkdep *); +static void handle_written_jfreefrag(struct jfreefrag *); +static void complete_jseg(struct jseg *); +static void complete_jsegs(struct jseg *); +static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); +static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); +static void jremref_write(struct jremref *, struct jseg *, uint8_t *); +static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); +static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); +static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); +static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); +static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); +static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); +static inline void inoref_write(struct inoref *, struct jseg *, + struct jrefrec *); +static void handle_allocdirect_partdone(struct allocdirect *, + struct workhead *); +static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, + struct workhead *); +static void indirdep_complete(struct indirdep *); +static int indirblk_lookup(struct mount *, ufs2_daddr_t); +static void indirblk_insert(struct freework *); +static void indirblk_remove(struct freework *); +static void handle_allocindir_partdone(struct allocindir *); +static void initiate_write_filepage(struct pagedep *, struct buf *); +static void initiate_write_indirdep(struct indirdep*, struct buf *); +static void handle_written_mkdir(struct mkdir *, int); +static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, + uint8_t *); +static void 
initiate_write_bmsafemap(struct bmsafemap *, struct buf *); +static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); +static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); +static void handle_workitem_freefile(struct freefile *); +static int handle_workitem_remove(struct dirrem *, int); +static struct dirrem *newdirrem(struct buf *, struct inode *, + struct inode *, int, struct dirrem **); +static struct indirdep *indirdep_lookup(struct mount *, struct inode *, + struct buf *); +static void cancel_indirdep(struct indirdep *, struct buf *, + struct freeblks *); +static void free_indirdep(struct indirdep *); +static void free_diradd(struct diradd *, struct workhead *); +static void merge_diradd(struct inodedep *, struct diradd *); +static void complete_diradd(struct diradd *); +static struct diradd *diradd_lookup(struct pagedep *, int); +static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, + struct jremref *); +static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, + struct jremref *, struct jremref *); +static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, + struct jremref *); +static void cancel_allocindir(struct allocindir *, struct buf *bp, + struct freeblks *, int); +static int setup_trunc_indir(struct freeblks *, struct inode *, + ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); +static void complete_trunc_indir(struct freework *); +static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, + int); +static void complete_mkdir(struct mkdir *); +static void free_newdirblk(struct newdirblk *); +static void free_jremref(struct jremref *); +static void free_jaddref(struct jaddref *); +static void free_jsegdep(struct jsegdep *); +static void free_jsegs(struct jblocks *); +static void rele_jseg(struct jseg *); +static void free_jseg(struct jseg *, struct jblocks *); +static void free_jnewblk(struct jnewblk *); +static void free_jblkdep(struct jblkdep *); +static void free_jfreefrag(struct jfreefrag *); +static void free_freedep(struct freedep *); +static void journal_jremref(struct dirrem *, struct jremref *, + struct inodedep *); +static void cancel_jnewblk(struct jnewblk *, struct workhead *); +static int cancel_jaddref(struct jaddref *, struct inodedep *, + struct workhead *); +static void cancel_jfreefrag(struct jfreefrag *); +static inline void setup_freedirect(struct freeblks *, struct inode *, + int, int); +static inline void setup_freeext(struct freeblks *, struct inode *, int, int); +static inline void setup_freeindir(struct freeblks *, struct inode *, int, + ufs_lbn_t, int); +static inline struct freeblks *newfreeblks(struct mount *, struct inode *); +static void freeblks_free(struct ufsmount *, struct freeblks *, int); +static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); +static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); +static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); +static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, + int, int); +static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); +static int cancel_pagedep(struct pagedep *, struct freeblks *, int); +static int deallocate_dependencies(struct buf *, struct freeblks *, int); +static void newblk_freefrag(struct newblk*); +static void free_newblk(struct newblk *); +static void cancel_allocdirect(struct allocdirectlst *, 
+ struct allocdirect *, struct freeblks *); +static int check_inode_unwritten(struct inodedep *); +static int free_inodedep(struct inodedep *); +static void freework_freeblock(struct freework *); +static void freework_enqueue(struct freework *); +static int handle_workitem_freeblocks(struct freeblks *, int); +static int handle_complete_freeblocks(struct freeblks *, int); +static void handle_workitem_indirblk(struct freework *); +static void handle_written_freework(struct freework *); +static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); +static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, + struct workhead *); +static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, + struct inodedep *, struct allocindir *, ufs_lbn_t); +static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, + ufs2_daddr_t, ufs_lbn_t); +static void handle_workitem_freefrag(struct freefrag *); +static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, + ufs_lbn_t); +static void allocdirect_merge(struct allocdirectlst *, + struct allocdirect *, struct allocdirect *); +static struct freefrag *allocindir_merge(struct allocindir *, + struct allocindir *); +static int bmsafemap_find(struct bmsafemap_hashhead *, int, + struct bmsafemap **); +static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, + int cg, struct bmsafemap *); +static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, + struct newblk **); +static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); +static int inodedep_find(struct inodedep_hashhead *, ino_t, + struct inodedep **); +static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); +static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, + int, struct pagedep **); +static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, + struct pagedep **); +static void pause_timer(void *); +static int request_cleanup(struct mount *, int); +static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); +static void schedule_cleanup(struct mount *); +static void softdep_ast_cleanup_proc(struct thread *); +static struct ufsmount *softdep_bp_to_mp(struct buf *bp); +static int process_worklist_item(struct mount *, int, int); +static void process_removes(struct vnode *); +static void process_truncates(struct vnode *); +static void jwork_move(struct workhead *, struct workhead *); +static void jwork_insert(struct workhead *, struct jsegdep *); +static void add_to_worklist(struct worklist *, int); +static void wake_worklist(struct worklist *); +static void wait_worklist(struct worklist *, char *); +static void remove_from_worklist(struct worklist *); +static void softdep_flush(void *); +static void softdep_flushjournal(struct mount *); +static int softdep_speedup(struct ufsmount *); +static void worklist_speedup(struct mount *); +static int journal_mount(struct mount *, struct fs *, struct ucred *); +static void journal_unmount(struct ufsmount *); +static int journal_space(struct ufsmount *, int); +static void journal_suspend(struct ufsmount *); +static int journal_unsuspend(struct ufsmount *ump); +static void softdep_prelink(struct vnode *, struct vnode *); +static void add_to_journal(struct worklist *); +static void remove_from_journal(struct worklist *); +static bool softdep_excess_items(struct ufsmount *, int); +static void softdep_process_journal(struct mount *, struct worklist *, int); +static struct jremref 
*newjremref(struct dirrem *, struct inode *, + struct inode *ip, off_t, nlink_t); +static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, + uint16_t); +static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, + uint16_t); +static inline struct jsegdep *inoref_jseg(struct inoref *); +static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); +static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, + ufs2_daddr_t, int); +static void adjust_newfreework(struct freeblks *, int); +static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); +static void move_newblock_dep(struct jaddref *, struct inodedep *); +static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); +static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, + ufs2_daddr_t, long, ufs_lbn_t); +static struct freework *newfreework(struct ufsmount *, struct freeblks *, + struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); +static int jwait(struct worklist *, int); +static struct inodedep *inodedep_lookup_ip(struct inode *); +static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); +static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); +static void handle_jwork(struct workhead *); +static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, + struct mkdir **); +static struct jblocks *jblocks_create(void); +static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); +static void jblocks_free(struct jblocks *, struct mount *, int); +static void jblocks_destroy(struct jblocks *); +static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); + +/* + * Exported softdep operations. + */ +static void softdep_disk_io_initiation(struct buf *); +static void softdep_disk_write_complete(struct buf *); +static void softdep_deallocate_dependencies(struct buf *); +static int softdep_count_dependencies(struct buf *bp, int); + +/* + * Global lock over all of soft updates. + */ +static struct mtx lk; +MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF); + +#define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) +#define FREE_GBLLOCK(lk) mtx_unlock(lk) +#define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) + +/* + * Per-filesystem soft-updates locking. + */ +#define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) +#define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) +#define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) +#define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) +#define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ + RA_WLOCKED) + +#define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) +#define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) + +/* + * Worklist queue management. + * These routines require that the lock be held. 
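+ *
+ * With DEBUG defined (which this file currently forces on above), the
+ * insert and remove operations go through worklist_insert() and
+ * worklist_remove(), which also assert lock ownership and a consistent
+ * ONWORKLIST state; otherwise they compile down to the bare list
+ * macros.  A typical, purely illustrative use is attaching the struct
+ * worklist embedded in a dependency structure to a buffer's dependency
+ * list, where wk is that embedded struct worklist pointer:
+ *
+ *	WORKLIST_INSERT(&bp->b_dep, wk);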
+ */ +#ifndef /* NOT */ DEBUG +#define WORKLIST_INSERT(head, item) do { \ + (item)->wk_state |= ONWORKLIST; \ + LIST_INSERT_HEAD(head, item, wk_list); \ +} while (0) +#define WORKLIST_REMOVE(item) do { \ + (item)->wk_state &= ~ONWORKLIST; \ + LIST_REMOVE(item, wk_list); \ +} while (0) +#define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT +#define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE + +#else /* DEBUG */ +static void worklist_insert(struct workhead *, struct worklist *, int); +static void worklist_remove(struct worklist *, int); + +#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) +#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) +#define WORKLIST_REMOVE(item) worklist_remove(item, 1) +#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) + +static void +worklist_insert(head, item, locked) + struct workhead *head; + struct worklist *item; + int locked; +{ + + if (locked) + LOCK_OWNED(VFSTOUFS(item->wk_mp)); + if (item->wk_state & ONWORKLIST) + panic("worklist_insert: %p %s(0x%X) already on list", + item, TYPENAME(item->wk_type), item->wk_state); + item->wk_state |= ONWORKLIST; + LIST_INSERT_HEAD(head, item, wk_list); +} + +static void +worklist_remove(item, locked) + struct worklist *item; + int locked; +{ + + if (locked) + LOCK_OWNED(VFSTOUFS(item->wk_mp)); + if ((item->wk_state & ONWORKLIST) == 0) + panic("worklist_remove: %p %s(0x%X) not on list", + item, TYPENAME(item->wk_type), item->wk_state); + item->wk_state &= ~ONWORKLIST; + LIST_REMOVE(item, wk_list); +} +#endif /* DEBUG */ + +/* + * Merge two jsegdeps keeping only the oldest one as newer references + * can't be discarded until after older references. + */ +static inline struct jsegdep * +jsegdep_merge(struct jsegdep *one, struct jsegdep *two) +{ + struct jsegdep *swp; + + if (two == NULL) + return (one); + + if (one->jd_seg->js_seq > two->jd_seg->js_seq) { + swp = one; + one = two; + two = swp; + } + WORKLIST_REMOVE(&two->jd_list); + free_jsegdep(two); + + return (one); +} + +/* + * If two freedeps are compatible free one to reduce list size. + */ +static inline struct freedep * +freedep_merge(struct freedep *one, struct freedep *two) +{ + if (two == NULL) + return (one); + + if (one->fd_freework == two->fd_freework) { + WORKLIST_REMOVE(&two->fd_list); + free_freedep(two); + } + return (one); +} + +/* + * Move journal work from one list to another. Duplicate freedeps and + * jsegdeps are coalesced to keep the lists as small as possible. 
+ */ +static void +jwork_move(dst, src) + struct workhead *dst; + struct workhead *src; +{ + struct freedep *freedep; + struct jsegdep *jsegdep; + struct worklist *wkn; + struct worklist *wk; + + KASSERT(dst != src, + ("jwork_move: dst == src")); + freedep = NULL; + jsegdep = NULL; + LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { + if (wk->wk_type == D_JSEGDEP) + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + else if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } + + while ((wk = LIST_FIRST(src)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(dst, wk); + if (wk->wk_type == D_JSEGDEP) { + jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); + continue; + } + if (wk->wk_type == D_FREEDEP) + freedep = freedep_merge(WK_FREEDEP(wk), freedep); + } +} + +static void +jwork_insert(dst, jsegdep) + struct workhead *dst; + struct jsegdep *jsegdep; +{ + struct jsegdep *jsegdepn; + struct worklist *wk; + + LIST_FOREACH(wk, dst, wk_list) + if (wk->wk_type == D_JSEGDEP) + break; + if (wk == NULL) { + WORKLIST_INSERT(dst, &jsegdep->jd_list); + return; + } + jsegdepn = WK_JSEGDEP(wk); + if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { + WORKLIST_REMOVE(wk); + free_jsegdep(jsegdepn); + WORKLIST_INSERT(dst, &jsegdep->jd_list); + } else + free_jsegdep(jsegdep); +} + +/* + * Routines for tracking and managing workitems. + */ +static void workitem_free(struct worklist *, int); +static void workitem_alloc(struct worklist *, int, struct mount *); +static void workitem_reassign(struct worklist *, int); + +#define WORKITEM_FREE(item, type) \ + workitem_free((struct worklist *)(item), (type)) +#define WORKITEM_REASSIGN(item, type) \ + workitem_reassign((struct worklist *)(item), (type)) + +static void +workitem_free(item, type) + struct worklist *item; + int type; +{ + struct ufsmount *ump; + +#ifdef DEBUG + if (item->wk_state & ONWORKLIST) + panic("workitem_free: %s(0x%X) still on list", + TYPENAME(item->wk_type), item->wk_state); + if (item->wk_type != type && type != D_NEWBLK) + panic("workitem_free: type mismatch %s != %s", + TYPENAME(item->wk_type), TYPENAME(type)); +#endif + if (item->wk_state & IOWAITING) + wakeup(item); + ump = VFSTOUFS(item->wk_mp); + LOCK_OWNED(ump); + KASSERT(ump->softdep_deps > 0, + ("workitem_free: %s: softdep_deps going negative", + ump->um_fs->fs_fsmnt)); + if (--ump->softdep_deps == 0 && ump->softdep_req) + wakeup(&ump->softdep_deps); + KASSERT(dep_current[item->wk_type] > 0, + ("workitem_free: %s: dep_current[%s] going negative", + ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); + KASSERT(ump->softdep_curdeps[item->wk_type] > 0, + ("workitem_free: %s: softdep_curdeps[%s] going negative", + ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); + atomic_subtract_long(&dep_current[item->wk_type], 1); + ump->softdep_curdeps[item->wk_type] -= 1; + free(item, DtoM(type)); +} + +static void +workitem_alloc(item, type, mp) + struct worklist *item; + int type; + struct mount *mp; +{ + struct ufsmount *ump; + + item->wk_type = type; + item->wk_mp = mp; + item->wk_state = 0; + + ump = VFSTOUFS(mp); + ACQUIRE_GBLLOCK(&lk); + dep_current[type]++; + if (dep_current[type] > dep_highuse[type]) + dep_highuse[type] = dep_current[type]; + dep_total[type]++; + FREE_GBLLOCK(&lk); + ACQUIRE_LOCK(ump); + ump->softdep_curdeps[type] += 1; + ump->softdep_deps++; + ump->softdep_accdeps++; + FREE_LOCK(ump); +} + +static void +workitem_reassign(item, newtype) + struct worklist *item; + int newtype; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(item->wk_mp); + 
LOCK_OWNED(ump); + KASSERT(ump->softdep_curdeps[item->wk_type] > 0, + ("workitem_reassign: %s: softdep_curdeps[%s] going negative", + VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); + ump->softdep_curdeps[item->wk_type] -= 1; + ump->softdep_curdeps[newtype] += 1; + KASSERT(dep_current[item->wk_type] > 0, + ("workitem_reassign: %s: dep_current[%s] going negative", + VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); + ACQUIRE_GBLLOCK(&lk); + dep_current[newtype]++; + dep_current[item->wk_type]--; + if (dep_current[newtype] > dep_highuse[newtype]) + dep_highuse[newtype] = dep_current[newtype]; + dep_total[newtype]++; + FREE_GBLLOCK(&lk); + item->wk_type = newtype; +} + +/* + * Workitem queue management + */ +static int max_softdeps; /* maximum number of structs before slowdown */ +static int tickdelay = 2; /* number of ticks to pause during slowdown */ +static int proc_waiting; /* tracks whether we have a timeout posted */ +static int *stat_countp; /* statistic to count in proc_waiting timeout */ +static struct callout softdep_callout; +static int req_clear_inodedeps; /* syncer process flush some inodedeps */ +static int req_clear_remove; /* syncer process flush some freeblks */ +static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ + +/* + * runtime statistics + */ +static int stat_flush_threads; /* number of softdep flushing threads */ +static int stat_worklist_push; /* number of worklist cleanups */ +static int stat_blk_limit_push; /* number of times block limit neared */ +static int stat_ino_limit_push; /* number of times inode limit neared */ +static int stat_blk_limit_hit; /* number of times block slowdown imposed */ +static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ +static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ +static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ +static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ +static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ +static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ +static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ +static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ +static int stat_journal_min; /* Times hit journal min threshold */ +static int stat_journal_low; /* Times hit journal low threshold */ +static int stat_journal_wait; /* Times blocked in jwait(). */ +static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ +static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ +static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ +static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. 
*/ +static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ +static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ +static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ +static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ +static int stat_cleanup_failures; /* Number of cleanup requests that failed */ +static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ + +SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, + &max_softdeps, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, + &tickdelay, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, + &stat_flush_threads, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, + &stat_worklist_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, + &stat_blk_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, + &stat_ino_limit_push, 0,""); +SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, + &stat_blk_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, + &stat_ino_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, + &stat_sync_limit_hit, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, + &stat_indir_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, + &stat_inode_bitmap, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, + &stat_direct_blk_ptrs, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, + &stat_dir_entry, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, + &stat_jaddref, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, + &stat_jnewblk, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, + &stat_journal_low, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, + &stat_journal_min, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, + &stat_journal_wait, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, + &stat_jwait_filepage, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, + &stat_jwait_freeblks, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, + &stat_jwait_inode, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, + &stat_jwait_newblk, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, + &stat_cleanup_blkrequests, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, + &stat_cleanup_inorequests, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, + &stat_cleanup_high_delay, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, + &stat_cleanup_retries, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, + &stat_cleanup_failures, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, + &softdep_flushcache, 0, ""); +SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, + &stat_emptyjblocks, 0, ""); + +SYSCTL_DECL(_vfs_ffs); + +/* Whether to recompute the summary at mount time */ +static int compute_summary_at_mount = 0; +SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, + &compute_summary_at_mount, 0, "Recompute summary at mount"); +static int print_threads = 0; 
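+/*
+ * The counters and knobs registered in this block are exported under
+ * the debug.softdep sysctl tree (debug.softdep.total, .highuse,
+ * .current and .write for the per-type counts, plus the flat statistics
+ * above).  As a purely illustrative sketch, a userland tool might read
+ * one of them with sysctlbyname(3):
+ *
+ *	u_long deps;
+ *	size_t len = sizeof(deps);
+ *
+ *	if (sysctlbyname("debug.softdep.current.inodedep", &deps, &len,
+ *	    NULL, 0) == 0)
+ *		printf("inodedep records: %lu\n", deps);
+ */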
+SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, + &print_threads, 0, "Notify flusher thread start/stop"); + +/* List of all filesystems mounted with soft updates */ +static TAILQ_HEAD(, mount_softdeps) softdepmounts; + +/* + * This function cleans the worklist for a filesystem. + * Each filesystem running with soft dependencies gets its own + * thread to run in this function. The thread is started up in + * softdep_mount and shutdown in softdep_unmount. They show up + * as part of the kernel "bufdaemon" process whose process + * entry is available in bufdaemonproc. + */ +static int searchfailed; +extern struct proc *bufdaemonproc; +static void +softdep_flush(addr) + void *addr; +{ + struct mount *mp; + struct thread *td; + struct ufsmount *ump; + + td = curthread; + td->td_pflags |= TDP_NORUNNINGBUF; + mp = (struct mount *)addr; + ump = VFSTOUFS(mp); + atomic_add_int(&stat_flush_threads, 1); + ACQUIRE_LOCK(ump); + ump->softdep_flags &= ~FLUSH_STARTING; + wakeup(&ump->softdep_flushtd); + FREE_LOCK(ump); + if (print_threads) { + if (stat_flush_threads == 1) + printf("Running %s at pid %d\n", bufdaemonproc->p_comm, + bufdaemonproc->p_pid); + printf("Start thread %s\n", td->td_name); + } + for (;;) { + while (softdep_process_worklist(mp, 0) > 0 || + (MOUNTEDSUJ(mp) && + VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) + kthread_suspend_check(); + ACQUIRE_LOCK(ump); + if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) + msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, + "sdflush", hz / 2); + ump->softdep_flags &= ~FLUSH_CLEANUP; + /* + * Check to see if we are done and need to exit. + */ + if ((ump->softdep_flags & FLUSH_EXIT) == 0) { + FREE_LOCK(ump); + continue; + } + ump->softdep_flags &= ~FLUSH_EXIT; + FREE_LOCK(ump); + wakeup(&ump->softdep_flags); + if (print_threads) + printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups); + atomic_subtract_int(&stat_flush_threads, 1); + kthread_exit(); + panic("kthread_exit failed\n"); + } +} + +static void +worklist_speedup(mp) + struct mount *mp; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) + ump->softdep_flags |= FLUSH_CLEANUP; + wakeup(&ump->softdep_flushtd); +} + +static int +softdep_speedup(ump) + struct ufsmount *ump; +{ + struct ufsmount *altump; + struct mount_softdeps *sdp; + + LOCK_OWNED(ump); + worklist_speedup(ump->um_mountp); + bd_speedup(); + /* + * If we have global shortages, then we need other + * filesystems to help with the cleanup. Here we wakeup a + * flusher thread for a filesystem that is over its fair + * share of resources. + */ + if (req_clear_inodedeps || req_clear_remove) { + ACQUIRE_GBLLOCK(&lk); + TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { + if ((altump = sdp->sd_ump) == ump) + continue; + if (((req_clear_inodedeps && + altump->softdep_curdeps[D_INODEDEP] > + max_softdeps / stat_flush_threads) || + (req_clear_remove && + altump->softdep_curdeps[D_DIRREM] > + (max_softdeps / 2) / stat_flush_threads)) && + TRY_ACQUIRE_LOCK(altump)) + break; + } + if (sdp == NULL) { + searchfailed++; + FREE_GBLLOCK(&lk); + } else { + /* + * Move to the end of the list so we pick a + * different one on out next try. 
+ */ + TAILQ_REMOVE(&softdepmounts, sdp, sd_next); + TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); + FREE_GBLLOCK(&lk); + if ((altump->softdep_flags & + (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) + altump->softdep_flags |= FLUSH_CLEANUP; + altump->um_softdep->sd_cleanups++; + wakeup(&altump->softdep_flushtd); + FREE_LOCK(altump); + } + } + return (speedup_syncer()); +} + +/* + * Add an item to the end of the work queue. + * This routine requires that the lock be held. + * This is the only routine that adds items to the list. + * The following routine is the only one that removes items + * and does so in order from first to last. + */ + +#define WK_HEAD 0x0001 /* Add to HEAD. */ +#define WK_NODELAY 0x0002 /* Process immediately. */ + +static void +add_to_worklist(wk, flags) + struct worklist *wk; + int flags; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(wk->wk_mp); + LOCK_OWNED(ump); + if (wk->wk_state & ONWORKLIST) + panic("add_to_worklist: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST; + if (ump->softdep_on_worklist == 0) { + LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); + ump->softdep_worklist_tail = wk; + } else if (flags & WK_HEAD) { + LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); + } else { + LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); + ump->softdep_worklist_tail = wk; + } + ump->softdep_on_worklist += 1; + if (flags & WK_NODELAY) + worklist_speedup(wk->wk_mp); +} + +/* + * Remove the item to be processed. If we are removing the last + * item on the list, we need to recalculate the tail pointer. + */ +static void +remove_from_worklist(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(wk->wk_mp); + if (ump->softdep_worklist_tail == wk) + ump->softdep_worklist_tail = + (struct worklist *)wk->wk_list.le_prev; + WORKLIST_REMOVE(wk); + ump->softdep_on_worklist -= 1; +} + +static void +wake_worklist(wk) + struct worklist *wk; +{ + if (wk->wk_state & IOWAITING) { + wk->wk_state &= ~IOWAITING; + wakeup(wk); + } +} + +static void +wait_worklist(wk, wmesg) + struct worklist *wk; + char *wmesg; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(wk->wk_mp); + wk->wk_state |= IOWAITING; + msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); +} + +/* + * Process that runs once per second to handle items in the background queue. + * + * Note that we ensure that everything is done in the order in which they + * appear in the queue. The code below depends on this property to ensure + * that blocks of a file are freed before the inode itself is freed. This + * ordering ensures that no new triples will be generated + * until all the old ones have been purged from the dependency lists. + */ +static int +softdep_process_worklist(mp, full) + struct mount *mp; + int full; +{ + int cnt, matchcnt; + struct ufsmount *ump; + long starttime; + + KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); + if (MOUNTEDSOFTDEP(mp) == 0) + return (0); + matchcnt = 0; + ump = VFSTOUFS(mp); + ACQUIRE_LOCK(ump); + starttime = time_second; + softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0); + check_clear_deps(mp); + while (ump->softdep_on_worklist > 0) { + if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) + break; + else + matchcnt += cnt; + check_clear_deps(mp); + /* + * We do not generally want to stop for buffer space, but if + * we are really being a buffer hog, we will stop and wait. 
+ */ + if (should_yield()) { + FREE_LOCK(ump); + kern_yield(PRI_USER); + bwillwrite(); + ACQUIRE_LOCK(ump); + } + /* + * Never allow processing to run for more than one + * second. This gives the syncer thread the opportunity + * to pause if appropriate. + */ + if (!full && starttime != time_second) + break; + } + if (full == 0) + journal_unsuspend(ump); + FREE_LOCK(ump); + return (matchcnt); +} + +/* + * Process all removes associated with a vnode if we are running out of + * journal space. Any other process which attempts to flush these will + * be unable as we have the vnodes locked. + */ +static void +process_removes(vp) + struct vnode *vp; +{ + struct inodedep *inodedep; + struct dirrem *dirrem; + struct ufsmount *ump; + struct mount *mp; + ino_t inum; + + mp = vp->v_mount; + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + inum = VTOI(vp)->i_number; + for (;;) { +top: + if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) + return; + LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { + /* + * If another thread is trying to lock this vnode + * it will fail but we must wait for it to do so + * before we can proceed. + */ + if (dirrem->dm_state & INPROGRESS) { + wait_worklist(&dirrem->dm_list, "pwrwait"); + goto top; + } + if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == + (COMPLETE | ONWORKLIST)) + break; + } + if (dirrem == NULL) + return; + remove_from_worklist(&dirrem->dm_list); + FREE_LOCK(ump); + if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) + panic("process_removes: suspended filesystem"); + handle_workitem_remove(dirrem, 0); + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(ump); + } +} + +/* + * Process all truncations associated with a vnode if we are running out + * of journal space. This is called when the vnode lock is already held + * and no other process can clear the truncation. This function returns + * a value greater than zero if it did any work. + */ +static void +process_truncates(vp) + struct vnode *vp; +{ + struct inodedep *inodedep; + struct freeblks *freeblks; + struct ufsmount *ump; + struct mount *mp; + ino_t inum; + int cgwait; + + mp = vp->v_mount; + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + inum = VTOI(vp)->i_number; + for (;;) { + if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) + return; + cgwait = 0; + TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { + /* Journal entries not yet written. */ + if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { + jwait(&LIST_FIRST( + &freeblks->fb_jblkdephd)->jb_list, + MNT_WAIT); + break; + } + /* Another thread is executing this item. */ + if (freeblks->fb_state & INPROGRESS) { + wait_worklist(&freeblks->fb_list, "ptrwait"); + break; + } + /* Freeblks is waiting on a inode write. */ + if ((freeblks->fb_state & COMPLETE) == 0) { + FREE_LOCK(ump); + ffs_update(vp, 1); + ACQUIRE_LOCK(ump); + break; + } + if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == + (ALLCOMPLETE | ONWORKLIST)) { + remove_from_worklist(&freeblks->fb_list); + freeblks->fb_state |= INPROGRESS; + FREE_LOCK(ump); + if (vn_start_secondary_write(NULL, &mp, + V_NOWAIT)) + panic("process_truncates: " + "suspended filesystem"); + handle_workitem_freeblocks(freeblks, 0); + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(ump); + break; + } + if (freeblks->fb_cgwait) + cgwait++; + } + if (cgwait) { + FREE_LOCK(ump); + sync_cgs(mp, MNT_WAIT); + ffs_sync_snap(mp, MNT_WAIT); + ACQUIRE_LOCK(ump); + continue; + } + if (freeblks == NULL) + break; + } + return; +} + +/* + * Process one item on the worklist. 
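 *
 * process_worklist_item() below walks the pending list with a movable
 * sentinel so the per-filesystem lock can be dropped while each item
 * is handled: the sentinel stays linked into the list and marks where
 * the scan resumes, while the real item is unlinked and processed
 * unlocked.  Schematically (the list and handler names here are
 * placeholders, not the kernel identifiers):
 *
 *    LIST_INSERT_HEAD(&pending, &sentinel, wk_list);
 *    while ((wk = LIST_NEXT(&sentinel, wk_list)) != NULL) {
 *            if (wk->wk_type == D_SENTINEL) {
 *                    LIST_REMOVE(&sentinel, wk_list);
 *                    LIST_INSERT_AFTER(wk, &sentinel, wk_list);
 *                    continue;            (hop over a peer's sentinel)
 *            }
 *            remove_from_worklist(wk);
 *            FREE_LOCK(ump);
 *            handle(wk);
 *            ACQUIRE_LOCK(ump);
 *    }
 *    LIST_REMOVE(&sentinel, wk_list);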
+ */ +static int +process_worklist_item(mp, target, flags) + struct mount *mp; + int target; + int flags; +{ + struct worklist sentinel; + struct worklist *wk; + struct ufsmount *ump; + int matchcnt; + int error; + + KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); + /* + * If we are being called because of a process doing a + * copy-on-write, then it is not safe to write as we may + * recurse into the copy-on-write routine. + */ + if (curthread->td_pflags & TDP_COWINPROGRESS) + return (-1); + PHOLD(curproc); /* Don't let the stack go away. */ + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + matchcnt = 0; + sentinel.wk_mp = NULL; + sentinel.wk_type = D_SENTINEL; + LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); + for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; + wk = LIST_NEXT(&sentinel, wk_list)) { + if (wk->wk_type == D_SENTINEL) { + LIST_REMOVE(&sentinel, wk_list); + LIST_INSERT_AFTER(wk, &sentinel, wk_list); + continue; + } + if (wk->wk_state & INPROGRESS) + panic("process_worklist_item: %p already in progress.", + wk); + wk->wk_state |= INPROGRESS; + remove_from_worklist(wk); + FREE_LOCK(ump); + if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) + panic("process_worklist_item: suspended filesystem"); + switch (wk->wk_type) { + case D_DIRREM: + /* removal of a directory entry */ + error = handle_workitem_remove(WK_DIRREM(wk), flags); + break; + + case D_FREEBLKS: + /* releasing blocks and/or fragments from a file */ + error = handle_workitem_freeblocks(WK_FREEBLKS(wk), + flags); + break; + + case D_FREEFRAG: + /* releasing a fragment when replaced as a file grows */ + handle_workitem_freefrag(WK_FREEFRAG(wk)); + error = 0; + break; + + case D_FREEFILE: + /* releasing an inode when its link count drops to 0 */ + handle_workitem_freefile(WK_FREEFILE(wk)); + error = 0; + break; + + default: + panic("%s_process_worklist: Unknown type %s", + "softdep", TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + vn_finished_secondary_write(mp); + ACQUIRE_LOCK(ump); + if (error == 0) { + if (++matchcnt == target) + break; + continue; + } + /* + * We have to retry the worklist item later. Wake up any + * waiters who may be able to complete it immediately and + * add the item back to the head so we don't try to execute + * it again. + */ + wk->wk_state &= ~INPROGRESS; + wake_worklist(wk); + add_to_worklist(wk, WK_HEAD); + } + /* Sentinal could've become the tail from remove_from_worklist. */ + if (ump->softdep_worklist_tail == &sentinel) + ump->softdep_worklist_tail = + (struct worklist *)sentinel.wk_list.le_prev; + LIST_REMOVE(&sentinel, wk_list); + PRELE(curproc); + return (matchcnt); +} + +/* + * Move dependencies from one buffer to another. 
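 *
 * softdep_move_dependencies() below must preserve the order of the
 * b_dep worklist while moving it, but a LIST keeps no tail pointer, so
 * it remembers the last item moved and inserts each subsequent item
 * after it.  The same pattern in isolation, with a stand-in element
 * type rather than the kernel's struct worklist:
 *
 *    #include <sys/queue.h>
 *
 *    struct elem { LIST_ENTRY(elem) link; };
 *    LIST_HEAD(elemhead, elem);
 *
 *    static void
 *    move_all(struct elemhead *from, struct elemhead *to)
 *    {
 *            struct elem *e, *tail;
 *
 *            tail = NULL;
 *            while ((e = LIST_FIRST(from)) != NULL) {
 *                    LIST_REMOVE(e, link);
 *                    if (tail == NULL)
 *                            LIST_INSERT_HEAD(to, e, link);
 *                    else
 *                            LIST_INSERT_AFTER(tail, e, link);
 *                    tail = e;
 *            }
 *    }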
+ */ +int +softdep_move_dependencies(oldbp, newbp) + struct buf *oldbp; + struct buf *newbp; +{ + struct worklist *wk, *wktail; + struct ufsmount *ump; + int dirty; + + if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) + return (0); + KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, + ("softdep_move_dependencies called on non-softdep filesystem")); + dirty = 0; + wktail = NULL; + ump = VFSTOUFS(wk->wk_mp); + ACQUIRE_LOCK(ump); + while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { + LIST_REMOVE(wk, wk_list); + if (wk->wk_type == D_BMSAFEMAP && + bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) + dirty = 1; + if (wktail == NULL) + LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); + else + LIST_INSERT_AFTER(wktail, wk, wk_list); + wktail = wk; + } + FREE_LOCK(ump); + + return (dirty); +} + +/* + * Purge the work list of all items associated with a particular mount point. + */ +int +softdep_flushworklist(oldmnt, countp, td) + struct mount *oldmnt; + int *countp; + struct thread *td; +{ + struct vnode *devvp; + struct ufsmount *ump; + int count, error; + + /* + * Alternately flush the block device associated with the mount + * point and process any dependencies that the flushing + * creates. We continue until no more worklist dependencies + * are found. + */ + *countp = 0; + error = 0; + ump = VFSTOUFS(oldmnt); + devvp = ump->um_devvp; + while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { + *countp += count; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(devvp, MNT_WAIT, td); + VOP_UNLOCK(devvp, 0); + if (error != 0) + break; + } + return (error); +} + +#define SU_WAITIDLE_RETRIES 20 +static int +softdep_waitidle(struct mount *mp, int flags __unused) +{ + struct ufsmount *ump; + struct vnode *devvp; + struct thread *td; + int error, i; + + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + td = curthread; + error = 0; + ACQUIRE_LOCK(ump); + for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) { + ump->softdep_req = 1; + KASSERT((flags & FORCECLOSE) == 0 || + ump->softdep_on_worklist == 0, + ("softdep_waitidle: work added after flush")); + msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP, + "softdeps", 10 * hz); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(devvp, MNT_WAIT, td); + VOP_UNLOCK(devvp, 0); + ACQUIRE_LOCK(ump); + if (error != 0) + break; + } + ump->softdep_req = 0; + if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) { + error = EBUSY; + printf("softdep_waitidle: Failed to flush worklist for %p\n", + mp); + } + FREE_LOCK(ump); + return (error); +} + +/* + * Flush all vnodes and worklist items associated with a specified mount point. + */ +int +softdep_flushfiles(oldmnt, flags, td) + struct mount *oldmnt; + int flags; + struct thread *td; +{ +#ifdef QUOTA + struct ufsmount *ump; + int i; +#endif + int error, early, depcount, loopcnt, retry_flush_count, retry; + int morework; + + KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0, + ("softdep_flushfiles called on non-softdep filesystem")); + loopcnt = 10; + retry_flush_count = 3; +retry_flush: + error = 0; + + /* + * Alternately flush the vnodes associated with the mount + * point and process any dependencies that the flushing + * creates. In theory, this loop can happen at most twice, + * but we give it a few extra just to be sure. + */ + for (; loopcnt > 0; loopcnt--) { + /* + * Do another flush in case any vnodes were brought in + * as part of the cleanup operations. + */ + early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & + MNTK_UNMOUNT) == 0 ? 
0 : EARLYFLUSH; + if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) + break; + if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || + depcount == 0) + break; + } + /* + * If we are unmounting then it is an error to fail. If we + * are simply trying to downgrade to read-only, then filesystem + * activity can keep us busy forever, so we just fail with EBUSY. + */ + if (loopcnt == 0) { + if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) + panic("softdep_flushfiles: looping"); + error = EBUSY; + } + if (!error) + error = softdep_waitidle(oldmnt, flags); + if (!error) { + if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { + retry = 0; + MNT_ILOCK(oldmnt); + KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, + ("softdep_flushfiles: !MNTK_NOINSMNTQ")); + morework = oldmnt->mnt_nvnodelistsize > 0; +#ifdef QUOTA + ump = VFSTOUFS(oldmnt); + UFS_LOCK(ump); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] != NULLVP) + morework = 1; + } + UFS_UNLOCK(ump); +#endif + if (morework) { + if (--retry_flush_count > 0) { + retry = 1; + loopcnt = 3; + } else + error = EBUSY; + } + MNT_IUNLOCK(oldmnt); + if (retry) + goto retry_flush; + } + } + return (error); +} + +/* + * Structure hashing. + * + * There are four types of structures that can be looked up: + * 1) pagedep structures identified by mount point, inode number, + * and logical block. + * 2) inodedep structures identified by mount point and inode number. + * 3) newblk structures identified by mount point and + * physical block number. + * 4) bmsafemap structures identified by mount point and + * cylinder group number. + * + * The "pagedep" and "inodedep" dependency structures are hashed + * separately from the file blocks and inodes to which they correspond. + * This separation helps when the in-memory copy of an inode or + * file block must be replaced. It also obviates the need to access + * an inode or file page when simply updating (or de-allocating) + * dependency structures. Lookup of newblk structures is needed to + * find newly allocated blocks when trying to associate them with + * their allocdirect or allocindir structure. + * + * The lookup routines optionally create and hash a new instance when + * an existing entry is not found. The bmsafemap lookup routine always + * allocates a new structure if an existing one is not found. + */ +#define DEPALLOC 0x0001 /* allocate structure if lookup fails */ + +/* + * Structures and routines associated with pagedep caching. + */ +#define PAGEDEP_HASH(ump, inum, lbn) \ + (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) + +static int +pagedep_find(pagedephd, ino, lbn, pagedeppp) + struct pagedep_hashhead *pagedephd; + ino_t ino; + ufs_lbn_t lbn; + struct pagedep **pagedeppp; +{ + struct pagedep *pagedep; + + LIST_FOREACH(pagedep, pagedephd, pd_hash) { + if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { + *pagedeppp = pagedep; + return (1); + } + } + *pagedeppp = NULL; + return (0); +} +/* + * Look up a pagedep. Return 1 if found, 0 otherwise. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in pagedeppp. + * This routine must be called with splbio interrupts blocked. 
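 *
 * pagedep_lookup() below, like the other lookup routines that follow,
 * uses a drop-lock-and-retry allocation pattern: search the hash chain
 * under the per-filesystem lock; if the entry is missing and DEPALLOC
 * was requested, drop the lock for the malloc(), then search again
 * after relocking because another thread may have inserted the entry
 * in the meantime, in which case the freshly allocated copy is
 * discarded.  In outline (placeholder names):
 *
 *    if (find(hashhd, key, entrypp))
 *            return (1);
 *    if ((flags & DEPALLOC) == 0)
 *            return (0);
 *    FREE_LOCK(ump);
 *    new = malloc(sizeof(*new), M_TYPE, M_SOFTDEP_FLAGS | M_ZERO);
 *    ACQUIRE_LOCK(ump);
 *    if (find(hashhd, key, entrypp)) {
 *            free(new, M_TYPE);            (lost the race)
 *            return (1);
 *    }
 *    initialize(new, key);
 *    LIST_INSERT_HEAD(hashhd, new, hashlink);
 *    *entrypp = new;
 *    return (0);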
+ */ +static int +pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) + struct mount *mp; + struct buf *bp; + ino_t ino; + ufs_lbn_t lbn; + int flags; + struct pagedep **pagedeppp; +{ + struct pagedep *pagedep; + struct pagedep_hashhead *pagedephd; + struct worklist *wk; + struct ufsmount *ump; + int ret; + int i; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + if (bp) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + if (wk->wk_type == D_PAGEDEP) { + *pagedeppp = WK_PAGEDEP(wk); + return (1); + } + } + } + pagedephd = PAGEDEP_HASH(ump, ino, lbn); + ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); + if (ret) { + if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) + WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); + return (1); + } + if ((flags & DEPALLOC) == 0) + return (0); + FREE_LOCK(ump); + pagedep = malloc(sizeof(struct pagedep), + M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); + workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); + ACQUIRE_LOCK(ump); + ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); + if (*pagedeppp) { + /* + * This should never happen since we only create pagedeps + * with the vnode lock held. Could be an assert. + */ + WORKITEM_FREE(pagedep, D_PAGEDEP); + return (ret); + } + pagedep->pd_ino = ino; + pagedep->pd_lbn = lbn; + LIST_INIT(&pagedep->pd_dirremhd); + LIST_INIT(&pagedep->pd_pendinghd); + for (i = 0; i < DAHASHSZ; i++) + LIST_INIT(&pagedep->pd_diraddhd[i]); + LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); + WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); + *pagedeppp = pagedep; + return (0); +} + +/* + * Structures and routines associated with inodedep caching. + */ +#define INODEDEP_HASH(ump, inum) \ + (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) + +static int +inodedep_find(inodedephd, inum, inodedeppp) + struct inodedep_hashhead *inodedephd; + ino_t inum; + struct inodedep **inodedeppp; +{ + struct inodedep *inodedep; + + LIST_FOREACH(inodedep, inodedephd, id_hash) + if (inum == inodedep->id_ino) + break; + if (inodedep) { + *inodedeppp = inodedep; + return (1); + } + *inodedeppp = NULL; + + return (0); +} +/* + * Look up an inodedep. Return 1 if found, 0 if not found. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in inodedeppp. + * This routine must be called with splbio interrupts blocked. + */ +static int +inodedep_lookup(mp, inum, flags, inodedeppp) + struct mount *mp; + ino_t inum; + int flags; + struct inodedep **inodedeppp; +{ + struct inodedep *inodedep; + struct inodedep_hashhead *inodedephd; + struct ufsmount *ump; + struct fs *fs; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + fs = ump->um_fs; + inodedephd = INODEDEP_HASH(ump, inum); + + if (inodedep_find(inodedephd, inum, inodedeppp)) + return (1); + if ((flags & DEPALLOC) == 0) + return (0); + /* + * If the system is over its limit and our filesystem is + * responsible for more than our share of that usage and + * we are not in a rush, request some inodedep cleanup. 
+ */ + if (softdep_excess_items(ump, D_INODEDEP)) + schedule_cleanup(mp); + else + FREE_LOCK(ump); + inodedep = malloc(sizeof(struct inodedep), + M_INODEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); + ACQUIRE_LOCK(ump); + if (inodedep_find(inodedephd, inum, inodedeppp)) { + WORKITEM_FREE(inodedep, D_INODEDEP); + return (1); + } + inodedep->id_fs = fs; + inodedep->id_ino = inum; + inodedep->id_state = ALLCOMPLETE; + inodedep->id_nlinkdelta = 0; + inodedep->id_savedino1 = NULL; + inodedep->id_savedsize = -1; + inodedep->id_savedextsize = -1; + inodedep->id_savednlink = -1; + inodedep->id_bmsafemap = NULL; + inodedep->id_mkdiradd = NULL; + LIST_INIT(&inodedep->id_dirremhd); + LIST_INIT(&inodedep->id_pendinghd); + LIST_INIT(&inodedep->id_inowait); + LIST_INIT(&inodedep->id_bufwait); + TAILQ_INIT(&inodedep->id_inoreflst); + TAILQ_INIT(&inodedep->id_inoupdt); + TAILQ_INIT(&inodedep->id_newinoupdt); + TAILQ_INIT(&inodedep->id_extupdt); + TAILQ_INIT(&inodedep->id_newextupdt); + TAILQ_INIT(&inodedep->id_freeblklst); + LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); + *inodedeppp = inodedep; + return (0); +} + +/* + * Structures and routines associated with newblk caching. + */ +#define NEWBLK_HASH(ump, inum) \ + (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) + +static int +newblk_find(newblkhd, newblkno, flags, newblkpp) + struct newblk_hashhead *newblkhd; + ufs2_daddr_t newblkno; + int flags; + struct newblk **newblkpp; +{ + struct newblk *newblk; + + LIST_FOREACH(newblk, newblkhd, nb_hash) { + if (newblkno != newblk->nb_newblkno) + continue; + /* + * If we're creating a new dependency don't match those that + * have already been converted to allocdirects. This is for + * a frag extend. + */ + if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) + continue; + break; + } + if (newblk) { + *newblkpp = newblk; + return (1); + } + *newblkpp = NULL; + return (0); +} + +/* + * Look up a newblk. Return 1 if found, 0 if not found. + * If not found, allocate if DEPALLOC flag is passed. + * Found or allocated entry is returned in newblkpp. + */ +static int +newblk_lookup(mp, newblkno, flags, newblkpp) + struct mount *mp; + ufs2_daddr_t newblkno; + int flags; + struct newblk **newblkpp; +{ + struct newblk *newblk; + struct newblk_hashhead *newblkhd; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + newblkhd = NEWBLK_HASH(ump, newblkno); + if (newblk_find(newblkhd, newblkno, flags, newblkpp)) + return (1); + if ((flags & DEPALLOC) == 0) + return (0); + if (softdep_excess_items(ump, D_NEWBLK) || + softdep_excess_items(ump, D_ALLOCDIRECT) || + softdep_excess_items(ump, D_ALLOCINDIR)) + schedule_cleanup(mp); + else + FREE_LOCK(ump); + newblk = malloc(sizeof(union allblk), M_NEWBLK, + M_SOFTDEP_FLAGS | M_ZERO); + workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); + ACQUIRE_LOCK(ump); + if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { + WORKITEM_FREE(newblk, D_NEWBLK); + return (1); + } + newblk->nb_freefrag = NULL; + LIST_INIT(&newblk->nb_indirdeps); + LIST_INIT(&newblk->nb_newdirblk); + LIST_INIT(&newblk->nb_jwork); + newblk->nb_state = ATTACHED; + newblk->nb_newblkno = newblkno; + LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); + *newblkpp = newblk; + return (0); +} + +/* + * Structures and routines associated with freed indirect block caching. + */ +#define INDIR_HASH(ump, blkno) \ + (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) + +/* + * Lookup an indirect block in the indir hash table. 
The freework is + * removed and potentially freed. The caller must do a blocking journal + * write before writing to the blkno. + */ +static int +indirblk_lookup(mp, blkno) + struct mount *mp; + ufs2_daddr_t blkno; +{ + struct freework *freework; + struct indir_hashhead *wkhd; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + wkhd = INDIR_HASH(ump, blkno); + TAILQ_FOREACH(freework, wkhd, fw_next) { + if (freework->fw_blkno != blkno) + continue; + indirblk_remove(freework); + return (1); + } + return (0); +} + +/* + * Insert an indirect block represented by freework into the indirblk + * hash table so that it may prevent the block from being re-used prior + * to the journal being written. + */ +static void +indirblk_insert(freework) + struct freework *freework; +{ + struct jblocks *jblocks; + struct jseg *jseg; + struct ufsmount *ump; + + ump = VFSTOUFS(freework->fw_list.wk_mp); + jblocks = ump->softdep_jblocks; + jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); + if (jseg == NULL) + return; + + LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); + TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, + fw_next); + freework->fw_state &= ~DEPCOMPLETE; +} + +static void +indirblk_remove(freework) + struct freework *freework; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(freework->fw_list.wk_mp); + LIST_REMOVE(freework, fw_segs); + TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); + freework->fw_state |= DEPCOMPLETE; + if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) + WORKITEM_FREE(freework, D_FREEWORK); +} + +/* + * Executed during filesystem system initialization before + * mounting any filesystems. + */ +void +softdep_initialize() +{ + + TAILQ_INIT(&softdepmounts); +#ifdef __LP64__ + max_softdeps = desiredvnodes * 4; +#else + max_softdeps = desiredvnodes * 2; +#endif + + /* initialise bioops hack */ + bioops.io_start = softdep_disk_io_initiation; + bioops.io_complete = softdep_disk_write_complete; + bioops.io_deallocate = softdep_deallocate_dependencies; + bioops.io_countdeps = softdep_count_dependencies; + softdep_ast_cleanup = softdep_ast_cleanup_proc; + + /* Initialize the callout with an mtx. */ + callout_init_mtx(&softdep_callout, &lk, 0); +} + +/* + * Executed after all filesystems have been unmounted during + * filesystem module unload. + */ +void +softdep_uninitialize() +{ + + /* clear bioops hack */ + bioops.io_start = NULL; + bioops.io_complete = NULL; + bioops.io_deallocate = NULL; + bioops.io_countdeps = NULL; + softdep_ast_cleanup = NULL; + + callout_drain(&softdep_callout); +} + +/* + * Called at mount time to notify the dependency code that a + * filesystem wishes to use it. 
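 *
 * softdep_mount() below starts a per-mount flusher and waits for it to
 * acknowledge startup: it sets FLUSH_STARTING, creates the kthread,
 * and msleep()s until softdep_flush() clears the flag and calls
 * wakeup().  The same handshake expressed as a userland analogue with
 * POSIX threads (illustrative only, not kernel code):
 *
 *    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;
 *    static pthread_cond_t  cv = PTHREAD_COND_INITIALIZER;
 *    static int starting;
 *
 *    (mounting thread)
 *    pthread_mutex_lock(&lk);
 *    starting = 1;
 *    pthread_create(&tid, NULL, flusher, fsdata);
 *    while (starting)
 *            pthread_cond_wait(&cv, &lk);
 *    pthread_mutex_unlock(&lk);
 *
 *    (first thing the flusher does)
 *    pthread_mutex_lock(&lk);
 *    starting = 0;
 *    pthread_cond_broadcast(&cv);
 *    pthread_mutex_unlock(&lk);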
+ */ +int +softdep_mount(devvp, mp, fs, cred) + struct vnode *devvp; + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct csum_total cstotal; + struct mount_softdeps *sdp; + struct ufsmount *ump; + struct cg *cgp; + struct buf *bp; + int i, error, cyl; + + sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, + M_WAITOK | M_ZERO); + MNT_ILOCK(mp); + mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; + if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { + mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | + MNTK_SOFTDEP | MNTK_NOASYNC; + } + ump = VFSTOUFS(mp); + ump->um_softdep = sdp; + MNT_IUNLOCK(mp); + rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock"); + sdp->sd_ump = ump; + LIST_INIT(&ump->softdep_workitem_pending); + LIST_INIT(&ump->softdep_journal_pending); + TAILQ_INIT(&ump->softdep_unlinked); + LIST_INIT(&ump->softdep_dirtycg); + ump->softdep_worklist_tail = NULL; + ump->softdep_on_worklist = 0; + ump->softdep_deps = 0; + LIST_INIT(&ump->softdep_mkdirlisthd); + ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, + &ump->pagedep_hash_size); + ump->pagedep_nextclean = 0; + ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, + &ump->inodedep_hash_size); + ump->inodedep_nextclean = 0; + ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK, + &ump->newblk_hash_size); + ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, + &ump->bmsafemap_hash_size); + i = 1 << (ffs(desiredvnodes / 10) - 1); + ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead), + M_FREEWORK, M_WAITOK); + ump->indir_hash_size = i - 1; + for (i = 0; i <= ump->indir_hash_size; i++) + TAILQ_INIT(&ump->indir_hashtbl[i]); + ACQUIRE_GBLLOCK(&lk); + TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); + FREE_GBLLOCK(&lk); + if ((fs->fs_flags & FS_SUJ) && + (error = journal_mount(mp, fs, cred)) != 0) { + printf("Failed to start journal: %d\n", error); + softdep_unmount(mp); + return (error); + } + /* + * Start our flushing thread in the bufdaemon process. + */ + ACQUIRE_LOCK(ump); + ump->softdep_flags |= FLUSH_STARTING; + FREE_LOCK(ump); + kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, + &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", + mp->mnt_stat.f_mntonname); + ACQUIRE_LOCK(ump); + while ((ump->softdep_flags & FLUSH_STARTING) != 0) { + msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", + hz / 2); + } + FREE_LOCK(ump); + /* + * When doing soft updates, the counters in the + * superblock may have gotten out of sync. Recomputation + * can take a long time and can be deferred for background + * fsck. However, the old behavior of scanning the cylinder + * groups and recalculating them at mount time is available + * by setting vfs.ffs.compute_summary_at_mount to one. 
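 *
 * For example, to force that scan on the next mount:
 *
 *    sysctl vfs.ffs.compute_summary_at_mount=1
 *
 * The loop that follows then reads every cylinder group and sums its
 * free-fragment, free-block, free-inode and directory counts into
 * cstotal before overwriting fs_cstotal with the recomputed totals.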
+ */ + if (compute_summary_at_mount == 0 || fs->fs_clean != 0) + return (0); + bzero(&cstotal, sizeof cstotal); + for (cyl = 0; cyl < fs->fs_ncg; cyl++) { + if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), + fs->fs_cgsize, cred, &bp)) != 0) { + brelse(bp); + softdep_unmount(mp); + return (error); + } + cgp = (struct cg *)bp->b_data; + cstotal.cs_nffree += cgp->cg_cs.cs_nffree; + cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; + cstotal.cs_nifree += cgp->cg_cs.cs_nifree; + cstotal.cs_ndir += cgp->cg_cs.cs_ndir; + fs->fs_cs(fs, cyl) = cgp->cg_cs; + brelse(bp); + } +#ifdef DEBUG + if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) + printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); +#endif + bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); + return (0); +} + +void +softdep_unmount(mp) + struct mount *mp; +{ + struct ufsmount *ump; +#ifdef INVARIANTS + int i; +#endif + + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_unmount called on non-softdep filesystem")); + ump = VFSTOUFS(mp); + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_SOFTDEP; + if (MOUNTEDSUJ(mp) == 0) { + MNT_IUNLOCK(mp); + } else { + mp->mnt_flag &= ~MNT_SUJ; + MNT_IUNLOCK(mp); + journal_unmount(ump); + } + /* + * Shut down our flushing thread. Check for NULL is if + * softdep_mount errors out before the thread has been created. + */ + if (ump->softdep_flushtd != NULL) { + ACQUIRE_LOCK(ump); + ump->softdep_flags |= FLUSH_EXIT; + wakeup(&ump->softdep_flushtd); + msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP, + "sdwait", 0); + KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, + ("Thread shutdown failed")); + } + /* + * Free up our resources. + */ + ACQUIRE_GBLLOCK(&lk); + TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next); + FREE_GBLLOCK(&lk); + rw_destroy(LOCK_PTR(ump)); + hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size); + hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size); + hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size); + hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP, + ump->bmsafemap_hash_size); + free(ump->indir_hashtbl, M_FREEWORK); +#ifdef INVARIANTS + for (i = 0; i <= D_LAST; i++) + KASSERT(ump->softdep_curdeps[i] == 0, + ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, + TYPENAME(i), ump->softdep_curdeps[i])); +#endif + free(ump->um_softdep, M_MOUNTDATA); +} + +static struct jblocks * +jblocks_create(void) +{ + struct jblocks *jblocks; + + jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); + TAILQ_INIT(&jblocks->jb_segs); + jblocks->jb_avail = 10; + jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + + return (jblocks); +} + +static ufs2_daddr_t +jblocks_alloc(jblocks, bytes, actual) + struct jblocks *jblocks; + int bytes; + int *actual; +{ + ufs2_daddr_t daddr; + struct jextent *jext; + int freecnt; + int blocks; + + blocks = bytes / DEV_BSIZE; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks - jblocks->jb_off; + if (freecnt == 0) { + jblocks->jb_off = 0; + if (++jblocks->jb_head > jblocks->jb_used) + jblocks->jb_head = 0; + jext = &jblocks->jb_extent[jblocks->jb_head]; + freecnt = jext->je_blocks; + } + if (freecnt > blocks) + freecnt = blocks; + *actual = freecnt * DEV_BSIZE; + daddr = jext->je_daddr + jblocks->jb_off; + jblocks->jb_off += freecnt; + jblocks->jb_free -= freecnt; + + return (daddr); +} + +static void +jblocks_free(jblocks, mp, bytes) + struct jblocks *jblocks; + struct mount *mp; + int bytes; +{ + + 
LOCK_OWNED(VFSTOUFS(mp)); + jblocks->jb_free += bytes / DEV_BSIZE; + if (jblocks->jb_suspended) + worklist_speedup(mp); + wakeup(jblocks); +} + +static void +jblocks_destroy(jblocks) + struct jblocks *jblocks; +{ + + if (jblocks->jb_extent) + free(jblocks->jb_extent, M_JBLOCKS); + free(jblocks, M_JBLOCKS); +} + +static void +jblocks_add(jblocks, daddr, blocks) + struct jblocks *jblocks; + ufs2_daddr_t daddr; + int blocks; +{ + struct jextent *jext; + + jblocks->jb_blocks += blocks; + jblocks->jb_free += blocks; + jext = &jblocks->jb_extent[jblocks->jb_used]; + /* Adding the first block. */ + if (jext->je_daddr == 0) { + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; + } + /* Extending the last extent. */ + if (jext->je_daddr + jext->je_blocks == daddr) { + jext->je_blocks += blocks; + return; + } + /* Adding a new extent. */ + if (++jblocks->jb_used == jblocks->jb_avail) { + jblocks->jb_avail *= 2; + jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, + M_JBLOCKS, M_WAITOK | M_ZERO); + memcpy(jext, jblocks->jb_extent, + sizeof(struct jextent) * jblocks->jb_used); + free(jblocks->jb_extent, M_JBLOCKS); + jblocks->jb_extent = jext; + } + jext = &jblocks->jb_extent[jblocks->jb_used]; + jext->je_daddr = daddr; + jext->je_blocks = blocks; + return; +} + +int +softdep_journal_lookup(mp, vpp) + struct mount *mp; + struct vnode **vpp; +{ + struct componentname cnp; + struct vnode *dvp; + ino_t sujournal; + int error; + + error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); + if (error) + return (error); + bzero(&cnp, sizeof(cnp)); + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN; + cnp.cn_thread = curthread; + cnp.cn_cred = curthread->td_ucred; + cnp.cn_pnbuf = SUJ_FILE; + cnp.cn_nameptr = SUJ_FILE; + cnp.cn_namelen = strlen(SUJ_FILE); + error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); + vput(dvp); + if (error != 0) + return (error); + error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); + return (error); +} + +/* + * Open and verify the journal file. + */ +static int +journal_mount(mp, fs, cred) + struct mount *mp; + struct fs *fs; + struct ucred *cred; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + struct vnode *vp; + struct inode *ip; + ufs2_daddr_t blkno; + int bcount; + int error; + int i; + + ump = VFSTOUFS(mp); + ump->softdep_journal_tail = NULL; + ump->softdep_on_journal = 0; + ump->softdep_accdeps = 0; + ump->softdep_req = 0; + ump->softdep_jblocks = NULL; + error = softdep_journal_lookup(mp, &vp); + if (error != 0) { + printf("Failed to find journal. Use tunefs to create one\n"); + return (error); + } + ip = VTOI(vp); + if (ip->i_size < SUJ_MIN) { + error = ENOSPC; + goto out; + } + bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ + jblocks = jblocks_create(); + for (i = 0; i < bcount; i++) { + error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); + if (error) + break; + jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); + } + if (error) { + jblocks_destroy(jblocks); + goto out; + } + jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ + jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ + ump->softdep_jblocks = jblocks; +out: + if (error == 0) { + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_SUJ; + mp->mnt_flag &= ~MNT_SOFTDEP; + MNT_IUNLOCK(mp); + /* + * Only validate the journal contents if the + * filesystem is clean, otherwise we write the logs + * but they'll never be used. 
If the filesystem was + * still dirty when we mounted it the journal is + * invalid and a new journal can only be valid if it + * starts from a clean mount. + */ + if (fs->fs_clean) { + DIP_SET(ip, i_modrev, fs->fs_mtime); + ip->i_flags |= IN_MODIFIED; + ffs_update(vp, 1); + } + } + vput(vp); + return (error); +} + +static void +journal_unmount(ump) + struct ufsmount *ump; +{ + + if (ump->softdep_jblocks) + jblocks_destroy(ump->softdep_jblocks); + ump->softdep_jblocks = NULL; +} + +/* + * Called when a journal record is ready to be written. Space is allocated + * and the journal entry is created when the journal is flushed to stable + * store. + */ +static void +add_to_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(wk->wk_mp); + LOCK_OWNED(ump); + if (wk->wk_state & ONWORKLIST) + panic("add_to_journal: %s(0x%X) already on list", + TYPENAME(wk->wk_type), wk->wk_state); + wk->wk_state |= ONWORKLIST | DEPCOMPLETE; + if (LIST_EMPTY(&ump->softdep_journal_pending)) { + ump->softdep_jblocks->jb_age = ticks; + LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); + } else + LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); + ump->softdep_journal_tail = wk; + ump->softdep_on_journal += 1; +} + +/* + * Remove an arbitrary item for the journal worklist maintain the tail + * pointer. This happens when a new operation obviates the need to + * journal an old operation. + */ +static void +remove_from_journal(wk) + struct worklist *wk; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(wk->wk_mp); + LOCK_OWNED(ump); +#ifdef SUJ_DEBUG + { + struct worklist *wkn; + + LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) + if (wkn == wk) + break; + if (wkn == NULL) + panic("remove_from_journal: %p is not in journal", wk); + } +#endif + /* + * We emulate a TAILQ to save space in most structures which do not + * require TAILQ semantics. Here we must update the tail position + * when removing the tail which is not the final entry. This works + * only if the worklist linkage are at the beginning of the structure. + */ + if (ump->softdep_journal_tail == wk) + ump->softdep_journal_tail = + (struct worklist *)wk->wk_list.le_prev; + WORKLIST_REMOVE(wk); + ump->softdep_on_journal -= 1; +} + +/* + * Check for journal space as well as dependency limits so the prelink + * code can throttle both journaled and non-journaled filesystems. + * Threshold is 0 for low and 1 for min. + */ +static int +journal_space(ump, thresh) + struct ufsmount *ump; + int thresh; +{ + struct jblocks *jblocks; + int limit, avail; + + jblocks = ump->softdep_jblocks; + if (jblocks == NULL) + return (1); + /* + * We use a tighter restriction here to prevent request_cleanup() + * running in threads from running into locks we currently hold. + * We have to be over the limit and our filesystem has to be + * responsible for more than our share of that usage. 
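 *
 * Concretely (illustrative numbers): each journal record is JREC_SIZE
 * (32) bytes, so the softdep_on_journal records not yet written will
 * consume softdep_on_journal * 32 / DEV_BSIZE journal blocks.  For a
 * 4MB journal, jb_free starts at 8192 DEV_BSIZE blocks, jb_low (the
 * "reserve 33%" mark set at mount) is 2730 and jb_min (the "suspend
 * at 10%" mark) is 819; space is reported available only while
 * jb_free minus the pending-record estimate stays above the requested
 * threshold.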
+ */ + limit = (max_softdeps / 10) * 9; + if (dep_current[D_INODEDEP] > limit && + ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) + return (0); + if (thresh) + thresh = jblocks->jb_min; + else + thresh = jblocks->jb_low; + avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; + avail = jblocks->jb_free - avail; + + return (avail > thresh); +} + +static void +journal_suspend(ump) + struct ufsmount *ump; +{ + struct jblocks *jblocks; + struct mount *mp; + + mp = UFSTOVFS(ump); + jblocks = ump->softdep_jblocks; + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { + stat_journal_min++; + mp->mnt_kern_flag |= MNTK_SUSPEND; + mp->mnt_susp_owner = ump->softdep_flushtd; + } + jblocks->jb_suspended = 1; + MNT_IUNLOCK(mp); +} + +static int +journal_unsuspend(struct ufsmount *ump) +{ + struct jblocks *jblocks; + struct mount *mp; + + mp = UFSTOVFS(ump); + jblocks = ump->softdep_jblocks; + + if (jblocks != NULL && jblocks->jb_suspended && + journal_space(ump, jblocks->jb_min)) { + jblocks->jb_suspended = 0; + FREE_LOCK(ump); + mp->mnt_susp_owner = curthread; + vfs_write_resume(mp, 0); + ACQUIRE_LOCK(ump); + return (1); + } + return (0); +} + +/* + * Called before any allocation function to be certain that there is + * sufficient space in the journal prior to creating any new records. + * Since in the case of block allocation we may have multiple locked + * buffers at the time of the actual allocation we can not block + * when the journal records are created. Doing so would create a deadlock + * if any of these buffers needed to be flushed to reclaim space. Instead + * we require a sufficiently large amount of available space such that + * each thread in the system could have passed this allocation check and + * still have sufficient free space. With 20% of a minimum journal size + * of 1MB we have 6553 records available. + */ +int +softdep_prealloc(vp, waitok) + struct vnode *vp; + int waitok; +{ + struct ufsmount *ump; + + KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, + ("softdep_prealloc called on non-softdep filesystem")); + /* + * Nothing to do if we are not running journaled soft updates. + * If we currently hold the snapshot lock, we must avoid + * handling other resources that could cause deadlock. Do not + * touch quotas vnode since it is typically recursed with + * other vnode locks held. + */ + if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) || + (vp->v_vflag & VV_SYSTEM) != 0) + return (0); + ump = VFSTOUFS(vp->v_mount); + ACQUIRE_LOCK(ump); + if (journal_space(ump, 0)) { + FREE_LOCK(ump); + return (0); + } + stat_journal_low++; + FREE_LOCK(ump); + if (waitok == MNT_NOWAIT) + return (ENOSPC); + /* + * Attempt to sync this vnode once to flush any journal + * work attached to it. + */ + if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) + ffs_syncvnode(vp, waitok, 0); + ACQUIRE_LOCK(ump); + process_removes(vp); + process_truncates(vp); + if (journal_space(ump, 0) == 0) { + softdep_speedup(ump); + if (journal_space(ump, 1) == 0) + journal_suspend(ump); + } + FREE_LOCK(ump); + + return (0); +} + +/* + * Before adjusting a link count on a vnode verify that we have sufficient + * journal space. If not, process operations that depend on the currently + * locked pair of vnodes to try to flush space as the syncer, buf daemon, + * and softdep flush threads can not acquire these locks to reclaim space. 
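 *
 * The "6553 records" figure quoted above softdep_prealloc() follows
 * directly from the record size: 20% of a 1MB journal is 209715 bytes
 * and, at JREC_SIZE == 32 bytes per record, 209715 / 32 == 6553.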
+ */ +static void +softdep_prelink(dvp, vp) + struct vnode *dvp; + struct vnode *vp; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(dvp->v_mount); + LOCK_OWNED(ump); + /* + * Nothing to do if we have sufficient journal space. + * If we currently hold the snapshot lock, we must avoid + * handling other resources that could cause deadlock. + */ + if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) + return; + stat_journal_low++; + FREE_LOCK(ump); + if (vp) + ffs_syncvnode(vp, MNT_NOWAIT, 0); + ffs_syncvnode(dvp, MNT_WAIT, 0); + ACQUIRE_LOCK(ump); + /* Process vp before dvp as it may create .. removes. */ + if (vp) { + process_removes(vp); + process_truncates(vp); + } + process_removes(dvp); + process_truncates(dvp); + softdep_speedup(ump); + process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); + if (journal_space(ump, 0) == 0) { + softdep_speedup(ump); + if (journal_space(ump, 1) == 0) + journal_suspend(ump); + } +} + +static void +jseg_write(ump, jseg, data) + struct ufsmount *ump; + struct jseg *jseg; + uint8_t *data; +{ + struct jsegrec *rec; + + rec = (struct jsegrec *)data; + rec->jsr_seq = jseg->js_seq; + rec->jsr_oldest = jseg->js_oldseq; + rec->jsr_cnt = jseg->js_cnt; + rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; + rec->jsr_crc = 0; + rec->jsr_time = ump->um_fs->fs_mtime; +} + +static inline void +inoref_write(inoref, jseg, rec) + struct inoref *inoref; + struct jseg *jseg; + struct jrefrec *rec; +{ + + inoref->if_jsegdep->jd_seg = jseg; + rec->jr_ino = inoref->if_ino; + rec->jr_parent = inoref->if_parent; + rec->jr_nlink = inoref->if_nlink; + rec->jr_mode = inoref->if_mode; + rec->jr_diroff = inoref->if_diroff; +} + +static void +jaddref_write(jaddref, jseg, data) + struct jaddref *jaddref; + struct jseg *jseg; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_ADDREF; + inoref_write(&jaddref->ja_ref, jseg, rec); +} + +static void +jremref_write(jremref, jseg, data) + struct jremref *jremref; + struct jseg *jseg; + uint8_t *data; +{ + struct jrefrec *rec; + + rec = (struct jrefrec *)data; + rec->jr_op = JOP_REMREF; + inoref_write(&jremref->jr_ref, jseg, rec); +} + +static void +jmvref_write(jmvref, jseg, data) + struct jmvref *jmvref; + struct jseg *jseg; + uint8_t *data; +{ + struct jmvrec *rec; + + rec = (struct jmvrec *)data; + rec->jm_op = JOP_MVREF; + rec->jm_ino = jmvref->jm_ino; + rec->jm_parent = jmvref->jm_parent; + rec->jm_oldoff = jmvref->jm_oldoff; + rec->jm_newoff = jmvref->jm_newoff; +} + +static void +jnewblk_write(jnewblk, jseg, data) + struct jnewblk *jnewblk; + struct jseg *jseg; + uint8_t *data; +{ + struct jblkrec *rec; + + jnewblk->jn_jsegdep->jd_seg = jseg; + rec = (struct jblkrec *)data; + rec->jb_op = JOP_NEWBLK; + rec->jb_ino = jnewblk->jn_ino; + rec->jb_blkno = jnewblk->jn_blkno; + rec->jb_lbn = jnewblk->jn_lbn; + rec->jb_frags = jnewblk->jn_frags; + rec->jb_oldfrags = jnewblk->jn_oldfrags; +} + +static void +jfreeblk_write(jfreeblk, jseg, data) + struct jfreeblk *jfreeblk; + struct jseg *jseg; + uint8_t *data; +{ + struct jblkrec *rec; + + jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreeblk->jf_ino; + rec->jb_blkno = jfreeblk->jf_blkno; + rec->jb_lbn = jfreeblk->jf_lbn; + rec->jb_frags = jfreeblk->jf_frags; + rec->jb_oldfrags = 0; +} + +static void +jfreefrag_write(jfreefrag, jseg, data) + struct jfreefrag *jfreefrag; + struct jseg *jseg; + uint8_t *data; +{ + struct jblkrec *rec; + + 
jfreefrag->fr_jsegdep->jd_seg = jseg; + rec = (struct jblkrec *)data; + rec->jb_op = JOP_FREEBLK; + rec->jb_ino = jfreefrag->fr_ino; + rec->jb_blkno = jfreefrag->fr_blkno; + rec->jb_lbn = jfreefrag->fr_lbn; + rec->jb_frags = jfreefrag->fr_frags; + rec->jb_oldfrags = 0; +} + +static void +jtrunc_write(jtrunc, jseg, data) + struct jtrunc *jtrunc; + struct jseg *jseg; + uint8_t *data; +{ + struct jtrncrec *rec; + + jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; + rec = (struct jtrncrec *)data; + rec->jt_op = JOP_TRUNC; + rec->jt_ino = jtrunc->jt_ino; + rec->jt_size = jtrunc->jt_size; + rec->jt_extsize = jtrunc->jt_extsize; +} + +static void +jfsync_write(jfsync, jseg, data) + struct jfsync *jfsync; + struct jseg *jseg; + uint8_t *data; +{ + struct jtrncrec *rec; + + rec = (struct jtrncrec *)data; + rec->jt_op = JOP_SYNC; + rec->jt_ino = jfsync->jfs_ino; + rec->jt_size = jfsync->jfs_size; + rec->jt_extsize = jfsync->jfs_extsize; +} + +static void +softdep_flushjournal(mp) + struct mount *mp; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + + if (MOUNTEDSUJ(mp) == 0) + return; + ump = VFSTOUFS(mp); + jblocks = ump->softdep_jblocks; + ACQUIRE_LOCK(ump); + while (ump->softdep_on_journal) { + jblocks->jb_needseg = 1; + softdep_process_journal(mp, NULL, MNT_WAIT); + } + FREE_LOCK(ump); +} + +static void softdep_synchronize_completed(struct bio *); +static void softdep_synchronize(struct bio *, struct ufsmount *, void *); + +static void +softdep_synchronize_completed(bp) + struct bio *bp; +{ + struct jseg *oldest; + struct jseg *jseg; + struct ufsmount *ump; + + /* + * caller1 marks the last segment written before we issued the + * synchronize cache. + */ + jseg = bp->bio_caller1; + if (jseg == NULL) { + g_destroy_bio(bp); + return; + } + ump = VFSTOUFS(jseg->js_list.wk_mp); + ACQUIRE_LOCK(ump); + oldest = NULL; + /* + * Mark all the journal entries waiting on the synchronize cache + * as completed so they may continue on. + */ + while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { + jseg->js_state |= COMPLETE; + oldest = jseg; + jseg = TAILQ_PREV(jseg, jseglst, js_next); + } + /* + * Restart deferred journal entry processing from the oldest + * completed jseg. + */ + if (oldest) + complete_jsegs(oldest); + + FREE_LOCK(ump); + g_destroy_bio(bp); +} + +/* + * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering + * barriers. The journal must be written prior to any blocks that depend + * on it and the journal can not be released until the blocks have be + * written. This code handles both barriers simultaneously. + */ +static void +softdep_synchronize(bp, ump, caller1) + struct bio *bp; + struct ufsmount *ump; + void *caller1; +{ + + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = ump->um_cp->provider->mediasize; + bp->bio_length = 0; + bp->bio_done = softdep_synchronize_completed; + bp->bio_caller1 = caller1; + g_io_request(bp, + (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private); +} + +/* + * Flush some journal records to disk. + */ +static void +softdep_process_journal(mp, needwk, flags) + struct mount *mp; + struct worklist *needwk; + int flags; +{ + struct jblocks *jblocks; + struct ufsmount *ump; + struct worklist *wk; + struct jseg *jseg; + struct buf *bp; + struct bio *bio; + uint8_t *data; + struct fs *fs; + int shouldflush; + int segwritten; + int jrecmin; /* Minimum records per block. */ + int jrecmax; /* Maximum records per block. 
*/ + int size; + int cnt; + int off; + int devbsize; + + if (MOUNTEDSUJ(mp) == 0) + return; + shouldflush = softdep_flushcache; + bio = NULL; + jseg = NULL; + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + fs = ump->um_fs; + jblocks = ump->softdep_jblocks; + devbsize = ump->um_devvp->v_bufobj.bo_bsize; + /* + * We write anywhere between a disk block and fs block. The upper + * bound is picked to prevent buffer cache fragmentation and limit + * processing time per I/O. + */ + jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ + jrecmax = (fs->fs_bsize / devbsize) * jrecmin; + segwritten = 0; + for (;;) { + cnt = ump->softdep_on_journal; + /* + * Criteria for writing a segment: + * 1) We have a full block. + * 2) We're called from jwait() and haven't found the + * journal item yet. + * 3) Always write if needseg is set. + * 4) If we are called from process_worklist and have + * not yet written anything we write a partial block + * to enforce a 1 second maximum latency on journal + * entries. + */ + if (cnt < (jrecmax - 1) && needwk == NULL && + jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) + break; + cnt++; + /* + * Verify some free journal space. softdep_prealloc() should + * guarantee that we don't run out so this is indicative of + * a problem with the flow control. Try to recover + * gracefully in any event. + */ + while (jblocks->jb_free == 0) { + if (flags != MNT_WAIT) + break; + printf("softdep: Out of journal space!\n"); + softdep_speedup(ump); + msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz); + } + FREE_LOCK(ump); + jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); + workitem_alloc(&jseg->js_list, D_JSEG, mp); + LIST_INIT(&jseg->js_entries); + LIST_INIT(&jseg->js_indirs); + jseg->js_state = ATTACHED; + if (shouldflush == 0) + jseg->js_state |= COMPLETE; + else if (bio == NULL) + bio = g_alloc_bio(); + jseg->js_jblocks = jblocks; + bp = geteblk(fs->fs_bsize, 0); + ACQUIRE_LOCK(ump); + /* + * If there was a race while we were allocating the block + * and jseg the entry we care about was likely written. + * We bail out in both the WAIT and NOWAIT case and assume + * the caller will loop if the entry it cares about is + * not written. + */ + cnt = ump->softdep_on_journal; + if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { + bp->b_flags |= B_INVAL | B_NOCACHE; + WORKITEM_FREE(jseg, D_JSEG); + FREE_LOCK(ump); + brelse(bp); + ACQUIRE_LOCK(ump); + break; + } + /* + * Calculate the disk block size required for the available + * records rounded to the min size. + */ + if (cnt == 0) + size = devbsize; + else if (cnt < jrecmax) + size = howmany(cnt, jrecmin) * devbsize; + else + size = fs->fs_bsize; + /* + * Allocate a disk block for this journal data and account + * for truncation of the requested size if enough contiguous + * space was not available. + */ + bp->b_blkno = jblocks_alloc(jblocks, size, &size); + bp->b_lblkno = bp->b_blkno; + bp->b_offset = bp->b_blkno * DEV_BSIZE; + bp->b_bcount = size; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; + /* + * Initialize our jseg with cnt records. Assign the next + * sequence number to it and link it in-order. + */ + cnt = MIN(cnt, (size / devbsize) * jrecmin); + jseg->js_buf = bp; + jseg->js_cnt = cnt; + jseg->js_refs = cnt + 1; /* Self ref. 
*/ + jseg->js_size = size; + jseg->js_seq = jblocks->jb_nextseq++; + if (jblocks->jb_oldestseg == NULL) + jblocks->jb_oldestseg = jseg; + jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; + TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); + if (jblocks->jb_writeseg == NULL) + jblocks->jb_writeseg = jseg; + /* + * Start filling in records from the pending list. + */ + data = bp->b_data; + off = 0; + + /* + * Always put a header on the first block. + * XXX As with below, there might not be a chance to get + * into the loop. Ensure that something valid is written. + */ + jseg_write(ump, jseg, data); + off += JREC_SIZE; + data = bp->b_data + off; + + /* + * XXX Something is wrong here. There's no work to do, + * but we need to perform and I/O and allow it to complete + * anyways. + */ + if (LIST_EMPTY(&ump->softdep_journal_pending)) + stat_emptyjblocks++; + + while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) + != NULL) { + if (cnt == 0) + break; + /* Place a segment header on every device block. */ + if ((off % devbsize) == 0) { + jseg_write(ump, jseg, data); + off += JREC_SIZE; + data = bp->b_data + off; + } + if (wk == needwk) + needwk = NULL; + remove_from_journal(wk); + wk->wk_state |= INPROGRESS; + WORKLIST_INSERT(&jseg->js_entries, wk); + switch (wk->wk_type) { + case D_JADDREF: + jaddref_write(WK_JADDREF(wk), jseg, data); + break; + case D_JREMREF: + jremref_write(WK_JREMREF(wk), jseg, data); + break; + case D_JMVREF: + jmvref_write(WK_JMVREF(wk), jseg, data); + break; + case D_JNEWBLK: + jnewblk_write(WK_JNEWBLK(wk), jseg, data); + break; + case D_JFREEBLK: + jfreeblk_write(WK_JFREEBLK(wk), jseg, data); + break; + case D_JFREEFRAG: + jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); + break; + case D_JTRUNC: + jtrunc_write(WK_JTRUNC(wk), jseg, data); + break; + case D_JFSYNC: + jfsync_write(WK_JFSYNC(wk), jseg, data); + break; + default: + panic("process_journal: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + off += JREC_SIZE; + data = bp->b_data + off; + cnt--; + } + + /* Clear any remaining space so we don't leak kernel data */ + if (size > off) + bzero(data, size - off); + + /* + * Write this one buffer and continue. + */ + segwritten = 1; + jblocks->jb_needseg = 0; + WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); + FREE_LOCK(ump); + pbgetvp(ump->um_devvp, bp); + /* + * We only do the blocking wait once we find the journal + * entry we're looking for. + */ + if (needwk == NULL && flags == MNT_WAIT) + bwrite(bp); + else + bawrite(bp); + ACQUIRE_LOCK(ump); + } + /* + * If we wrote a segment issue a synchronize cache so the journal + * is reflected on disk before the data is written. Since reclaiming + * journal space also requires writing a journal record this + * process also enforces a barrier before reclamation. + */ + if (segwritten && shouldflush) { + softdep_synchronize(bio, ump, + TAILQ_LAST(&jblocks->jb_segs, jseglst)); + } else if (bio) + g_destroy_bio(bio); + /* + * If we've suspended the filesystem because we ran out of journal + * space either try to sync it here to make some progress or + * unsuspend it if we already have. + */ + if (flags == 0 && jblocks->jb_suspended) { + if (journal_unsuspend(ump)) + return; + FREE_LOCK(ump); + VFS_SYNC(mp, MNT_NOWAIT); + ffs_sbupdate(ump, MNT_WAIT, 0); + ACQUIRE_LOCK(ump); + } +} + +/* + * Complete a jseg, allowing all dependencies awaiting journal writes + * to proceed. Each journal dependency also attaches a jsegdep to dependent + * structures so that the journal segment can be freed to reclaim space. 
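 *
 * The jseg itself is reference counted: softdep_process_journal() set
 * js_refs to the record count plus one self-reference, each record's
 * reference is released either directly in complete_jseg() (JMVREF,
 * JFSYNC) or later when its jsegdep is freed, and the self-reference
 * is dropped last so the segment cannot go away while its entries are
 * still being dispatched.  The shape of the idiom (placeholder names):
 *
 *    obj->refs = nitems + 1;
 *    for (i = 0; i < nitems; i++)
 *            dispatch(obj, i);            (each item eventually
 *                                          calls rele(obj))
 *    rele(obj);                           (drop the self-reference)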
+ */ +static void +complete_jseg(jseg) + struct jseg *jseg; +{ + struct worklist *wk; + struct jmvref *jmvref; +#ifdef INVARIANTS + int i = 0; +#endif + + while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { + WORKLIST_REMOVE(wk); + wk->wk_state &= ~INPROGRESS; + wk->wk_state |= COMPLETE; + KASSERT(i++ < jseg->js_cnt, + ("handle_written_jseg: overflow %d >= %d", + i - 1, jseg->js_cnt)); + switch (wk->wk_type) { + case D_JADDREF: + handle_written_jaddref(WK_JADDREF(wk)); + break; + case D_JREMREF: + handle_written_jremref(WK_JREMREF(wk)); + break; + case D_JMVREF: + rele_jseg(jseg); /* No jsegdep. */ + jmvref = WK_JMVREF(wk); + LIST_REMOVE(jmvref, jm_deps); + if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) + free_pagedep(jmvref->jm_pagedep); + WORKITEM_FREE(jmvref, D_JMVREF); + break; + case D_JNEWBLK: + handle_written_jnewblk(WK_JNEWBLK(wk)); + break; + case D_JFREEBLK: + handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); + break; + case D_JTRUNC: + handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); + break; + case D_JFSYNC: + rele_jseg(jseg); /* No jsegdep. */ + WORKITEM_FREE(wk, D_JFSYNC); + break; + case D_JFREEFRAG: + handle_written_jfreefrag(WK_JFREEFRAG(wk)); + break; + default: + panic("handle_written_jseg: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + /* Release the self reference so the structure may be freed. */ + rele_jseg(jseg); +} + +/* + * Determine which jsegs are ready for completion processing. Waits for + * synchronize cache to complete as well as forcing in-order completion + * of journal entries. + */ +static void +complete_jsegs(jseg) + struct jseg *jseg; +{ + struct jblocks *jblocks; + struct jseg *jsegn; + + jblocks = jseg->js_jblocks; + /* + * Don't allow out of order completions. If this isn't the first + * block wait for it to write before we're done. + */ + if (jseg != jblocks->jb_writeseg) + return; + /* Iterate through available jsegs processing their entries. */ + while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { + jblocks->jb_oldestwrseq = jseg->js_oldseq; + jsegn = TAILQ_NEXT(jseg, js_next); + complete_jseg(jseg); + jseg = jsegn; + } + jblocks->jb_writeseg = jseg; + /* + * Attempt to free jsegs now that oldestwrseq may have advanced. + */ + free_jsegs(jblocks); +} + +/* + * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle + * the final completions. + */ +static void +handle_written_jseg(jseg, bp) + struct jseg *jseg; + struct buf *bp; +{ + + if (jseg->js_refs == 0) + panic("handle_written_jseg: No self-reference on %p", jseg); + jseg->js_state |= DEPCOMPLETE; + /* + * We'll never need this buffer again, set flags so it will be + * discarded. + */ + bp->b_flags |= B_INVAL | B_NOCACHE; + pbrelvp(bp); + complete_jsegs(jseg); +} + +static inline struct jsegdep * +inoref_jseg(inoref) + struct inoref *inoref; +{ + struct jsegdep *jsegdep; + + jsegdep = inoref->if_jsegdep; + inoref->if_jsegdep = NULL; + + return (jsegdep); +} + +/* + * Called once a jremref has made it to stable store. The jremref is marked + * complete and we attempt to free it. Any pagedeps writes sleeping waiting + * for the jremref to complete will be awoken by free_jremref. + */ +static void +handle_written_jremref(jremref) + struct jremref *jremref; +{ + struct inodedep *inodedep; + struct jsegdep *jsegdep; + struct dirrem *dirrem; + + /* Grab the jsegdep. */ + jsegdep = inoref_jseg(&jremref->jr_ref); + /* + * Remove us from the inoref list. 
+ */ + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, + 0, &inodedep) == 0) + panic("handle_written_jremref: Lost inodedep"); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + /* + * Complete the dirrem. + */ + dirrem = jremref->jr_dirrem; + jremref->jr_dirrem = NULL; + LIST_REMOVE(jremref, jr_deps); + jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; + jwork_insert(&dirrem->dm_jwork, jsegdep); + if (LIST_EMPTY(&dirrem->dm_jremrefhd) && + (dirrem->dm_state & COMPLETE) != 0) + add_to_worklist(&dirrem->dm_list, 0); + free_jremref(jremref); +} + +/* + * Called once a jaddref has made it to stable store. The dependency is + * marked complete and any dependent structures are added to the inode + * bufwait list to be completed as soon as it is written. If a bitmap write + * depends on this entry we move the inode into the inodedephd of the + * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. + */ +static void +handle_written_jaddref(jaddref) + struct jaddref *jaddref; +{ + struct jsegdep *jsegdep; + struct inodedep *inodedep; + struct diradd *diradd; + struct mkdir *mkdir; + + /* Grab the jsegdep. */ + jsegdep = inoref_jseg(&jaddref->ja_ref); + mkdir = NULL; + diradd = NULL; + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("handle_written_jaddref: Lost inodedep."); + if (jaddref->ja_diradd == NULL) + panic("handle_written_jaddref: No dependency"); + if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { + diradd = jaddref->ja_diradd; + WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); + } else if (jaddref->ja_state & MKDIR_PARENT) { + mkdir = jaddref->ja_mkdir; + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); + } else if (jaddref->ja_state & MKDIR_BODY) + mkdir = jaddref->ja_mkdir; + else + panic("handle_written_jaddref: Unknown dependency %p", + jaddref->ja_diradd); + jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ + /* + * Remove us from the inode list. + */ + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); + /* + * The mkdir may be waiting on the jaddref to clear before freeing. + */ + if (mkdir) { + KASSERT(mkdir->md_list.wk_type == D_MKDIR, + ("handle_written_jaddref: Incorrect type for mkdir %s", + TYPENAME(mkdir->md_list.wk_type))); + mkdir->md_jaddref = NULL; + diradd = mkdir->md_diradd; + mkdir->md_state |= DEPCOMPLETE; + complete_mkdir(mkdir); + } + jwork_insert(&diradd->da_jwork, jsegdep); + if (jaddref->ja_state & NEWBLOCK) { + inodedep->id_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, + inodedep, id_deps); + } + free_jaddref(jaddref); +} + +/* + * Called once a jnewblk journal is written. The allocdirect or allocindir + * is placed in the bmsafemap to await notification of a written bitmap. If + * the operation was canceled we add the segdep to the appropriate + * dependency to free the journal space once the canceling operation + * completes. + */ +static void +handle_written_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + struct bmsafemap *bmsafemap; + struct freefrag *freefrag; + struct freework *freework; + struct jsegdep *jsegdep; + struct newblk *newblk; + + /* Grab the jsegdep. 
*/ + jsegdep = jnewblk->jn_jsegdep; + jnewblk->jn_jsegdep = NULL; + if (jnewblk->jn_dep == NULL) + panic("handle_written_jnewblk: No dependency for the segdep."); + switch (jnewblk->jn_dep->wk_type) { + case D_NEWBLK: + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + /* + * Add the written block to the bmsafemap so it can + * be notified when the bitmap is on disk. + */ + newblk = WK_NEWBLK(jnewblk->jn_dep); + newblk->nb_jnewblk = NULL; + if ((newblk->nb_state & GOINGAWAY) == 0) { + bmsafemap = newblk->nb_bmsafemap; + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, + nb_deps); + } + jwork_insert(&newblk->nb_jwork, jsegdep); + break; + case D_FREEFRAG: + /* + * A newblock being removed by a freefrag when replaced by + * frag extension. + */ + freefrag = WK_FREEFRAG(jnewblk->jn_dep); + freefrag->ff_jdep = NULL; + jwork_insert(&freefrag->ff_jwork, jsegdep); + break; + case D_FREEWORK: + /* + * A direct block was removed by truncate. + */ + freework = WK_FREEWORK(jnewblk->jn_dep); + freework->fw_jnewblk = NULL; + jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); + break; + default: + panic("handle_written_jnewblk: Unknown type %d.", + jnewblk->jn_dep->wk_type); + } + jnewblk->jn_dep = NULL; + free_jnewblk(jnewblk); +} + +/* + * Cancel a jfreefrag that won't be needed, probably due to colliding with + * an in-flight allocation that has not yet been committed. Divorce us + * from the freefrag and mark it DEPCOMPLETE so that it may be added + * to the worklist. + */ +static void +cancel_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct freefrag *freefrag; + + if (jfreefrag->fr_jsegdep) { + free_jsegdep(jfreefrag->fr_jsegdep); + jfreefrag->fr_jsegdep = NULL; + } + freefrag = jfreefrag->fr_freefrag; + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); + freefrag->ff_state |= DEPCOMPLETE; + CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); +} + +/* + * Free a jfreefrag when the parent freefrag is rendered obsolete. + */ +static void +free_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + + if (jfreefrag->fr_state & INPROGRESS) + WORKLIST_REMOVE(&jfreefrag->fr_list); + else if (jfreefrag->fr_state & ONWORKLIST) + remove_from_journal(&jfreefrag->fr_list); + if (jfreefrag->fr_freefrag != NULL) + panic("free_jfreefrag: Still attached to a freefrag."); + WORKITEM_FREE(jfreefrag, D_JFREEFRAG); +} + +/* + * Called when the journal write for a jfreefrag completes. The parent + * freefrag is added to the worklist if this completes its dependencies. + */ +static void +handle_written_jfreefrag(jfreefrag) + struct jfreefrag *jfreefrag; +{ + struct jsegdep *jsegdep; + struct freefrag *freefrag; + + /* Grab the jsegdep. */ + jsegdep = jfreefrag->fr_jsegdep; + jfreefrag->fr_jsegdep = NULL; + freefrag = jfreefrag->fr_freefrag; + if (freefrag == NULL) + panic("handle_written_jfreefrag: No freefrag."); + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jdep = NULL; + jwork_insert(&freefrag->ff_jwork, jsegdep); + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); + jfreefrag->fr_freefrag = NULL; + free_jfreefrag(jfreefrag); +} + +/* + * Called when the journal write for a jfreeblk completes. The jfreeblk + * is removed from the freeblks list of pending journal writes and the + * jsegdep is moved to the freeblks jwork to be completed when all blocks + * have been reclaimed. 
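+ * If this was the last outstanding journal dependency and the freeblks
+ * has otherwise completed, the freeblks itself is queued to the worklist
+ * below so that the freed blocks can finally be returned to the bitmaps.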
+ */ +static void +handle_written_jblkdep(jblkdep) + struct jblkdep *jblkdep; +{ + struct freeblks *freeblks; + struct jsegdep *jsegdep; + + /* Grab the jsegdep. */ + jsegdep = jblkdep->jb_jsegdep; + jblkdep->jb_jsegdep = NULL; + freeblks = jblkdep->jb_freeblks; + LIST_REMOVE(jblkdep, jb_deps); + jwork_insert(&freeblks->fb_jwork, jsegdep); + /* + * If the freeblks is all journaled, we can add it to the worklist. + */ + if (LIST_EMPTY(&freeblks->fb_jblkdephd) && + (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freeblks->fb_list, WK_NODELAY); + + free_jblkdep(jblkdep); +} + +static struct jsegdep * +newjsegdep(struct worklist *wk) +{ + struct jsegdep *jsegdep; + + jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); + jsegdep->jd_seg = NULL; + + return (jsegdep); +} + +static struct jmvref * +newjmvref(dp, ino, oldoff, newoff) + struct inode *dp; + ino_t ino; + off_t oldoff; + off_t newoff; +{ + struct jmvref *jmvref; + + jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); + jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; + jmvref->jm_parent = dp->i_number; + jmvref->jm_ino = ino; + jmvref->jm_oldoff = oldoff; + jmvref->jm_newoff = newoff; + + return (jmvref); +} + +/* + * Allocate a new jremref that tracks the removal of ip from dp with the + * directory entry offset of diroff. Mark the entry as ATTACHED and + * DEPCOMPLETE as we have all the information required for the journal write + * and the directory has already been removed from the buffer. The caller + * is responsible for linking the jremref into the pagedep and adding it + * to the journal to write. The MKDIR_PARENT flag is set if we're doing + * a DOTDOT addition so handle_workitem_remove() can properly assign + * the jsegdep when we're done. + */ +static struct jremref * +newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, + off_t diroff, nlink_t nlink) +{ + struct jremref *jremref; + + jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp)); + jremref->jr_state = ATTACHED; + newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, + nlink, ip->i_mode); + jremref->jr_dirrem = dirrem; + + return (jremref); +} + +static inline void +newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, + nlink_t nlink, uint16_t mode) +{ + + inoref->if_jsegdep = newjsegdep(&inoref->if_list); + inoref->if_diroff = diroff; + inoref->if_ino = ino; + inoref->if_parent = parent; + inoref->if_nlink = nlink; + inoref->if_mode = mode; +} + +/* + * Allocate a new jaddref to track the addition of ino to dp at diroff. The + * directory offset may not be known until later. The caller is responsible + * adding the entry to the journal when this information is available. nlink + * should be the link count prior to the addition and mode is only required + * to have the correct FMT. + */ +static struct jaddref * +newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, + uint16_t mode) +{ + struct jaddref *jaddref; + + jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); + workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp)); + jaddref->ja_state = ATTACHED; + jaddref->ja_mkdir = NULL; + newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); + + return (jaddref); +} + +/* + * Create a new free dependency for a freework. 
The caller is responsible + * for adjusting the reference count when it has the lock held. The freedep + * will track an outstanding bitmap write that will ultimately clear the + * freework to continue. + */ +static struct freedep * +newfreedep(struct freework *freework) +{ + struct freedep *freedep; + + freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); + freedep->fd_freework = freework; + + return (freedep); +} + +/* + * Free a freedep structure once the buffer it is linked to is written. If + * this is the last reference to the freework schedule it for completion. + */ +static void +free_freedep(freedep) + struct freedep *freedep; +{ + struct freework *freework; + + freework = freedep->fd_freework; + freework->fw_freeblks->fb_cgwait--; + if (--freework->fw_ref == 0) + freework_enqueue(freework); + WORKITEM_FREE(freedep, D_FREEDEP); +} + +/* + * Allocate a new freework structure that may be a level in an indirect + * when parent is not NULL or a top level block when it is. The top level + * freework structures are allocated without the per-filesystem lock held + * and before the freeblks is visible outside of softdep_setup_freeblocks(). + */ +static struct freework * +newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) + struct ufsmount *ump; + struct freeblks *freeblks; + struct freework *parent; + ufs_lbn_t lbn; + ufs2_daddr_t nb; + int frags; + int off; + int journal; +{ + struct freework *freework; + + freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); + workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); + freework->fw_state = ATTACHED; + freework->fw_jnewblk = NULL; + freework->fw_freeblks = freeblks; + freework->fw_parent = parent; + freework->fw_lbn = lbn; + freework->fw_blkno = nb; + freework->fw_frags = frags; + freework->fw_indir = NULL; + freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) + ? 0 : NINDIR(ump->um_fs) + 1; + freework->fw_start = freework->fw_off = off; + if (journal) + newjfreeblk(freeblks, lbn, nb, frags); + if (parent == NULL) { + ACQUIRE_LOCK(ump); + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); + freeblks->fb_ref++; + FREE_LOCK(ump); + } + + return (freework); +} + +/* + * Eliminate a jfreeblk for a block that does not need journaling. + */ +static void +cancel_jfreeblk(freeblks, blkno) + struct freeblks *freeblks; + ufs2_daddr_t blkno; +{ + struct jfreeblk *jfreeblk; + struct jblkdep *jblkdep; + + LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { + if (jblkdep->jb_list.wk_type != D_JFREEBLK) + continue; + jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); + if (jfreeblk->jf_blkno == blkno) + break; + } + if (jblkdep == NULL) + return; + CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); + free_jsegdep(jblkdep->jb_jsegdep); + LIST_REMOVE(jblkdep, jb_deps); + WORKITEM_FREE(jfreeblk, D_JFREEBLK); +} + +/* + * Allocate a new jfreeblk to journal top level block pointer when truncating + * a file. The caller must add this to the worklist when the per-filesystem + * lock is held. 
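+ * The new jfreeblk is linked onto the freeblks' fb_jblkdephd list, where
+ * it remains until handle_written_jblkdep() removes it once the journal
+ * record has reached the disk.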
+ */ +static struct jfreeblk * +newjfreeblk(freeblks, lbn, blkno, frags) + struct freeblks *freeblks; + ufs_lbn_t lbn; + ufs2_daddr_t blkno; + int frags; +{ + struct jfreeblk *jfreeblk; + + jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, + freeblks->fb_list.wk_mp); + jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); + jfreeblk->jf_dep.jb_freeblks = freeblks; + jfreeblk->jf_ino = freeblks->fb_inum; + jfreeblk->jf_lbn = lbn; + jfreeblk->jf_blkno = blkno; + jfreeblk->jf_frags = frags; + LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); + + return (jfreeblk); +} + +/* + * The journal is only prepared to handle full-size block numbers, so we + * have to adjust the record to reflect the change to a full-size block. + * For example, suppose we have a block made up of fragments 8-15 and + * want to free its last two fragments. We are given a request that says: + * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 + * where frags are the number of fragments to free and oldfrags are the + * number of fragments to keep. To block align it, we have to change it to + * have a valid full-size blkno, so it becomes: + * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 + */ +static void +adjust_newfreework(freeblks, frag_offset) + struct freeblks *freeblks; + int frag_offset; +{ + struct jfreeblk *jfreeblk; + + KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && + LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), + ("adjust_newfreework: Missing freeblks dependency")); + + jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); + jfreeblk->jf_blkno -= frag_offset; + jfreeblk->jf_frags += frag_offset; +} + +/* + * Allocate a new jtrunc to track a partial truncation. + */ +static struct jtrunc * +newjtrunc(freeblks, size, extsize) + struct freeblks *freeblks; + off_t size; + int extsize; +{ + struct jtrunc *jtrunc; + + jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); + workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, + freeblks->fb_list.wk_mp); + jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); + jtrunc->jt_dep.jb_freeblks = freeblks; + jtrunc->jt_ino = freeblks->fb_inum; + jtrunc->jt_size = size; + jtrunc->jt_extsize = extsize; + LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); + + return (jtrunc); +} + +/* + * If we're canceling a new bitmap we have to search for another ref + * to move into the bmsafemap dep. This might be better expressed + * with another structure. + */ +static void +move_newblock_dep(jaddref, inodedep) + struct jaddref *jaddref; + struct inodedep *inodedep; +{ + struct inoref *inoref; + struct jaddref *jaddrefn; + + jaddrefn = NULL; + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) { + if ((jaddref->ja_state & NEWBLOCK) && + inoref->if_list.wk_type == D_JADDREF) { + jaddrefn = (struct jaddref *)inoref; + break; + } + } + if (jaddrefn == NULL) + return; + jaddrefn->ja_state &= ~(ATTACHED | UNDONE); + jaddrefn->ja_state |= jaddref->ja_state & + (ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); + jaddref->ja_state |= ATTACHED; + LIST_REMOVE(jaddref, ja_bmdeps); + LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, + ja_bmdeps); +} + +/* + * Cancel a jaddref either before it has been written or while it is being + * written. This happens when a link is removed before the add reaches + * the disk. 
The jaddref dependency is kept linked into the bmsafemap + * and inode to prevent the link count or bitmap from reaching the disk + * until handle_workitem_remove() re-adjusts the counts and bitmaps as + * required. + * + * Returns 1 if the canceled addref requires journaling of the remove and + * 0 otherwise. + */ +static int +cancel_jaddref(jaddref, inodedep, wkhd) + struct jaddref *jaddref; + struct inodedep *inodedep; + struct workhead *wkhd; +{ + struct inoref *inoref; + struct jsegdep *jsegdep; + int needsj; + + KASSERT((jaddref->ja_state & COMPLETE) == 0, + ("cancel_jaddref: Canceling complete jaddref")); + if (jaddref->ja_state & (INPROGRESS | COMPLETE)) + needsj = 1; + else + needsj = 0; + if (inodedep == NULL) + if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, + 0, &inodedep) == 0) + panic("cancel_jaddref: Lost inodedep"); + /* + * We must adjust the nlink of any reference operation that follows + * us so that it is consistent with the in-memory reference. This + * ensures that inode nlink rollbacks always have the correct link. + */ + if (needsj == 0) { + for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; + inoref = TAILQ_NEXT(inoref, if_deps)) { + if (inoref->if_state & GOINGAWAY) + break; + inoref->if_nlink--; + } + } + jsegdep = inoref_jseg(&jaddref->ja_ref); + if (jaddref->ja_state & NEWBLOCK) + move_newblock_dep(jaddref, inodedep); + wake_worklist(&jaddref->ja_list); + jaddref->ja_mkdir = NULL; + if (jaddref->ja_state & INPROGRESS) { + jaddref->ja_state &= ~INPROGRESS; + WORKLIST_REMOVE(&jaddref->ja_list); + jwork_insert(wkhd, jsegdep); + } else { + free_jsegdep(jsegdep); + if (jaddref->ja_state & DEPCOMPLETE) + remove_from_journal(&jaddref->ja_list); + } + jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); + /* + * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove + * can arrange for them to be freed with the bitmap. Otherwise we + * no longer need this addref attached to the inoreflst and it + * will incorrectly adjust nlink if we leave it. + */ + if ((jaddref->ja_state & NEWBLOCK) == 0) { + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + jaddref->ja_state |= COMPLETE; + free_jaddref(jaddref); + return (needsj); + } + /* + * Leave the head of the list for jsegdeps for fast merging. + */ + if (LIST_FIRST(wkhd) != NULL) { + jaddref->ja_state |= ONWORKLIST; + LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); + } else + WORKLIST_INSERT(wkhd, &jaddref->ja_list); + + return (needsj); +} + +/* + * Attempt to free a jaddref structure when some work completes. This + * should only succeed once the entry is written and all dependencies have + * been notified. + */ +static void +free_jaddref(jaddref) + struct jaddref *jaddref; +{ + + if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + if (jaddref->ja_ref.if_jsegdep) + panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", + jaddref, jaddref->ja_state); + if (jaddref->ja_state & NEWBLOCK) + LIST_REMOVE(jaddref, ja_bmdeps); + if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) + panic("free_jaddref: Bad state %p(0x%X)", + jaddref, jaddref->ja_state); + if (jaddref->ja_mkdir != NULL) + panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); + WORKITEM_FREE(jaddref, D_JADDREF); +} + +/* + * Free a jremref structure once it has been written or discarded. 
+ */ +static void +free_jremref(jremref) + struct jremref *jremref; +{ + + if (jremref->jr_ref.if_jsegdep) + free_jsegdep(jremref->jr_ref.if_jsegdep); + if (jremref->jr_state & INPROGRESS) + panic("free_jremref: IO still pending"); + WORKITEM_FREE(jremref, D_JREMREF); +} + +/* + * Free a jnewblk structure. + */ +static void +free_jnewblk(jnewblk) + struct jnewblk *jnewblk; +{ + + if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(jnewblk, jn_deps); + if (jnewblk->jn_dep != NULL) + panic("free_jnewblk: Dependency still attached."); + WORKITEM_FREE(jnewblk, D_JNEWBLK); +} + +/* + * Cancel a jnewblk which has been been made redundant by frag extension. + */ +static void +cancel_jnewblk(jnewblk, wkhd) + struct jnewblk *jnewblk; + struct workhead *wkhd; +{ + struct jsegdep *jsegdep; + + CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno); + jsegdep = jnewblk->jn_jsegdep; + if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) + panic("cancel_jnewblk: Invalid state"); + jnewblk->jn_jsegdep = NULL; + jnewblk->jn_dep = NULL; + jnewblk->jn_state |= GOINGAWAY; + if (jnewblk->jn_state & INPROGRESS) { + jnewblk->jn_state &= ~INPROGRESS; + WORKLIST_REMOVE(&jnewblk->jn_list); + jwork_insert(wkhd, jsegdep); + } else { + free_jsegdep(jsegdep); + remove_from_journal(&jnewblk->jn_list); + } + wake_worklist(&jnewblk->jn_list); + WORKLIST_INSERT(wkhd, &jnewblk->jn_list); +} + +static void +free_jblkdep(jblkdep) + struct jblkdep *jblkdep; +{ + + if (jblkdep->jb_list.wk_type == D_JFREEBLK) + WORKITEM_FREE(jblkdep, D_JFREEBLK); + else if (jblkdep->jb_list.wk_type == D_JTRUNC) + WORKITEM_FREE(jblkdep, D_JTRUNC); + else + panic("free_jblkdep: Unexpected type %s", + TYPENAME(jblkdep->jb_list.wk_type)); +} + +/* + * Free a single jseg once it is no longer referenced in memory or on + * disk. Reclaim journal blocks and dependencies waiting for the segment + * to disappear. + */ +static void +free_jseg(jseg, jblocks) + struct jseg *jseg; + struct jblocks *jblocks; +{ + struct freework *freework; + + /* + * Free freework structures that were lingering to indicate freed + * indirect blocks that forced journal write ordering on reallocate. + */ + while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) + indirblk_remove(freework); + if (jblocks->jb_oldestseg == jseg) + jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); + TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); + jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); + KASSERT(LIST_EMPTY(&jseg->js_entries), + ("free_jseg: Freed jseg has valid entries.")); + WORKITEM_FREE(jseg, D_JSEG); +} + +/* + * Free all jsegs that meet the criteria for being reclaimed and update + * oldestseg. + */ +static void +free_jsegs(jblocks) + struct jblocks *jblocks; +{ + struct jseg *jseg; + + /* + * Free only those jsegs which have none allocated before them to + * preserve the journal space ordering. + */ + while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { + /* + * Only reclaim space when nothing depends on this journal + * set and another set has written that it is no longer + * valid. + */ + if (jseg->js_refs != 0) { + jblocks->jb_oldestseg = jseg; + return; + } + if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) + break; + if (jseg->js_seq > jblocks->jb_oldestwrseq) + break; + /* + * We can free jsegs that didn't write entries when + * oldestwrseq == js_seq. 
+ */ + if (jseg->js_seq == jblocks->jb_oldestwrseq && + jseg->js_cnt != 0) + break; + free_jseg(jseg, jblocks); + } + /* + * If we exited the loop above we still must discover the + * oldest valid segment. + */ + if (jseg) + for (jseg = jblocks->jb_oldestseg; jseg != NULL; + jseg = TAILQ_NEXT(jseg, js_next)) + if (jseg->js_refs != 0) + break; + jblocks->jb_oldestseg = jseg; + /* + * The journal has no valid records but some jsegs may still be + * waiting on oldestwrseq to advance. We force a small record + * out to permit these lingering records to be reclaimed. + */ + if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) + jblocks->jb_needseg = 1; +} + +/* + * Release one reference to a jseg and free it if the count reaches 0. This + * should eventually reclaim journal space as well. + */ +static void +rele_jseg(jseg) + struct jseg *jseg; +{ + + KASSERT(jseg->js_refs > 0, + ("free_jseg: Invalid refcnt %d", jseg->js_refs)); + if (--jseg->js_refs != 0) + return; + free_jsegs(jseg->js_jblocks); +} + +/* + * Release a jsegdep and decrement the jseg count. + */ +static void +free_jsegdep(jsegdep) + struct jsegdep *jsegdep; +{ + + if (jsegdep->jd_seg) + rele_jseg(jsegdep->jd_seg); + WORKITEM_FREE(jsegdep, D_JSEGDEP); +} + +/* + * Wait for a journal item to make it to disk. Initiate journal processing + * if required. + */ +static int +jwait(wk, waitfor) + struct worklist *wk; + int waitfor; +{ + + LOCK_OWNED(VFSTOUFS(wk->wk_mp)); + /* + * Blocking journal waits cause slow synchronous behavior. Record + * stats on the frequency of these blocking operations. + */ + if (waitfor == MNT_WAIT) { + stat_journal_wait++; + switch (wk->wk_type) { + case D_JREMREF: + case D_JMVREF: + stat_jwait_filepage++; + break; + case D_JTRUNC: + case D_JFREEBLK: + stat_jwait_freeblks++; + break; + case D_JNEWBLK: + stat_jwait_newblk++; + break; + case D_JADDREF: + stat_jwait_inode++; + break; + default: + break; + } + } + /* + * If IO has not started we process the journal. We can't mark the + * worklist item as IOWAITING because we drop the lock while + * processing the journal and the worklist entry may be freed after + * this point. The caller may call back in and re-issue the request. + */ + if ((wk->wk_state & INPROGRESS) == 0) { + softdep_process_journal(wk->wk_mp, wk, waitfor); + if (waitfor != MNT_WAIT) + return (EBUSY); + return (0); + } + if (waitfor != MNT_WAIT) + return (EBUSY); + wait_worklist(wk, "jwait"); + return (0); +} + +/* + * Lookup an inodedep based on an inode pointer and set the nlinkdelta as + * appropriate. This is a convenience function to reduce duplicate code + * for the setup and revert functions below. + */ +static struct inodedep * +inodedep_lookup_ip(ip) + struct inode *ip; +{ + struct inodedep *inodedep; + + KASSERT(ip->i_nlink >= ip->i_effnlink, + ("inodedep_lookup_ip: bad delta")); + (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, + &inodedep); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); + + return (inodedep); +} + +/* + * Called prior to creating a new inode and linking it to a directory. The + * jaddref structure must already be allocated by softdep_setup_inomapdep + * and it is discovered here so we can initialize the mode and update + * nlinkdelta. 
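+ * The jaddref is found at the tail of the inode's id_inoreflst, where
+ * softdep_setup_inomapdep() placed it when the inode was allocated.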
+ */ +void +softdep_setup_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_create called on non-softdep filesystem")); + KASSERT(ip->i_nlink == 1, + ("softdep_setup_create: Invalid link count.")); + dvp = ITOV(dp); + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_create: No addref structure present.")); + } + softdep_prelink(dvp, NULL); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Create a jaddref structure to track the addition of a DOTDOT link when + * we are reparenting an inode as part of a rename. This jaddref will be + * found by softdep_setup_directory_change. Adjusts nlinkdelta for + * non-journaling softdep. + */ +void +softdep_setup_dotdot_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_dotdot_link called on non-softdep filesystem")); + dvp = ITOV(dp); + jaddref = NULL; + /* + * We don't set MKDIR_PARENT as this is not tied to a mkdir and + * is used as a normal link would be. + */ + if (DOINGSUJ(dvp)) + jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(dp); + if (jaddref) + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Create a jaddref structure to track a new link to an inode. The directory + * offset is not known until softdep_setup_directory_add or + * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling + * softdep. + */ +void +softdep_setup_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_link called on non-softdep filesystem")); + dvp = ITOV(dp); + jaddref = NULL; + if (DOINGSUJ(dvp)) + jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, + ip->i_mode); + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(ip); + if (jaddref) + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to create the jaddref structures to track . and .. references as + * well as lookup and further initialize the incomplete jaddref created + * by softdep_setup_inomapdep when the inode was allocated. Adjusts + * nlinkdelta for non-journaling softdep. 
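+ * The "." reference is marked MKDIR_BODY and the ".." reference is marked
+ * MKDIR_PARENT so that they can later be matched up with the corresponding
+ * mkdir dependencies.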
+ */ +void +softdep_setup_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *dotdotaddref; + struct jaddref *dotaddref; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_mkdir called on non-softdep filesystem")); + dvp = ITOV(dp); + dotaddref = dotdotaddref = NULL; + if (DOINGSUJ(dvp)) { + dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, + ip->i_mode); + dotaddref->ja_state |= MKDIR_BODY; + dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, + dp->i_effnlink - 1, dp->i_mode); + dotdotaddref->ja_state |= MKDIR_PARENT; + } + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL, + ("softdep_setup_mkdir: No addref structure present.")); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_setup_mkdir: bad parent %ju", + (uintmax_t)jaddref->ja_parent)); + TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, + if_deps); + } + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, + &dotdotaddref->ja_ref, if_deps); + softdep_prelink(ITOV(dp), NULL); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlinking a directory. + */ +void +softdep_setup_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_rmdir called on non-softdep filesystem")); + dvp = ITOV(dp); + ACQUIRE_LOCK(ITOUMP(dp)); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to track nlinkdelta of the inode and parent directories prior to + * unlink. + */ +void +softdep_setup_unlink(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_setup_unlink called on non-softdep filesystem")); + dvp = ITOV(dp); + ACQUIRE_LOCK(ITOUMP(dp)); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + softdep_prelink(dvp, ITOV(ip)); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to release the journal structures created by a failed non-directory + * creation. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_create(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, + ("softdep_revert_create called on non-softdep filesystem")); + dvp = ITOV(dp); + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_create: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to release the journal structures created by a failed link + * addition. Adjusts nlinkdelta for non-journaling softdep. 
+ */ +void +softdep_revert_link(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_revert_link called on non-softdep filesystem")); + dvp = ITOV(dp); + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_link: addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to release the journal structures created by a failed mkdir + * attempt. Adjusts nlinkdelta for non-journaling softdep. + */ +void +softdep_revert_mkdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct jaddref *dotaddref; + struct vnode *dvp; + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_revert_mkdir called on non-softdep filesystem")); + dvp = ITOV(dp); + + ACQUIRE_LOCK(ITOUMP(dp)); + inodedep = inodedep_lookup_ip(dp); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dotdot addref parent mismatch")); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + } + inodedep = inodedep_lookup_ip(ip); + if (DOINGSUJ(dvp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref->ja_parent == dp->i_number, + ("softdep_revert_mkdir: addref parent mismatch")); + dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, + inoreflst, if_deps); + cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); + KASSERT(dotaddref->ja_parent == ip->i_number, + ("softdep_revert_mkdir: dot addref parent mismatch")); + cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); + } + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Called to correct nlinkdelta after a failed rmdir. + */ +void +softdep_revert_rmdir(dp, ip) + struct inode *dp; + struct inode *ip; +{ + + KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, + ("softdep_revert_rmdir called on non-softdep filesystem")); + ACQUIRE_LOCK(ITOUMP(dp)); + (void) inodedep_lookup_ip(ip); + (void) inodedep_lookup_ip(dp); + FREE_LOCK(ITOUMP(dp)); +} + +/* + * Protecting the freemaps (or bitmaps). + * + * To eliminate the need to execute fsck before mounting a filesystem + * after a power failure, one must (conservatively) guarantee that the + * on-disk copy of the bitmaps never indicate that a live inode or block is + * free. So, when a block or inode is allocated, the bitmap should be + * updated (on disk) before any new pointers. When a block or inode is + * freed, the bitmap should not be updated until all pointers have been + * reset. The latter dependency is handled by the delayed de-allocation + * approach described below for block and inode de-allocation. The former + * dependency is handled by calling the following procedure when a block or + * inode is allocated. When an inode is allocated an "inodedep" is created + * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. + * Each "inodedep" is also inserted into the hash indexing structure so + * that any additional link additions can be made dependent on the inode + * allocation. 
+ *
+ * The ufs filesystem maintains a number of free block counts (e.g., per
+ * cylinder group, per cylinder and per <cylinder, rotational position> pair)
+ * in addition to the bitmaps.  These counts are used to improve efficiency
+ * during allocation and therefore must be consistent with the bitmaps.
+ * There is no convenient way to guarantee post-crash consistency of these
+ * counts with simple update ordering, for two main reasons: (1) The counts
+ * and bitmaps for a single cylinder group block are not in the same disk
+ * sector.  If a disk write is interrupted (e.g., by power failure), one may
+ * be written and the other not.  (2) Some of the counts are located in the
+ * superblock rather than the cylinder group block.  So, we focus our soft
+ * updates implementation on protecting the bitmaps.  When mounting a
+ * filesystem, we recompute the auxiliary counts from the bitmaps.
+ */
+
+/*
+ * Called just after updating the cylinder group block to allocate an inode.
+ */
+void
+softdep_setup_inomapdep(bp, ip, newinum, mode)
+	struct buf *bp;		/* buffer for cylgroup block with inode map */
+	struct inode *ip;	/* inode related to allocation */
+	ino_t newinum;		/* new inode number being allocated */
+	int mode;
+{
+	struct inodedep *inodedep;
+	struct bmsafemap *bmsafemap;
+	struct jaddref *jaddref;
+	struct mount *mp;
+	struct fs *fs;
+
+	mp = ITOVFS(ip);
+	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
+	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
+	fs = VFSTOUFS(mp)->um_fs;
+	jaddref = NULL;
+
+	/*
+	 * Allocate the journal reference add structure so that the bitmap
+	 * can be dependent on it.
+	 */
+	if (MOUNTEDSUJ(mp)) {
+		jaddref = newjaddref(ip, newinum, 0, 0, mode);
+		jaddref->ja_state |= NEWBLOCK;
+	}
+
+	/*
+	 * Create a dependency for the newly allocated inode.
+	 * Panic if it already exists as something is seriously wrong.
+	 * Otherwise add it to the dependency list for the buffer holding
+	 * the cylinder group map from which it was allocated.
+	 *
+	 * We have to preallocate a bmsafemap entry in case it is needed
+	 * in bmsafemap_lookup since once we allocate the inodedep, we
+	 * have to finish initializing it before we can FREE_LOCK().
+	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
+	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
+	 * creating the inodedep as it can be freed during the time
+	 * that we FREE_LOCK() while allocating the inodedep. We must
+	 * call workitem_alloc() before entering the locked section as
+	 * it also acquires the lock and we must avoid trying to do so
+	 * recursively.
+	 */
+	bmsafemap = malloc(sizeof(struct bmsafemap),
+	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
+	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
+	ACQUIRE_LOCK(ITOUMP(ip));
+	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
+		panic("softdep_setup_inomapdep: dependency %p for new "
+		    "inode already exists", inodedep);
+	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
+	if (jaddref) {
+		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
+		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
+		    if_deps);
+	} else {
+		inodedep->id_state |= ONDEPLIST;
+		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
+	}
+	inodedep->id_bmsafemap = bmsafemap;
+	inodedep->id_state &= ~DEPCOMPLETE;
+	FREE_LOCK(ITOUMP(ip));
+}
+
+/*
+ * Called just after updating the cylinder group block to
+ * allocate block or fragment.
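+ * When journaling, a jnewblk is created here and attached to the newblk;
+ * softdep_setup_allocdirect() or softdep_setup_allocindir_page() later
+ * fills in the inode and logical block numbers and adds it to the journal,
+ * and handle_written_jnewblk() completes it once the record is stable.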
+ */ +void +softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) + struct buf *bp; /* buffer for cylgroup block with block map */ + struct mount *mp; /* filesystem doing allocation */ + ufs2_daddr_t newblkno; /* number of newly allocated block */ + int frags; /* Number of fragments. */ + int oldfrags; /* Previous number of fragments for extend. */ +{ + struct newblk *newblk; + struct bmsafemap *bmsafemap; + struct jnewblk *jnewblk; + struct ufsmount *ump; + struct fs *fs; + + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_blkmapdep called on non-softdep filesystem")); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + jnewblk = NULL; + /* + * Create a dependency for the newly allocated block. + * Add it to the dependency list for the buffer holding + * the cylinder group map from which it was allocated. + */ + if (MOUNTEDSUJ(mp)) { + jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); + jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); + jnewblk->jn_state = ATTACHED; + jnewblk->jn_blkno = newblkno; + jnewblk->jn_frags = frags; + jnewblk->jn_oldfrags = oldfrags; +#ifdef SUJ_DEBUG + { + struct cg *cgp; + uint8_t *blksfree; + long bno; + int i; + + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; + i++) { + if (isset(blksfree, bno + i)) + panic("softdep_setup_blkmapdep: " + "free fragment %d from %d-%d " + "state 0x%X dep %p", i, + jnewblk->jn_oldfrags, + jnewblk->jn_frags, + jnewblk->jn_state, + jnewblk->jn_dep); + } + } +#endif + } + + CTR3(KTR_SUJ, + "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", + newblkno, frags, oldfrags); + ACQUIRE_LOCK(ump); + if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) + panic("softdep_setup_blkmapdep: found block"); + newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, + dtog(fs, newblkno), NULL); + if (jnewblk) { + jnewblk->jn_dep = (struct worklist *)newblk; + LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); + } else { + newblk->nb_state |= ONDEPLIST; + LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); + } + newblk->nb_bmsafemap = bmsafemap; + newblk->nb_jnewblk = jnewblk; + FREE_LOCK(ump); +} + +#define BMSAFEMAP_HASH(ump, cg) \ + (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) + +static int +bmsafemap_find(bmsafemaphd, cg, bmsafemapp) + struct bmsafemap_hashhead *bmsafemaphd; + int cg; + struct bmsafemap **bmsafemapp; +{ + struct bmsafemap *bmsafemap; + + LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) + if (bmsafemap->sm_cg == cg) + break; + if (bmsafemap) { + *bmsafemapp = bmsafemap; + return (1); + } + *bmsafemapp = NULL; + + return (0); +} + +/* + * Find the bmsafemap associated with a cylinder group buffer. + * If none exists, create one. The buffer must be locked when + * this routine is called and this routine must be called with + * the softdep lock held. To avoid giving up the lock while + * allocating a new bmsafemap, a preallocated bmsafemap may be + * provided. If it is provided but not needed, it is freed. 
+ */ +static struct bmsafemap * +bmsafemap_lookup(mp, bp, cg, newbmsafemap) + struct mount *mp; + struct buf *bp; + int cg; + struct bmsafemap *newbmsafemap; +{ + struct bmsafemap_hashhead *bmsafemaphd; + struct bmsafemap *bmsafemap, *collision; + struct worklist *wk; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + if (wk->wk_type == D_BMSAFEMAP) { + if (newbmsafemap) + WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); + return (WK_BMSAFEMAP(wk)); + } + } + bmsafemaphd = BMSAFEMAP_HASH(ump, cg); + if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { + if (newbmsafemap) + WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); + return (bmsafemap); + } + if (newbmsafemap) { + bmsafemap = newbmsafemap; + } else { + FREE_LOCK(ump); + bmsafemap = malloc(sizeof(struct bmsafemap), + M_BMSAFEMAP, M_SOFTDEP_FLAGS); + workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); + ACQUIRE_LOCK(ump); + } + bmsafemap->sm_buf = bp; + LIST_INIT(&bmsafemap->sm_inodedephd); + LIST_INIT(&bmsafemap->sm_inodedepwr); + LIST_INIT(&bmsafemap->sm_newblkhd); + LIST_INIT(&bmsafemap->sm_newblkwr); + LIST_INIT(&bmsafemap->sm_jaddrefhd); + LIST_INIT(&bmsafemap->sm_jnewblkhd); + LIST_INIT(&bmsafemap->sm_freehd); + LIST_INIT(&bmsafemap->sm_freewr); + if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (collision); + } + bmsafemap->sm_cg = cg; + LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); + LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); + WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); + return (bmsafemap); +} + +/* + * Direct block allocation dependencies. + * + * When a new block is allocated, the corresponding disk locations must be + * initialized (with zeros or new data) before the on-disk inode points to + * them. Also, the freemap from which the block was allocated must be + * updated (on disk) before the inode's pointer. These two dependencies are + * independent of each other and are needed for all file blocks and indirect + * blocks that are pointed to directly by the inode. Just before the + * "in-core" version of the inode is updated with a newly allocated block + * number, a procedure (below) is called to setup allocation dependency + * structures. These structures are removed when the corresponding + * dependencies are satisfied or when the block allocation becomes obsolete + * (i.e., the file is deleted, the block is de-allocated, or the block is a + * fragment that gets upgraded). All of these cases are handled in + * procedures described later. + * + * When a file extension causes a fragment to be upgraded, either to a larger + * fragment or to a full block, the on-disk location may change (if the + * previous fragment could not simply be extended). In this case, the old + * fragment must be de-allocated, but not until after the inode's pointer has + * been updated. In most cases, this is handled by later procedures, which + * will construct a "freefrag" structure to be added to the workitem queue + * when the inode update is complete (or obsolete). The main exception to + * this is when an allocation occurs while a pending allocation dependency + * (for the same block pointer) remains. This case is handled in the main + * allocation dependency setup procedure by immediately freeing the + * unreferenced fragments. 
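+ *
+ * As a concrete illustration, extending a file by one full block via
+ * write(2) proceeds roughly as follows: the block is allocated and
+ * softdep_setup_blkmapdep() records it against the cylinder group buffer;
+ * balloc then calls softdep_setup_allocdirect() (below) just before the
+ * in-core inode pointer is updated; and the new pointer is only permitted
+ * to reach the disk once both the bitmap write and the write of the
+ * block's contents have completed.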
+ */ +void +softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; /* inode to which block is being added */ + ufs_lbn_t off; /* block pointer within inode */ + ufs2_daddr_t newblkno; /* disk block number being added */ + ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ + long newsize; /* size of new block */ + long oldsize; /* size of new block */ + struct buf *bp; /* bp for allocated block */ +{ + struct allocdirect *adp, *oldadp; + struct allocdirectlst *adphead; + struct freefrag *freefrag; + struct inodedep *inodedep; + struct pagedep *pagedep; + struct jnewblk *jnewblk; + struct newblk *newblk; + struct mount *mp; + ufs_lbn_t lbn; + + lbn = bp->b_lblkno; + mp = ITOVFS(ip); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_allocdirect called on non-softdep filesystem")); + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + else + freefrag = NULL; + + CTR6(KTR_SUJ, + "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " + "off %jd newsize %ld oldsize %d", + ip->i_number, newblkno, oldblkno, off, newsize, oldsize); + ACQUIRE_LOCK(ITOUMP(ip)); + if (off >= NDADDR) { + if (lbn > 0) + panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", + lbn, off); + /* allocating an indirect block */ + if (oldblkno != 0) + panic("softdep_setup_allocdirect: non-zero indir"); + } else { + if (off != lbn) + panic("softdep_setup_allocdirect: lbn %jd != off %jd", + lbn, off); + /* + * Allocating a direct block. + * + * If we are allocating a directory block, then we must + * allocate an associated pagedep to track additions and + * deletions. + */ + if ((ip->i_mode & IFMT) == IFDIR) + pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, + &pagedep); + } + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) + panic("softdep_setup_allocdirect: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocdirect: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + + /* + * Finish initializing the journal. + */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jdep != NULL && + freefrag->ff_jdep->wk_type == D_JFREEFRAG) + add_to_journal(freefrag->ff_jdep); + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + adp->ad_inodedep = inodedep; + + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); + /* + * The list of allocdirects must be kept in sorted and ascending + * order so that the rollback routines can quickly determine the + * first uncommitted block (the size of the file stored on disk + * ends at the end of the lowest committed fragment, or if there + * are no fragments, at the end of the highest committed block). + * Since files generally grow, the typical case is that the new + * block is to be added at the end of the list. We speed this + * special case by checking against the last allocdirect in the + * list before laboriously traversing the list looking for the + * insertion point. 
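+ * (For example, if an inode has committed allocdirects for blocks 0-2
+ * and an uncommitted one for block 3, the sorted order lets the rollback
+ * code stop at the first uncommitted entry rather than scanning the
+ * whole list.)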
+ */ + adphead = &inodedep->id_newinoupdt; + oldadp = TAILQ_LAST(adphead, allocdirectlst); + if (oldadp == NULL || oldadp->ad_offset <= off) { + /* insert at end of list */ + TAILQ_INSERT_TAIL(adphead, adp, ad_next); + if (oldadp != NULL && oldadp->ad_offset == off) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(ITOUMP(ip)); + return; + } + TAILQ_FOREACH(oldadp, adphead, ad_next) { + if (oldadp->ad_offset >= off) + break; + } + if (oldadp == NULL) + panic("softdep_setup_allocdirect: lost entry"); + /* insert in middle of list */ + TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); + if (oldadp->ad_offset == off) + allocdirect_merge(adphead, adp, oldadp); + + FREE_LOCK(ITOUMP(ip)); +} + +/* + * Merge a newer and older journal record to be stored either in a + * newblock or freefrag. This handles aggregating journal records for + * fragment allocation into a second record as well as replacing a + * journal free with an aborted journal allocation. A segment for the + * oldest record will be placed on wkhd if it has been written. If not + * the segment for the newer record will suffice. + */ +static struct worklist * +jnewblk_merge(new, old, wkhd) + struct worklist *new; + struct worklist *old; + struct workhead *wkhd; +{ + struct jnewblk *njnewblk; + struct jnewblk *jnewblk; + + /* Handle NULLs to simplify callers. */ + if (new == NULL) + return (old); + if (old == NULL) + return (new); + /* Replace a jfreefrag with a jnewblk. */ + if (new->wk_type == D_JFREEFRAG) { + if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) + panic("jnewblk_merge: blkno mismatch: %p, %p", + old, new); + cancel_jfreefrag(WK_JFREEFRAG(new)); + return (old); + } + if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) + panic("jnewblk_merge: Bad type: old %d new %d\n", + old->wk_type, new->wk_type); + /* + * Handle merging of two jnewblk records that describe + * different sets of fragments in the same block. + */ + jnewblk = WK_JNEWBLK(old); + njnewblk = WK_JNEWBLK(new); + if (jnewblk->jn_blkno != njnewblk->jn_blkno) + panic("jnewblk_merge: Merging disparate blocks."); + /* + * The record may be rolled back in the cg. + */ + if (jnewblk->jn_state & UNDONE) { + jnewblk->jn_state &= ~UNDONE; + njnewblk->jn_state |= UNDONE; + njnewblk->jn_state &= ~ATTACHED; + } + /* + * We modify the newer addref and free the older so that if neither + * has been written the most up-to-date copy will be on disk. If + * both have been written but rolled back we only temporarily need + * one of them to fix the bits when the cg write completes. + */ + jnewblk->jn_state |= ATTACHED | COMPLETE; + njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; + cancel_jnewblk(jnewblk, wkhd); + WORKLIST_REMOVE(&jnewblk->jn_list); + free_jnewblk(jnewblk); + return (new); +} + +/* + * Replace an old allocdirect dependency with a newer one. + * This routine must be called with splbio interrupts blocked. 
+ */ +static void +allocdirect_merge(adphead, newadp, oldadp) + struct allocdirectlst *adphead; /* head of list holding allocdirects */ + struct allocdirect *newadp; /* allocdirect being added */ + struct allocdirect *oldadp; /* existing allocdirect being checked */ +{ + struct worklist *wk; + struct freefrag *freefrag; + + freefrag = NULL; + LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); + if (newadp->ad_oldblkno != oldadp->ad_newblkno || + newadp->ad_oldsize != oldadp->ad_newsize || + newadp->ad_offset >= NDADDR) + panic("%s %jd != new %jd || old size %ld != new %ld", + "allocdirect_merge: old blkno", + (intmax_t)newadp->ad_oldblkno, + (intmax_t)oldadp->ad_newblkno, + newadp->ad_oldsize, oldadp->ad_newsize); + newadp->ad_oldblkno = oldadp->ad_oldblkno; + newadp->ad_oldsize = oldadp->ad_oldsize; + /* + * If the old dependency had a fragment to free or had never + * previously had a block allocated, then the new dependency + * can immediately post its freefrag and adopt the old freefrag. + * This action is done by swapping the freefrag dependencies. + * The new dependency gains the old one's freefrag, and the + * old one gets the new one and then immediately puts it on + * the worklist when it is freed by free_newblk. It is + * not possible to do this swap when the old dependency had a + * non-zero size but no previous fragment to free. This condition + * arises when the new block is an extension of the old block. + * Here, the first part of the fragment allocated to the new + * dependency is part of the block currently claimed on disk by + * the old dependency, so cannot legitimately be freed until the + * conditions for the new dependency are fulfilled. + */ + freefrag = newadp->ad_freefrag; + if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { + newadp->ad_freefrag = oldadp->ad_freefrag; + oldadp->ad_freefrag = freefrag; + } + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocdirect to the new allocdirect. + */ + if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { + WORKLIST_REMOVE(wk); + if (!LIST_EMPTY(&oldadp->ad_newdirblk)) + panic("allocdirect_merge: extra newdirblk"); + WORKLIST_INSERT(&newadp->ad_newdirblk, wk); + } + TAILQ_REMOVE(adphead, oldadp, ad_next); + /* + * We need to move any journal dependencies over to the freefrag + * that releases this block if it exists. Otherwise we are + * extending an existing block and we'll wait until that is + * complete to release the journal space and extend the + * new journal to cover this old space as well. + */ + if (freefrag == NULL) { + if (oldadp->ad_newblkno != newadp->ad_newblkno) + panic("allocdirect_merge: %jd != %jd", + oldadp->ad_newblkno, newadp->ad_newblkno); + newadp->ad_block.nb_jnewblk = (struct jnewblk *) + jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, + &oldadp->ad_block.nb_jnewblk->jn_list, + &newadp->ad_block.nb_jwork); + oldadp->ad_block.nb_jnewblk = NULL; + cancel_newblk(&oldadp->ad_block, NULL, + &newadp->ad_block.nb_jwork); + } else { + wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, + &freefrag->ff_list, &freefrag->ff_jwork); + freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, + &freefrag->ff_jwork); + } + free_newblk(&oldadp->ad_block); +} + +/* + * Allocate a jfreefrag structure to journal a single block free. 
+ */ +static struct jfreefrag * +newjfreefrag(freefrag, ip, blkno, size, lbn) + struct freefrag *freefrag; + struct inode *ip; + ufs2_daddr_t blkno; + long size; + ufs_lbn_t lbn; +{ + struct jfreefrag *jfreefrag; + struct fs *fs; + + fs = ITOFS(ip); + jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, + M_SOFTDEP_FLAGS); + workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); + jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); + jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; + jfreefrag->fr_ino = ip->i_number; + jfreefrag->fr_lbn = lbn; + jfreefrag->fr_blkno = blkno; + jfreefrag->fr_frags = numfrags(fs, size); + jfreefrag->fr_freefrag = freefrag; + + return (jfreefrag); +} + +/* + * Allocate a new freefrag structure. + */ +static struct freefrag * +newfreefrag(ip, blkno, size, lbn) + struct inode *ip; + ufs2_daddr_t blkno; + long size; + ufs_lbn_t lbn; +{ + struct freefrag *freefrag; + struct ufsmount *ump; + struct fs *fs; + + CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", + ip->i_number, blkno, size, lbn); + ump = ITOUMP(ip); + fs = ump->um_fs; + if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) + panic("newfreefrag: frag size"); + freefrag = malloc(sizeof(struct freefrag), + M_FREEFRAG, M_SOFTDEP_FLAGS); + workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); + freefrag->ff_state = ATTACHED; + LIST_INIT(&freefrag->ff_jwork); + freefrag->ff_inum = ip->i_number; + freefrag->ff_vtype = ITOV(ip)->v_type; + freefrag->ff_blkno = blkno; + freefrag->ff_fragsize = size; + + if (MOUNTEDSUJ(UFSTOVFS(ump))) { + freefrag->ff_jdep = (struct worklist *) + newjfreefrag(freefrag, ip, blkno, size, lbn); + } else { + freefrag->ff_state |= DEPCOMPLETE; + freefrag->ff_jdep = NULL; + } + + return (freefrag); +} + +/* + * This workitem de-allocates fragments that were replaced during + * file block allocation. + */ +static void +handle_workitem_freefrag(freefrag) + struct freefrag *freefrag; +{ + struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); + struct workhead wkhd; + + CTR3(KTR_SUJ, + "handle_workitem_freefrag: ino %d blkno %jd size %ld", + freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); + /* + * It would be illegal to add new completion items to the + * freefrag after it was schedule to be done so it must be + * safe to modify the list head here. + */ + LIST_INIT(&wkhd); + ACQUIRE_LOCK(ump); + LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); + /* + * If the journal has not been written we must cancel it here. + */ + if (freefrag->ff_jdep) { + if (freefrag->ff_jdep->wk_type != D_JNEWBLK) + panic("handle_workitem_freefrag: Unexpected type %d\n", + freefrag->ff_jdep->wk_type); + cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); + } + FREE_LOCK(ump); + ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, + freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); + ACQUIRE_LOCK(ump); + WORKITEM_FREE(freefrag, D_FREEFRAG); + FREE_LOCK(ump); +} + +/* + * Set up a dependency structure for an external attributes data block. + * This routine follows much of the structure of softdep_setup_allocdirect. + * See the description of softdep_setup_allocdirect above for details. 
+ */ +void +softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) + struct inode *ip; + ufs_lbn_t off; + ufs2_daddr_t newblkno; + ufs2_daddr_t oldblkno; + long newsize; + long oldsize; + struct buf *bp; +{ + struct allocdirect *adp, *oldadp; + struct allocdirectlst *adphead; + struct freefrag *freefrag; + struct inodedep *inodedep; + struct jnewblk *jnewblk; + struct newblk *newblk; + struct mount *mp; + struct ufsmount *ump; + ufs_lbn_t lbn; + + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_allocext called on non-softdep filesystem")); + KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR", + (long long)off)); + + lbn = bp->b_lblkno; + if (oldblkno && oldblkno != newblkno) + freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); + else + freefrag = NULL; + + ACQUIRE_LOCK(ump); + if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) + panic("softdep_setup_allocext: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("softdep_setup_allocext: newblk already initialized")); + /* + * Convert the newblk to an allocdirect. + */ + WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); + adp = (struct allocdirect *)newblk; + newblk->nb_freefrag = freefrag; + adp->ad_offset = off; + adp->ad_oldblkno = oldblkno; + adp->ad_newsize = newsize; + adp->ad_oldsize = oldsize; + adp->ad_state |= EXTDATA; + + /* + * Finish initializing the journal. + */ + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jdep != NULL && + freefrag->ff_jdep->wk_type == D_JFREEFRAG) + add_to_journal(freefrag->ff_jdep); + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + adp->ad_inodedep = inodedep; + + WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); + /* + * The list of allocdirects must be kept in sorted and ascending + * order so that the rollback routines can quickly determine the + * first uncommitted block (the size of the file stored on disk + * ends at the end of the lowest committed fragment, or if there + * are no fragments, at the end of the highest committed block). + * Since files generally grow, the typical case is that the new + * block is to be added at the end of the list. We speed this + * special case by checking against the last allocdirect in the + * list before laboriously traversing the list looking for the + * insertion point. + */ + adphead = &inodedep->id_newextupdt; + oldadp = TAILQ_LAST(adphead, allocdirectlst); + if (oldadp == NULL || oldadp->ad_offset <= off) { + /* insert at end of list */ + TAILQ_INSERT_TAIL(adphead, adp, ad_next); + if (oldadp != NULL && oldadp->ad_offset == off) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(ump); + return; + } + TAILQ_FOREACH(oldadp, adphead, ad_next) { + if (oldadp->ad_offset >= off) + break; + } + if (oldadp == NULL) + panic("softdep_setup_allocext: lost entry"); + /* insert in middle of list */ + TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); + if (oldadp->ad_offset == off) + allocdirect_merge(adphead, adp, oldadp); + FREE_LOCK(ump); +} + +/* + * Indirect block allocation dependencies. + * + * The same dependencies that exist for a direct block also exist when + * a new block is allocated and pointed to by an entry in a block of + * indirect pointers. The undo/redo states described above are also + * used here. 
Because an indirect block contains many pointers that + * may have dependencies, a second copy of the entire in-memory indirect + * block is kept. The buffer cache copy is always completely up-to-date. + * The second copy, which is used only as a source for disk writes, + * contains only the safe pointers (i.e., those that have no remaining + * update dependencies). The second copy is freed when all pointers + * are safe. The cache is not allowed to replace indirect blocks with + * pending update dependencies. If a buffer containing an indirect + * block with dependencies is written, these routines will mark it + * dirty again. It can only be successfully written once all the + * dependencies are removed. The ffs_fsync routine in conjunction with + * softdep_sync_metadata work together to get all the dependencies + * removed so that a file can be successfully written to disk. Three + * procedures are used when setting up indirect block pointer + * dependencies. The division is necessary because of the organization + * of the "balloc" routine and because of the distinction between file + * pages and file metadata blocks. + */ + +/* + * Allocate a new allocindir structure. + */ +static struct allocindir * +newallocindir(ip, ptrno, newblkno, oldblkno, lbn) + struct inode *ip; /* inode for file being extended */ + int ptrno; /* offset of pointer in indirect block */ + ufs2_daddr_t newblkno; /* disk block number being added */ + ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + ufs_lbn_t lbn; +{ + struct newblk *newblk; + struct allocindir *aip; + struct freefrag *freefrag; + struct jnewblk *jnewblk; + + if (oldblkno) + freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn); + else + freefrag = NULL; + ACQUIRE_LOCK(ITOUMP(ip)); + if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) + panic("new_allocindir: lost block"); + KASSERT(newblk->nb_list.wk_type == D_NEWBLK, + ("newallocindir: newblk already initialized")); + WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); + newblk->nb_freefrag = freefrag; + aip = (struct allocindir *)newblk; + aip->ai_offset = ptrno; + aip->ai_oldblkno = oldblkno; + aip->ai_lbn = lbn; + if ((jnewblk = newblk->nb_jnewblk) != NULL) { + jnewblk->jn_ino = ip->i_number; + jnewblk->jn_lbn = lbn; + add_to_journal(&jnewblk->jn_list); + } + if (freefrag && freefrag->ff_jdep != NULL && + freefrag->ff_jdep->wk_type == D_JFREEFRAG) + add_to_journal(freefrag->ff_jdep); + return (aip); +} + +/* + * Called just before setting an indirect block pointer + * to a newly allocated file page. 
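+ * When the page being added belongs to a directory, a pagedep is also set
+ * up (see the IFDIR check below) to track entry additions and removals.
+ *
+ * A minimal usage sketch from a balloc-style caller; vp is the file's
+ * vnode, and the names ibp (indirect block buffer) and nbp (new page
+ * buffer) are illustrative assumptions rather than code taken from a
+ * real caller:
+ *
+ *	if (DOINGSOFTDEP(vp))
+ *		softdep_setup_allocindir_page(ip, lbn, ibp, ptrno,
+ *		    newblkno, oldblkno, nbp);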
+ */ +void +softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) + struct inode *ip; /* inode for file being extended */ + ufs_lbn_t lbn; /* allocated block number within file */ + struct buf *bp; /* buffer with indirect blk referencing page */ + int ptrno; /* offset of pointer in indirect block */ + ufs2_daddr_t newblkno; /* disk block number being added */ + ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ + struct buf *nbp; /* buffer holding allocated page */ +{ + struct inodedep *inodedep; + struct freefrag *freefrag; + struct allocindir *aip; + struct pagedep *pagedep; + struct mount *mp; + struct ufsmount *ump; + + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_allocindir_page called on non-softdep filesystem")); + KASSERT(lbn == nbp->b_lblkno, + ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", + lbn, bp->b_lblkno)); + CTR4(KTR_SUJ, + "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " + "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); + ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); + aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + /* + * If we are allocating a directory page, then we must + * allocate an associated pagedep to track additions and + * deletions. + */ + if ((ip->i_mode & IFMT) == IFDIR) + pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); + FREE_LOCK(ump); + if (freefrag) + handle_workitem_freefrag(freefrag); +} + +/* + * Called just before setting an indirect block pointer to a + * newly allocated indirect block. + */ +void +softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) + struct buf *nbp; /* newly allocated indirect block */ + struct inode *ip; /* inode for file being extended */ + struct buf *bp; /* indirect block referencing allocated block */ + int ptrno; /* offset of pointer in indirect block */ + ufs2_daddr_t newblkno; /* disk block number being added */ +{ + struct inodedep *inodedep; + struct allocindir *aip; + struct ufsmount *ump; + ufs_lbn_t lbn; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_setup_allocindir_meta called on non-softdep filesystem")); + CTR3(KTR_SUJ, + "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", + ip->i_number, newblkno, ptrno); + lbn = nbp->b_lblkno; + ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); + aip = newallocindir(ip, ptrno, newblkno, 0, lbn); + inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); + WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); + if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) + panic("softdep_setup_allocindir_meta: Block already existed"); + FREE_LOCK(ump); +} + +static void +indirdep_complete(indirdep) + struct indirdep *indirdep; +{ + struct allocindir *aip; + + LIST_REMOVE(indirdep, ir_next); + indirdep->ir_state |= DEPCOMPLETE; + + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + free_newblk(&aip->ai_block); + } + /* + * If this indirdep is not attached to a buf it was simply waiting + * on completion to clear completehd. free_indirdep() asserts + * that nothing is dangling. 
+ */ + if ((indirdep->ir_state & ONWORKLIST) == 0) + free_indirdep(indirdep); +} + +static struct indirdep * +indirdep_lookup(mp, ip, bp) + struct mount *mp; + struct inode *ip; + struct buf *bp; +{ + struct indirdep *indirdep, *newindirdep; + struct newblk *newblk; + struct ufsmount *ump; + struct worklist *wk; + struct fs *fs; + ufs2_daddr_t blkno; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + indirdep = NULL; + newindirdep = NULL; + fs = ump->um_fs; + for (;;) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + if (wk->wk_type != D_INDIRDEP) + continue; + indirdep = WK_INDIRDEP(wk); + break; + } + /* Found on the buffer worklist, no new structure to free. */ + if (indirdep != NULL && newindirdep == NULL) + return (indirdep); + if (indirdep != NULL && newindirdep != NULL) + panic("indirdep_lookup: simultaneous create"); + /* None found on the buffer and a new structure is ready. */ + if (indirdep == NULL && newindirdep != NULL) + break; + /* None found and no new structure available. */ + FREE_LOCK(ump); + newindirdep = malloc(sizeof(struct indirdep), + M_INDIRDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); + newindirdep->ir_state = ATTACHED; + if (I_IS_UFS1(ip)) + newindirdep->ir_state |= UFS1FMT; + TAILQ_INIT(&newindirdep->ir_trunc); + newindirdep->ir_saveddata = NULL; + LIST_INIT(&newindirdep->ir_deplisthd); + LIST_INIT(&newindirdep->ir_donehd); + LIST_INIT(&newindirdep->ir_writehd); + LIST_INIT(&newindirdep->ir_completehd); + if (bp->b_blkno == bp->b_lblkno) { + ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, + NULL, NULL); + bp->b_blkno = blkno; + } + newindirdep->ir_freeblks = NULL; + newindirdep->ir_savebp = + getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); + newindirdep->ir_bp = bp; + BUF_KERNPROC(newindirdep->ir_savebp); + bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); + ACQUIRE_LOCK(ump); + } + indirdep = newindirdep; + WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); + /* + * If the block is not yet allocated we don't set DEPCOMPLETE so + * that we don't free dependencies until the pointers are valid. + * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather + * than using the hash. + */ + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) + LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); + else + indirdep->ir_state |= DEPCOMPLETE; + return (indirdep); +} + +/* + * Called to finish the allocation of the "aip" allocated + * by one of the two routines above. + */ +static struct freefrag * +setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) + struct buf *bp; /* in-memory copy of the indirect block */ + struct inode *ip; /* inode for file being extended */ + struct inodedep *inodedep; /* Inodedep for ip */ + struct allocindir *aip; /* allocindir allocated by the above routines */ + ufs_lbn_t lbn; /* Logical block number for this block. */ +{ + struct fs *fs; + struct indirdep *indirdep; + struct allocindir *oldaip; + struct freefrag *freefrag; + struct mount *mp; + struct ufsmount *ump; + + mp = ITOVFS(ip); + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + fs = ump->um_fs; + if (bp->b_lblkno >= 0) + panic("setup_allocindir_phase2: not indir blk"); + KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), + ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); + indirdep = indirdep_lookup(mp, ip, bp); + KASSERT(indirdep->ir_savebp != NULL, + ("setup_allocindir_phase2 NULL ir_savebp")); + aip->ai_indirdep = indirdep; + /* + * Check for an unwritten dependency for this indirect offset. 
If + * there is, merge the old dependency into the new one. This happens + * as a result of reallocblk only. + */ + freefrag = NULL; + if (aip->ai_oldblkno != 0) { + LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { + if (oldaip->ai_offset == aip->ai_offset) { + freefrag = allocindir_merge(aip, oldaip); + goto done; + } + } + LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { + if (oldaip->ai_offset == aip->ai_offset) { + freefrag = allocindir_merge(aip, oldaip); + goto done; + } + } + } +done: + LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); + return (freefrag); +} + +/* + * Merge two allocindirs which refer to the same block. Move newblock + * dependencies and setup the freefrags appropriately. + */ +static struct freefrag * +allocindir_merge(aip, oldaip) + struct allocindir *aip; + struct allocindir *oldaip; +{ + struct freefrag *freefrag; + struct worklist *wk; + + if (oldaip->ai_newblkno != aip->ai_oldblkno) + panic("allocindir_merge: blkno"); + aip->ai_oldblkno = oldaip->ai_oldblkno; + freefrag = aip->ai_freefrag; + aip->ai_freefrag = oldaip->ai_freefrag; + oldaip->ai_freefrag = NULL; + KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); + /* + * If we are tracking a new directory-block allocation, + * move it from the old allocindir to the new allocindir. + */ + if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { + WORKLIST_REMOVE(wk); + if (!LIST_EMPTY(&oldaip->ai_newdirblk)) + panic("allocindir_merge: extra newdirblk"); + WORKLIST_INSERT(&aip->ai_newdirblk, wk); + } + /* + * We can skip journaling for this freefrag and just complete + * any pending journal work for the allocindir that is being + * removed after the freefrag completes. + */ + if (freefrag->ff_jdep) + cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); + LIST_REMOVE(oldaip, ai_next); + freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, + &freefrag->ff_list, &freefrag->ff_jwork); + free_newblk(&oldaip->ai_block); + + return (freefrag); +} + +static inline void +setup_freedirect(freeblks, ip, i, needj) + struct freeblks *freeblks; + struct inode *ip; + int i; + int needj; +{ + struct ufsmount *ump; + ufs2_daddr_t blkno; + int frags; + + blkno = DIP(ip, i_db[i]); + if (blkno == 0) + return; + DIP_SET(ip, i_db[i], 0); + ump = ITOUMP(ip); + frags = sblksize(ump->um_fs, ip->i_size, i); + frags = numfrags(ump->um_fs, frags); + newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); +} + +static inline void +setup_freeext(freeblks, ip, i, needj) + struct freeblks *freeblks; + struct inode *ip; + int i; + int needj; +{ + struct ufsmount *ump; + ufs2_daddr_t blkno; + int frags; + + blkno = ip->i_din2->di_extb[i]; + if (blkno == 0) + return; + ip->i_din2->di_extb[i] = 0; + ump = ITOUMP(ip); + frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); + frags = numfrags(ump->um_fs, frags); + newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); +} + +static inline void +setup_freeindir(freeblks, ip, i, lbn, needj) + struct freeblks *freeblks; + struct inode *ip; + int i; + ufs_lbn_t lbn; + int needj; +{ + struct ufsmount *ump; + ufs2_daddr_t blkno; + + blkno = DIP(ip, i_ib[i]); + if (blkno == 0) + return; + DIP_SET(ip, i_ib[i], 0); + ump = ITOUMP(ip); + newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, + 0, needj); +} + +static inline struct freeblks * +newfreeblks(mp, ip) + struct mount *mp; + struct inode *ip; +{ + struct freeblks *freeblks; + + freeblks = malloc(sizeof(struct freeblks), + M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); + 
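+	/*
+	 * M_ZERO above clears the structure, including the quota
+	 * references used under #ifdef QUOTA; only the fields that need
+	 * non-zero values are filled in explicitly below.
+	 */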
workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); + LIST_INIT(&freeblks->fb_jblkdephd); + LIST_INIT(&freeblks->fb_jwork); + freeblks->fb_ref = 0; + freeblks->fb_cgwait = 0; + freeblks->fb_state = ATTACHED; + freeblks->fb_uid = ip->i_uid; + freeblks->fb_inum = ip->i_number; + freeblks->fb_vtype = ITOV(ip)->v_type; + freeblks->fb_modrev = DIP(ip, i_modrev); + freeblks->fb_devvp = ITODEVVP(ip); + freeblks->fb_chkcnt = 0; + freeblks->fb_len = 0; + + return (freeblks); +} + +static void +trunc_indirdep(indirdep, freeblks, bp, off) + struct indirdep *indirdep; + struct freeblks *freeblks; + struct buf *bp; + int off; +{ + struct allocindir *aip, *aipn; + + /* + * The first set of allocindirs won't be in savedbp. + */ + LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) + if (aip->ai_offset > off) + cancel_allocindir(aip, bp, freeblks, 1); + LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) + if (aip->ai_offset > off) + cancel_allocindir(aip, bp, freeblks, 1); + /* + * These will exist in savedbp. + */ + LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) + if (aip->ai_offset > off) + cancel_allocindir(aip, NULL, freeblks, 0); + LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) + if (aip->ai_offset > off) + cancel_allocindir(aip, NULL, freeblks, 0); +} + +/* + * Follow the chain of indirects down to lastlbn creating a freework + * structure for each. This will be used to start indir_trunc() at + * the right offset and create the journal records for the parrtial + * truncation. A second step will handle the truncated dependencies. + */ +static int +setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) + struct freeblks *freeblks; + struct inode *ip; + ufs_lbn_t lbn; + ufs_lbn_t lastlbn; + ufs2_daddr_t blkno; +{ + struct indirdep *indirdep; + struct indirdep *indirn; + struct freework *freework; + struct newblk *newblk; + struct mount *mp; + struct ufsmount *ump; + struct buf *bp; + uint8_t *start; + uint8_t *end; + ufs_lbn_t lbnadd; + int level; + int error; + int off; + + + freework = NULL; + if (blkno == 0) + return (0); + mp = freeblks->fb_list.wk_mp; + ump = VFSTOUFS(mp); + bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); + if ((bp->b_flags & B_CACHE) == 0) { + bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; + error = bufwait(bp); + if (error) { + brelse(bp); + return (error); + } + } + level = lbn_level(lbn); + lbnadd = lbn_offset(ump->um_fs, level); + /* + * Compute the offset of the last block we want to keep. Store + * in the freework the first block we want to completely free. + */ + off = (lastlbn - -(lbn + level)) / lbnadd; + if (off + 1 == NINDIR(ump->um_fs)) + goto nowork; + freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0); + /* + * Link the freework into the indirdep. This will prevent any new + * allocations from proceeding until we are finished with the + * truncate and the block is written. 
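+ * The pointer index passed to newfreework() above (off + 1) records where
+ * the zeroed region of this indirect begins; complete_trunc_indir() later
+ * zeroes the saved copy starting from that same offset (fw_start).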
+ */ + ACQUIRE_LOCK(ump); + indirdep = indirdep_lookup(mp, ip, bp); + if (indirdep->ir_freeblks) + panic("setup_trunc_indir: indirdep already truncated."); + TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); + freework->fw_indir = indirdep; + /* + * Cancel any allocindirs that will not make it to disk. + * We have to do this for all copies of the indirdep that + * live on this newblk. + */ + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk); + LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) + trunc_indirdep(indirn, freeblks, bp, off); + } else + trunc_indirdep(indirdep, freeblks, bp, off); + FREE_LOCK(ump); + /* + * Creation is protected by the buf lock. The saveddata is only + * needed if a full truncation follows a partial truncation but it + * is difficult to allocate in that case so we fetch it anyway. + */ + if (indirdep->ir_saveddata == NULL) + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); +nowork: + /* Fetch the blkno of the child and the zero start offset. */ + if (I_IS_UFS1(ip)) { + blkno = ((ufs1_daddr_t *)bp->b_data)[off]; + start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; + } else { + blkno = ((ufs2_daddr_t *)bp->b_data)[off]; + start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; + } + if (freework) { + /* Zero the truncated pointers. */ + end = bp->b_data + bp->b_bcount; + bzero(start, end - start); + bdwrite(bp); + } else + bqrelse(bp); + if (level == 0) + return (0); + lbn++; /* adjust level */ + lbn -= (off * lbnadd); + return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); +} + +/* + * Complete the partial truncation of an indirect block setup by + * setup_trunc_indir(). This zeros the truncated pointers in the saved + * copy and writes them to disk before the freeblks is allowed to complete. + */ +static void +complete_trunc_indir(freework) + struct freework *freework; +{ + struct freework *fwn; + struct indirdep *indirdep; + struct ufsmount *ump; + struct buf *bp; + uintptr_t start; + int count; + + ump = VFSTOUFS(freework->fw_list.wk_mp); + LOCK_OWNED(ump); + indirdep = freework->fw_indir; + for (;;) { + bp = indirdep->ir_bp; + /* See if the block was discarded. */ + if (bp == NULL) + break; + /* Inline part of getdirtybuf(). We dont want bremfree. */ + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) + break; + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + LOCK_PTR(ump)) == 0) + BUF_UNLOCK(bp); + ACQUIRE_LOCK(ump); + } + freework->fw_state |= DEPCOMPLETE; + TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); + /* + * Zero the pointers in the saved copy. + */ + if (indirdep->ir_state & UFS1FMT) + start = sizeof(ufs1_daddr_t); + else + start = sizeof(ufs2_daddr_t); + start *= freework->fw_start; + count = indirdep->ir_savebp->b_bcount - start; + start += (uintptr_t)indirdep->ir_savebp->b_data; + bzero((char *)start, count); + /* + * We need to start the next truncation in the list if it has not + * been started yet. + */ + fwn = TAILQ_FIRST(&indirdep->ir_trunc); + if (fwn != NULL) { + if (fwn->fw_freeblks == indirdep->ir_freeblks) + TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); + if ((fwn->fw_state & ONWORKLIST) == 0) + freework_enqueue(fwn); + } + /* + * If bp is NULL the block was fully truncated, restore + * the saved block list otherwise free it if it is no + * longer needed. 
+ */ + if (TAILQ_EMPTY(&indirdep->ir_trunc)) { + if (bp == NULL) + bcopy(indirdep->ir_saveddata, + indirdep->ir_savebp->b_data, + indirdep->ir_savebp->b_bcount); + free(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = NULL; + } + /* + * When bp is NULL there is a full truncation pending. We + * must wait for this full truncation to be journaled before + * we can release this freework because the disk pointers will + * never be written as zero. + */ + if (bp == NULL) { + if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) + handle_written_freework(freework); + else + WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, + &freework->fw_list); + } else { + /* Complete when the real copy is written. */ + WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); + BUF_UNLOCK(bp); + } +} + +/* + * Calculate the number of blocks we are going to release where datablocks + * is the current total and length is the new file size. + */ +static ufs2_daddr_t +blkcount(fs, datablocks, length) + struct fs *fs; + ufs2_daddr_t datablocks; + off_t length; +{ + off_t totblks, numblks; + + totblks = 0; + numblks = howmany(length, fs->fs_bsize); + if (numblks <= NDADDR) { + totblks = howmany(length, fs->fs_fsize); + goto out; + } + totblks = blkstofrags(fs, numblks); + numblks -= NDADDR; + /* + * Count all single, then double, then triple indirects required. + * Subtracting one indirects worth of blocks for each pass + * acknowledges one of each pointed to by the inode. + */ + for (;;) { + totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); + numblks -= NINDIR(fs); + if (numblks <= 0) + break; + numblks = howmany(numblks, NINDIR(fs)); + } +out: + totblks = fsbtodb(fs, totblks); + /* + * Handle sparse files. We can't reclaim more blocks than the inode + * references. We will correct it later in handle_complete_freeblks() + * when we know the real count. + */ + if (totblks > datablocks) + return (0); + return (datablocks - totblks); +} + +/* + * Handle freeblocks for journaled softupdate filesystems. + * + * Contrary to normal softupdates, we must preserve the block pointers in + * indirects until their subordinates are free. This is to avoid journaling + * every block that is freed which may consume more space than the journal + * itself. The recovery program will see the free block journals at the + * base of the truncated area and traverse them to reclaim space. The + * pointers in the inode may be cleared immediately after the journal + * records are written because each direct and indirect pointer in the + * inode is recorded in a journal. This permits full truncation to proceed + * asynchronously. The write order is journal -> inode -> cgs -> indirects. + * + * The algorithm is as follows: + * 1) Traverse the in-memory state and create journal entries to release + * the relevant blocks and full indirect trees. + * 2) Traverse the indirect block chain adding partial truncation freework + * records to indirects in the path to lastlbn. The freework will + * prevent new allocation dependencies from being satisfied in this + * indirect until the truncation completes. + * 3) Read and lock the inode block, performing an update with the new size + * and pointers. This prevents truncated data from becoming valid on + * disk through step 4. + * 4) Reap unsatisfied dependencies that are beyond the truncated area, + * eliminate journal work for those records that do not require it. + * 5) Schedule the journal records to be written followed by the inode block. 
+ * 6) Allocate any necessary frags for the end of file. + * 7) Zero any partially truncated blocks. + * + * From this truncation proceeds asynchronously using the freework and + * indir_trunc machinery. The file will not be extended again into a + * partially truncated indirect block until all work is completed but + * the normal dependency mechanism ensures that it is rolled back/forward + * as appropriate. Further truncation may occur without delay and is + * serialized in indir_trunc(). + */ +void +softdep_journal_freeblocks(ip, cred, length, flags) + struct inode *ip; /* The inode whose length is to be reduced */ + struct ucred *cred; + off_t length; /* The new length for the file */ + int flags; /* IO_EXT and/or IO_NORMAL */ +{ + struct freeblks *freeblks, *fbn; + struct worklist *wk, *wkn; + struct inodedep *inodedep; + struct jblkdep *jblkdep; + struct allocdirect *adp, *adpn; + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; + struct vnode *vp; + struct mount *mp; + ufs2_daddr_t extblocks, datablocks; + ufs_lbn_t tmpval, lbn, lastlbn; + int frags, lastoff, iboff, allocblock, needj, error, i; + + ump = ITOUMP(ip); + mp = UFSTOVFS(ump); + fs = ump->um_fs; + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_journal_freeblocks called on non-softdep filesystem")); + vp = ITOV(ip); + needj = 1; + iboff = -1; + allocblock = 0; + extblocks = 0; + datablocks = 0; + frags = 0; + freeblks = newfreeblks(mp, ip); + ACQUIRE_LOCK(ump); + /* + * If we're truncating a removed file that will never be written + * we don't need to journal the block frees. The canceled journals + * for the allocations will suffice. + */ + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && + length == 0) + needj = 0; + CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d", + ip->i_number, length, needj); + FREE_LOCK(ump); + /* + * Calculate the lbn that we are truncating to. This results in -1 + * if we're truncating the 0 bytes. So it is the last lbn we want + * to keep, not the first lbn we want to truncate. + */ + lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; + lastoff = blkoff(fs, length); + /* + * Compute frags we are keeping in lastlbn. 0 means all. + */ + if (lastlbn >= 0 && lastlbn < NDADDR) { + frags = fragroundup(fs, lastoff); + /* adp offset of last valid allocdirect. */ + iboff = lastlbn; + } else if (lastlbn > 0) + iboff = NDADDR; + if (fs->fs_magic == FS_UFS2_MAGIC) + extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); + /* + * Handle normal data blocks and indirects. This section saves + * values used after the inode update to complete frag and indirect + * truncation. + */ + if ((flags & IO_NORMAL) != 0) { + /* + * Handle truncation of whole direct and indirect blocks. + */ + for (i = iboff + 1; i < NDADDR; i++) + setup_freedirect(freeblks, ip, i, needj); + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; + i++, lbn += tmpval, tmpval *= NINDIR(fs)) { + /* Release a whole indirect tree. */ + if (lbn > lastlbn) { + setup_freeindir(freeblks, ip, i, -lbn -i, + needj); + continue; + } + iboff = i + NDADDR; + /* + * Traverse partially truncated indirect tree. + */ + if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) + setup_trunc_indir(freeblks, ip, -lbn - i, + lastlbn, DIP(ip, i_ib[i])); + } + /* + * Handle partial truncation to a frag boundary. 
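+ * For example, on an 8K/1K filesystem (fs_bsize 8192, fs_fsize 1024) a
+ * file occupying one full block that is truncated to 3000 bytes gives
+ *
+ *	lastlbn = 0
+ *	frags   = fragroundup(fs, blkoff(fs, 3000)) = 3072
+ *
+ * so the code below keeps the first three fragments and hands the
+ * trailing five, starting at blkno + 3, to newfreework().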
+ */ + if (frags) { + ufs2_daddr_t blkno; + long oldfrags; + + oldfrags = blksize(fs, ip, lastlbn); + blkno = DIP(ip, i_db[lastlbn]); + if (blkno && oldfrags != frags) { + oldfrags -= frags; + oldfrags = numfrags(fs, oldfrags); + blkno += numfrags(fs, frags); + newfreework(ump, freeblks, NULL, lastlbn, + blkno, oldfrags, 0, needj); + if (needj) + adjust_newfreework(freeblks, + numfrags(fs, frags)); + } else if (blkno == 0) + allocblock = 1; + } + /* + * Add a journal record for partial truncate if we are + * handling indirect blocks. Non-indirects need no extra + * journaling. + */ + if (length != 0 && lastlbn >= NDADDR) { + ip->i_flag |= IN_TRUNCATED; + newjtrunc(freeblks, length, 0); + } + ip->i_size = length; + DIP_SET(ip, i_size, ip->i_size); + datablocks = DIP(ip, i_blocks) - extblocks; + if (length != 0) + datablocks = blkcount(fs, datablocks, length); + freeblks->fb_len = length; + } + if ((flags & IO_EXT) != 0) { + for (i = 0; i < NXADDR; i++) + setup_freeext(freeblks, ip, i, needj); + ip->i_din2->di_extsize = 0; + datablocks += extblocks; + } +#ifdef QUOTA + /* Reference the quotas in case the block count is wrong in the end. */ + quotaref(vp, freeblks->fb_quota); + (void) chkdq(ip, -datablocks, NOCRED, 0); +#endif + freeblks->fb_chkcnt = -datablocks; + UFS_LOCK(ump); + fs->fs_pendingblocks += datablocks; + UFS_UNLOCK(ump); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); + /* + * Handle truncation of incomplete alloc direct dependencies. We + * hold the inode block locked to prevent incomplete dependencies + * from reaching the disk while we are eliminating those that + * have been truncated. This is a partially inlined ffs_update(). + */ + ufs_itimes(vp); + ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, cred, &bp); + if (error) { + brelse(bp); + softdep_error("softdep_journal_freeblocks", error); + return; + } + if (bp->b_bufsize == fs->fs_bsize) + bp->b_flags |= B_CLUSTEROK; + softdep_update_inodeblock(ip, bp, 0); + if (ump->um_fstype == UFS1) + *((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; + else + *((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; + ACQUIRE_LOCK(ump); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & IOSTARTED) != 0) + panic("softdep_setup_freeblocks: inode busy"); + /* + * Add the freeblks structure to the list of operations that + * must await the zero'ed inode being written to disk. If we + * still have a bitmap dependency (needj), then the inode + * has never been written to disk, so we can process the + * freeblks below once we have deleted the dependencies. + */ + if (needj) + WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); + else + freeblks->fb_state |= COMPLETE; + if ((flags & IO_NORMAL) != 0) { + TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { + if (adp->ad_offset > iboff) + cancel_allocdirect(&inodedep->id_inoupdt, adp, + freeblks); + /* + * Truncate the allocdirect. We could eliminate + * or modify journal records as well. + */ + else if (adp->ad_offset == iboff && frags) + adp->ad_newsize = frags; + } + } + if ((flags & IO_EXT) != 0) + while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) + cancel_allocdirect(&inodedep->id_extupdt, adp, + freeblks); + /* + * Scan the bufwait list for newblock dependencies that will never + * make it to disk. 
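+ * Each matching allocdirect describes a block in the range being removed,
+ * so its journaled free is canceled (cancel_jfreeblk()) and the newblk is
+ * queued on fb_freeworkhd to be reclaimed along with the truncation.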
+ */ + LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { + if (wk->wk_type != D_ALLOCDIRECT) + continue; + adp = WK_ALLOCDIRECT(wk); + if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || + ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { + cancel_jfreeblk(freeblks, adp->ad_newblkno); + cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); + WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); + } + } + /* + * Add journal work. + */ + LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) + add_to_journal(&jblkdep->jb_list); + FREE_LOCK(ump); + bdwrite(bp); + /* + * Truncate dependency structures beyond length. + */ + trunc_dependencies(ip, freeblks, lastlbn, frags, flags); + /* + * This is only set when we need to allocate a fragment because + * none existed at the end of a frag-sized file. It handles only + * allocating a new, zero filled block. + */ + if (allocblock) { + ip->i_size = length - lastoff; + DIP_SET(ip, i_size, ip->i_size); + error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); + if (error != 0) { + softdep_error("softdep_journal_freeblks", error); + return; + } + ip->i_size = length; + DIP_SET(ip, i_size, length); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + allocbuf(bp, frags); + ffs_update(vp, 0); + bawrite(bp); + } else if (lastoff != 0 && vp->v_type != VDIR) { + int size; + + /* + * Zero the end of a truncated frag or block. + */ + size = sblksize(fs, length, lastlbn); + error = bread(vp, lastlbn, size, cred, &bp); + if (error) { + softdep_error("softdep_journal_freeblks", error); + return; + } + bzero((char *)bp->b_data + lastoff, size - lastoff); + bawrite(bp); + + } + ACQUIRE_LOCK(ump); + inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); + freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; + /* + * We zero earlier truncations so they don't erroneously + * update i_blocks. + */ + if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) + TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) + fbn->fb_len = 0; + if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && + LIST_EMPTY(&freeblks->fb_jblkdephd)) + freeblks->fb_state |= INPROGRESS; + else + freeblks = NULL; + FREE_LOCK(ump); + if (freeblks) + handle_workitem_freeblocks(freeblks, 0); + trunc_pages(ip, length, extblocks, flags); + +} + +/* + * Flush a JOP_SYNC to the journal. + */ +void +softdep_journal_fsync(ip) + struct inode *ip; +{ + struct jfsync *jfsync; + struct ufsmount *ump; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_journal_fsync called on non-softdep filesystem")); + if ((ip->i_flag & IN_TRUNCATED) == 0) + return; + ip->i_flag &= ~IN_TRUNCATED; + jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); + workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); + jfsync->jfs_size = ip->i_size; + jfsync->jfs_ino = ip->i_number; + ACQUIRE_LOCK(ump); + add_to_journal(&jfsync->jfs_list); + jwait(&jfsync->jfs_list, MNT_WAIT); + FREE_LOCK(ump); +} + +/* + * Block de-allocation dependencies. + * + * When blocks are de-allocated, the on-disk pointers must be nullified before + * the blocks are made available for use by other files. (The true + * requirement is that old pointers must be nullified before new on-disk + * pointers are set. We chose this slightly more stringent requirement to + * reduce complexity.) 
Our implementation handles this dependency by updating + * the inode (or indirect block) appropriately but delaying the actual block + * de-allocation (i.e., freemap and free space count manipulation) until + * after the updated versions reach stable storage. After the disk is + * updated, the blocks can be safely de-allocated whenever it is convenient. + * This implementation handles only the common case of reducing a file's + * length to zero. Other cases are handled by the conventional synchronous + * write approach. + * + * The ffs implementation with which we worked double-checks + * the state of the block pointers and file size as it reduces + * a file's length. Some of this code is replicated here in our + * soft updates implementation. The freeblks->fb_chkcnt field is + * used to transfer a part of this information to the procedure + * that eventually de-allocates the blocks. + * + * This routine should be called from the routine that shortens + * a file's length, before the inode's size or block pointers + * are modified. It will save the block pointer information for + * later release and zero the inode so that the calling routine + * can release it. + */ +void +softdep_setup_freeblocks(ip, length, flags) + struct inode *ip; /* The inode whose length is to be reduced */ + off_t length; /* The new length for the file */ + int flags; /* IO_EXT and/or IO_NORMAL */ +{ + struct ufs1_dinode *dp1; + struct ufs2_dinode *dp2; + struct freeblks *freeblks; + struct inodedep *inodedep; + struct allocdirect *adp; + struct ufsmount *ump; + struct buf *bp; + struct fs *fs; + ufs2_daddr_t extblocks, datablocks; + struct mount *mp; + int i, delay, error; + ufs_lbn_t tmpval; + ufs_lbn_t lbn; + + ump = ITOUMP(ip); + mp = UFSTOVFS(ump); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_freeblocks called on non-softdep filesystem")); + CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", + ip->i_number, length); + KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); + fs = ump->um_fs; + if ((error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp)) != 0) { + brelse(bp); + softdep_error("softdep_setup_freeblocks", error); + return; + } + freeblks = newfreeblks(mp, ip); + extblocks = 0; + datablocks = 0; + if (fs->fs_magic == FS_UFS2_MAGIC) + extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); + if ((flags & IO_NORMAL) != 0) { + for (i = 0; i < NDADDR; i++) + setup_freedirect(freeblks, ip, i, 0); + for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; + i++, lbn += tmpval, tmpval *= NINDIR(fs)) + setup_freeindir(freeblks, ip, i, -lbn -i, 0); + ip->i_size = 0; + DIP_SET(ip, i_size, 0); + datablocks = DIP(ip, i_blocks) - extblocks; + } + if ((flags & IO_EXT) != 0) { + for (i = 0; i < NXADDR; i++) + setup_freeext(freeblks, ip, i, 0); + ip->i_din2->di_extsize = 0; + datablocks += extblocks; + } +#ifdef QUOTA + /* Reference the quotas in case the block count is wrong in the end. */ + quotaref(ITOV(ip), freeblks->fb_quota); + (void) chkdq(ip, -datablocks, NOCRED, 0); +#endif + freeblks->fb_chkcnt = -datablocks; + UFS_LOCK(ump); + fs->fs_pendingblocks += datablocks; + UFS_UNLOCK(ump); + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); + /* + * Push the zero'ed inode to its disk buffer so that we are free + * to delete its dependencies below. Once the dependencies are gone + * the buffer can be safely released. 
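+ * The on-disk di_freelink is copied back into the in-core dinode first so
+ * that the unlinked-inode list maintained on disk is not overwritten by a
+ * stale in-core value.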
+ */ + if (ump->um_fstype == UFS1) { + dp1 = ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)); + ip->i_din1->di_freelink = dp1->di_freelink; + *dp1 = *ip->i_din1; + } else { + dp2 = ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number)); + ip->i_din2->di_freelink = dp2->di_freelink; + *dp2 = *ip->i_din2; + } + /* + * Find and eliminate any inode dependencies. + */ + ACQUIRE_LOCK(ump); + (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); + if ((inodedep->id_state & IOSTARTED) != 0) + panic("softdep_setup_freeblocks: inode busy"); + /* + * Add the freeblks structure to the list of operations that + * must await the zero'ed inode being written to disk. If we + * still have a bitmap dependency (delay == 0), then the inode + * has never been written to disk, so we can process the + * freeblks below once we have deleted the dependencies. + */ + delay = (inodedep->id_state & DEPCOMPLETE); + if (delay) + WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); + else + freeblks->fb_state |= COMPLETE; + /* + * Because the file length has been truncated to zero, any + * pending block allocation dependency structures associated + * with this inode are obsolete and can simply be de-allocated. + * We must first merge the two dependency lists to get rid of + * any duplicate freefrag structures, then purge the merged list. + * If we still have a bitmap dependency, then the inode has never + * been written to disk, so we can free any fragments without delay. + */ + if (flags & IO_NORMAL) { + merge_inode_lists(&inodedep->id_newinoupdt, + &inodedep->id_inoupdt); + while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) + cancel_allocdirect(&inodedep->id_inoupdt, adp, + freeblks); + } + if (flags & IO_EXT) { + merge_inode_lists(&inodedep->id_newextupdt, + &inodedep->id_extupdt); + while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) + cancel_allocdirect(&inodedep->id_extupdt, adp, + freeblks); + } + FREE_LOCK(ump); + bdwrite(bp); + trunc_dependencies(ip, freeblks, -1, 0, flags); + ACQUIRE_LOCK(ump); + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) + (void) free_inodedep(inodedep); + freeblks->fb_state |= DEPCOMPLETE; + /* + * If the inode with zeroed block pointers is now on disk + * we can start freeing blocks. + */ + if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) + freeblks->fb_state |= INPROGRESS; + else + freeblks = NULL; + FREE_LOCK(ump); + if (freeblks) + handle_workitem_freeblocks(freeblks, 0); + trunc_pages(ip, length, extblocks, flags); +} + +/* + * Eliminate pages from the page cache that back parts of this inode and + * adjust the vnode pager's idea of our size. This prevents stale data + * from hanging around in the page cache. + */ +static void +trunc_pages(ip, length, extblocks, flags) + struct inode *ip; + off_t length; + ufs2_daddr_t extblocks; + int flags; +{ + struct vnode *vp; + struct fs *fs; + ufs_lbn_t lbn; + off_t end, extend; + + vp = ITOV(ip); + fs = ITOFS(ip); + extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); + if ((flags & IO_EXT) != 0) + vn_pages_remove(vp, extend, 0); + if ((flags & IO_NORMAL) == 0) + return; + BO_LOCK(&vp->v_bufobj); + drain_output(vp); + BO_UNLOCK(&vp->v_bufobj); + /* + * The vnode pager eliminates file pages we eliminate indirects + * below. + */ + vnode_pager_setsize(vp, length); + /* + * Calculate the end based on the last indirect we want to keep. If + * the block extends into indirects we can just use the negative of + * its lbn. 
Doubles and triples exist at lower numbers so we must + * be careful not to remove those, if they exist. double and triple + * indirect lbns do not overlap with others so it is not important + * to verify how many levels are required. + */ + lbn = lblkno(fs, length); + if (lbn >= NDADDR) { + /* Calculate the virtual lbn of the triple indirect. */ + lbn = -lbn - (NIADDR - 1); + end = OFF_TO_IDX(lblktosize(fs, lbn)); + } else + end = extend; + vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); +} + +/* + * See if the buf bp is in the range eliminated by truncation. + */ +static int +trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) + struct buf *bp; + int *blkoffp; + ufs_lbn_t lastlbn; + int lastoff; + int flags; +{ + ufs_lbn_t lbn; + + *blkoffp = 0; + /* Only match ext/normal blocks as appropriate. */ + if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || + ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) + return (0); + /* ALTDATA is always a full truncation. */ + if ((bp->b_xflags & BX_ALTDATA) != 0) + return (1); + /* -1 is full truncation. */ + if (lastlbn == -1) + return (1); + /* + * If this is a partial truncate we only want those + * blocks and indirect blocks that cover the range + * we're after. + */ + lbn = bp->b_lblkno; + if (lbn < 0) + lbn = -(lbn + lbn_level(lbn)); + if (lbn < lastlbn) + return (0); + /* Here we only truncate lblkno if it's partial. */ + if (lbn == lastlbn) { + if (lastoff == 0) + return (0); + *blkoffp = lastoff; + } + return (1); +} + +/* + * Eliminate any dependencies that exist in memory beyond lblkno:off + */ +static void +trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) + struct inode *ip; + struct freeblks *freeblks; + ufs_lbn_t lastlbn; + int lastoff; + int flags; +{ + struct bufobj *bo; + struct vnode *vp; + struct buf *bp; + int blkoff; + + /* + * We must wait for any I/O in progress to finish so that + * all potential buffers on the dirty list will be visible. + * Once they are all there, walk the list and get rid of + * any dependencies. + */ + vp = ITOV(ip); + bo = &vp->v_bufobj; + BO_LOCK(bo); + drain_output(vp); + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) + bp->b_vflags &= ~BV_SCANNED; +restart: + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + if (bp->b_vflags & BV_SCANNED) + continue; + if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { + bp->b_vflags |= BV_SCANNED; + continue; + } + KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer")); + if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL) + goto restart; + BO_UNLOCK(bo); + if (deallocate_dependencies(bp, freeblks, blkoff)) + bqrelse(bp); + else + brelse(bp); + BO_LOCK(bo); + goto restart; + } + /* + * Now do the work of vtruncbuf while also matching indirect blocks. 
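+ * Unlike vtruncbuf(), this scan runs the buffers through trunc_check_buf()
+ * so that indirect buffers covering the retained range are kept, and a
+ * partially truncated block is shrunk with allocbuf() rather than
+ * invalidated.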
+ */ + TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) + bp->b_vflags &= ~BV_SCANNED; +cleanrestart: + TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { + if (bp->b_vflags & BV_SCANNED) + continue; + if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { + bp->b_vflags |= BV_SCANNED; + continue; + } + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) == ENOLCK) { + BO_LOCK(bo); + goto cleanrestart; + } + bp->b_vflags |= BV_SCANNED; + bremfree(bp); + if (blkoff != 0) { + allocbuf(bp, blkoff); + bqrelse(bp); + } else { + bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; + brelse(bp); + } + BO_LOCK(bo); + goto cleanrestart; + } + drain_output(vp); + BO_UNLOCK(bo); +} + +static int +cancel_pagedep(pagedep, freeblks, blkoff) + struct pagedep *pagedep; + struct freeblks *freeblks; + int blkoff; +{ + struct jremref *jremref; + struct jmvref *jmvref; + struct dirrem *dirrem, *tmp; + int i; + + /* + * Copy any directory remove dependencies to the list + * to be processed after the freeblks proceeds. If + * directory entry never made it to disk they + * can be dumped directly onto the work list. + */ + LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { + /* Skip this directory removal if it is intended to remain. */ + if (dirrem->dm_offset < blkoff) + continue; + /* + * If there are any dirrems we wait for the journal write + * to complete and then restart the buf scan as the lock + * has been dropped. + */ + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { + jwait(&jremref->jr_list, MNT_WAIT); + return (ERESTART); + } + LIST_REMOVE(dirrem, dm_next); + dirrem->dm_dirinum = pagedep->pd_ino; + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); + } + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { + jwait(&jmvref->jm_list, MNT_WAIT); + return (ERESTART); + } + /* + * When we're partially truncating a pagedep we just want to flush + * journal entries and return. There can not be any adds in the + * truncated portion of the directory and newblk must remain if + * part of the block remains. + */ + if (blkoff != 0) { + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset > blkoff) + panic("cancel_pagedep: diradd %p off %d > %d", + dap, dap->da_offset, blkoff); + for (i = 0; i < DAHASHSZ; i++) + LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) + if (dap->da_offset > blkoff) + panic("cancel_pagedep: diradd %p off %d > %d", + dap, dap->da_offset, blkoff); + return (0); + } + /* + * There should be no directory add dependencies present + * as the directory could not be truncated until all + * children were removed. + */ + KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, + ("deallocate_dependencies: pendinghd != NULL")); + for (i = 0; i < DAHASHSZ; i++) + KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, + ("deallocate_dependencies: diraddhd != NULL")); + if ((pagedep->pd_state & NEWBLOCK) != 0) + free_newdirblk(pagedep->pd_newdirblk); + if (free_pagedep(pagedep) == 0) + panic("Failed to free pagedep %p", pagedep); + return (0); +} + +/* + * Reclaim any dependency structures from a buffer that is about to + * be reallocated to a new vnode. The buffer must be locked, thus, + * no I/O completion operations can occur while we are manipulating + * its associated dependencies. The mutex is held so that other I/O's + * associated with related dependencies do not occur. 
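+ * A non-zero return tells trunc_dependencies() to release the buffer with
+ * bqrelse() instead of invalidating it: either the lock was dropped to
+ * wait on a journal write and the scan must restart, or the buffer was
+ * only partially truncated and keeps its remaining dependencies.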
+ */ +static int +deallocate_dependencies(bp, freeblks, off) + struct buf *bp; + struct freeblks *freeblks; + int off; +{ + struct indirdep *indirdep; + struct pagedep *pagedep; + struct worklist *wk, *wkn; + struct ufsmount *ump; + + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + goto done; + ACQUIRE_LOCK(ump); + LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { + switch (wk->wk_type) { + case D_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + if (bp->b_lblkno >= 0 || + bp->b_blkno != indirdep->ir_savebp->b_lblkno) + panic("deallocate_dependencies: not indir"); + cancel_indirdep(indirdep, bp, freeblks); + continue; + + case D_PAGEDEP: + pagedep = WK_PAGEDEP(wk); + if (cancel_pagedep(pagedep, freeblks, off)) { + FREE_LOCK(ump); + return (ERESTART); + } + continue; + + case D_ALLOCINDIR: + /* + * Simply remove the allocindir, we'll find it via + * the indirdep where we can clear pointers if + * needed. + */ + WORKLIST_REMOVE(wk); + continue; + + case D_FREEWORK: + /* + * A truncation is waiting for the zero'd pointers + * to be written. It can be freed when the freeblks + * is journaled. + */ + WORKLIST_REMOVE(wk); + wk->wk_state |= ONDEPLIST; + WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); + break; + + case D_ALLOCDIRECT: + if (off != 0) + continue; + /* FALLTHROUGH */ + default: + panic("deallocate_dependencies: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + FREE_LOCK(ump); +done: + /* + * Don't throw away this buf, we were partially truncating and + * some deps may always remain. + */ + if (off) { + allocbuf(bp, off); + bp->b_vflags |= BV_SCANNED; + return (EBUSY); + } + bp->b_flags |= B_INVAL | B_NOCACHE; + + return (0); +} + +/* + * An allocdirect is being canceled due to a truncate. We must make sure + * the journal entry is released in concert with the blkfree that releases + * the storage. Completed journal entries must not be released until the + * space is no longer pointed to by the inode or in the bitmap. + */ +static void +cancel_allocdirect(adphead, adp, freeblks) + struct allocdirectlst *adphead; + struct allocdirect *adp; + struct freeblks *freeblks; +{ + struct freework *freework; + struct newblk *newblk; + struct worklist *wk; + + TAILQ_REMOVE(adphead, adp, ad_next); + newblk = (struct newblk *)adp; + freework = NULL; + /* + * Find the correct freework structure. + */ + LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { + if (wk->wk_type != D_FREEWORK) + continue; + freework = WK_FREEWORK(wk); + if (freework->fw_blkno == newblk->nb_newblkno) + break; + } + if (freework == NULL) + panic("cancel_allocdirect: Freework not found"); + /* + * If a newblk exists at all we still have the journal entry that + * initiated the allocation so we do not need to journal the free. + */ + cancel_jfreeblk(freeblks, freework->fw_blkno); + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_blkfree that reclaims the space. We accomplish + * this by linking the journal dependency into the freework to be + * freed when freework_freeblock() is called. If the journal has + * been written we can simply reclaim the journal space when the + * freeblks work is complete. + */ + freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, + &freeblks->fb_jwork); + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); +} + + +/* + * Cancel a new block allocation. May be an indirect or direct block. We + * remove it from various lists and return any journal record that needs to + * be resolved by the caller. 
+ * + * A special consideration is made for indirects which were never pointed + * at on disk and will never be found once this block is released. + */ +static struct jnewblk * +cancel_newblk(newblk, wk, wkhd) + struct newblk *newblk; + struct worklist *wk; + struct workhead *wkhd; +{ + struct jnewblk *jnewblk; + + CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); + + newblk->nb_state |= GOINGAWAY; + /* + * Previously we traversed the completedhd on each indirdep + * attached to this newblk to cancel them and gather journal + * work. Since we need only the oldest journal segment and + * the lowest point on the tree will always have the oldest + * journal segment we are free to release the segments + * of any subordinates and may leave the indirdep list to + * indirdep_complete() when this newblk is freed. + */ + if (newblk->nb_state & ONDEPLIST) { + newblk->nb_state &= ~ONDEPLIST; + LIST_REMOVE(newblk, nb_deps); + } + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + /* + * If the journal entry hasn't been written we save a pointer to + * the dependency that frees it until it is written or the + * superseding operation completes. + */ + jnewblk = newblk->nb_jnewblk; + if (jnewblk != NULL && wk != NULL) { + newblk->nb_jnewblk = NULL; + jnewblk->jn_dep = wk; + } + if (!LIST_EMPTY(&newblk->nb_jwork)) + jwork_move(wkhd, &newblk->nb_jwork); + /* + * When truncating we must free the newdirblk early to remove + * the pagedep from the hash before returning. + */ + if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) + free_newdirblk(WK_NEWDIRBLK(wk)); + if (!LIST_EMPTY(&newblk->nb_newdirblk)) + panic("cancel_newblk: extra newdirblk"); + + return (jnewblk); +} + +/* + * Schedule the freefrag associated with a newblk to be released once + * the pointers are written and the previous block is no longer needed. + */ +static void +newblk_freefrag(newblk) + struct newblk *newblk; +{ + struct freefrag *freefrag; + + if (newblk->nb_freefrag == NULL) + return; + freefrag = newblk->nb_freefrag; + newblk->nb_freefrag = NULL; + freefrag->ff_state |= COMPLETE; + if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(&freefrag->ff_list, 0); +} + +/* + * Free a newblk. Generate a new freefrag work request if appropriate. + * This must be called after the inode pointer and any direct block pointers + * are valid or fully removed via truncate or frag extension. + */ +static void +free_newblk(newblk) + struct newblk *newblk; +{ + struct indirdep *indirdep; + struct worklist *wk; + + KASSERT(newblk->nb_jnewblk == NULL, + ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk)); + KASSERT(newblk->nb_list.wk_type != D_NEWBLK, + ("free_newblk: unclaimed newblk")); + LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp)); + newblk_freefrag(newblk); + if (newblk->nb_state & ONDEPLIST) + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_state & ONWORKLIST) + WORKLIST_REMOVE(&newblk->nb_list); + LIST_REMOVE(newblk, nb_hash); + if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) + free_newdirblk(WK_NEWDIRBLK(wk)); + if (!LIST_EMPTY(&newblk->nb_newdirblk)) + panic("free_newblk: extra newdirblk"); + while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) + indirdep_complete(indirdep); + handle_jwork(&newblk->nb_jwork); + WORKITEM_FREE(newblk, D_NEWBLK); +} + +/* + * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. + * This routine must be called with splbio interrupts blocked. 
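+ * (In the current code this means the per-filesystem softdep lock must be
+ * held, as asserted by LOCK_OWNED() below; the splbio reference is
+ * historical.)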
+ */ +static void +free_newdirblk(newdirblk) + struct newdirblk *newdirblk; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp)); + WORKLIST_REMOVE(&newdirblk->db_list); + /* + * If the pagedep is still linked onto the directory buffer + * dependency chain, then some of the entries on the + * pd_pendinghd list may not be committed to disk yet. In + * this case, we will simply clear the NEWBLOCK flag and + * let the pd_pendinghd list be processed when the pagedep + * is next written. If the pagedep is no longer on the buffer + * dependency chain, then all the entries on the pd_pending + * list are committed to disk and we can free them here. + */ + pagedep = newdirblk->db_pagedep; + pagedep->pd_state &= ~NEWBLOCK; + if ((pagedep->pd_state & ONWORKLIST) == 0) { + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) + free_diradd(dap, NULL); + /* + * If no dependencies remain, the pagedep will be freed. + */ + free_pagedep(pagedep); + } + /* Should only ever be one item in the list. */ + while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { + WORKLIST_REMOVE(wk); + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + } + WORKITEM_FREE(newdirblk, D_NEWDIRBLK); +} + +/* + * Prepare an inode to be freed. The actual free operation is not + * done until the zero'ed inode has been written to disk. + */ +void +softdep_freefile(pvp, ino, mode) + struct vnode *pvp; + ino_t ino; + int mode; +{ + struct inode *ip = VTOI(pvp); + struct inodedep *inodedep; + struct freefile *freefile; + struct freeblks *freeblks; + struct ufsmount *ump; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_freefile called on non-softdep filesystem")); + /* + * This sets up the inode de-allocation dependency. + */ + freefile = malloc(sizeof(struct freefile), + M_FREEFILE, M_SOFTDEP_FLAGS); + workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); + freefile->fx_mode = mode; + freefile->fx_oldinum = ino; + freefile->fx_devvp = ump->um_devvp; + LIST_INIT(&freefile->fx_jwork); + UFS_LOCK(ump); + ump->um_fs->fs_pendinginodes += 1; + UFS_UNLOCK(ump); + + /* + * If the inodedep does not exist, then the zero'ed inode has + * been written to disk. If the allocated inode has never been + * written to disk, then the on-disk inode is zero'ed. In either + * case we can free the file immediately. If the journal was + * canceled before being written the inode will never make it to + * disk and we must send the canceled journal entrys to + * ffs_freefile() to be cleared in conjunction with the bitmap. + * Any blocks waiting on the inode to write can be safely freed + * here as it will never been written. + */ + ACQUIRE_LOCK(ump); + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + if (inodedep) { + /* + * Clear out freeblks that no longer need to reference + * this inode. + */ + while ((freeblks = + TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { + TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, + fb_next); + freeblks->fb_state &= ~ONDEPLIST; + } + /* + * Remove this inode from the unlinked list. + */ + if (inodedep->id_state & UNLINKED) { + /* + * Save the journal work to be freed with the bitmap + * before we clear UNLINKED. Otherwise it can be lost + * if the inode block is written. + */ + handle_bufwait(inodedep, &freefile->fx_jwork); + clear_unlinked_inodedep(inodedep); + /* + * Re-acquire inodedep as we've dropped the + * per-filesystem lock in clear_unlinked_inodedep(). 
+ */ + inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); + } + } + if (inodedep == NULL || check_inode_unwritten(inodedep)) { + FREE_LOCK(ump); + handle_workitem_freefile(freefile); + return; + } + if ((inodedep->id_state & DEPCOMPLETE) == 0) + inodedep->id_state |= GOINGAWAY; + WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); + FREE_LOCK(ump); + if (ip->i_number == ino) + ip->i_flag |= IN_MODIFIED; +} + +/* + * Check to see if an inode has never been written to disk. If + * so free the inodedep and return success, otherwise return failure. + * This routine must be called with splbio interrupts blocked. + * + * If we still have a bitmap dependency, then the inode has never + * been written to disk. Drop the dependency as it is no longer + * necessary since the inode is being deallocated. We set the + * ALLCOMPLETE flags since the bitmap now properly shows that the + * inode is not allocated. Even if the inode is actively being + * written, it has been rolled back to its zero'ed state, so we + * are ensured that a zero inode is what is on the disk. For short + * lived files, this change will usually result in removing all the + * dependencies from the inode so that it can be freed immediately. + */ +static int +check_inode_unwritten(inodedep) + struct inodedep *inodedep; +{ + + LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); + + if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || + !LIST_EMPTY(&inodedep->id_dirremhd) || + !LIST_EMPTY(&inodedep->id_pendinghd) || + !LIST_EMPTY(&inodedep->id_bufwait) || + !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoreflst) || + !TAILQ_EMPTY(&inodedep->id_inoupdt) || + !TAILQ_EMPTY(&inodedep->id_newinoupdt) || + !TAILQ_EMPTY(&inodedep->id_extupdt) || + !TAILQ_EMPTY(&inodedep->id_newextupdt) || + !TAILQ_EMPTY(&inodedep->id_freeblklst) || + inodedep->id_mkdiradd != NULL || + inodedep->id_nlinkdelta != 0) + return (0); + /* + * Another process might be in initiate_write_inodeblock_ufs[12] + * trying to allocate memory without holding "Softdep Lock". + */ + if ((inodedep->id_state & IOSTARTED) != 0 && + inodedep->id_savedino1 == NULL) + return (0); + + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); + inodedep->id_state &= ~ONDEPLIST; + inodedep->id_state |= ALLCOMPLETE; + inodedep->id_bmsafemap = NULL; + if (inodedep->id_state & ONWORKLIST) + WORKLIST_REMOVE(&inodedep->id_list); + if (inodedep->id_savedino1 != NULL) { + free(inodedep->id_savedino1, M_SAVEDINO); + inodedep->id_savedino1 = NULL; + } + if (free_inodedep(inodedep) == 0) + panic("check_inode_unwritten: busy inode"); + return (1); +} + +static int +check_inodedep_free(inodedep) + struct inodedep *inodedep; +{ + + LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); + if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || + !LIST_EMPTY(&inodedep->id_dirremhd) || + !LIST_EMPTY(&inodedep->id_pendinghd) || + !LIST_EMPTY(&inodedep->id_bufwait) || + !LIST_EMPTY(&inodedep->id_inowait) || + !TAILQ_EMPTY(&inodedep->id_inoreflst) || + !TAILQ_EMPTY(&inodedep->id_inoupdt) || + !TAILQ_EMPTY(&inodedep->id_newinoupdt) || + !TAILQ_EMPTY(&inodedep->id_extupdt) || + !TAILQ_EMPTY(&inodedep->id_newextupdt) || + !TAILQ_EMPTY(&inodedep->id_freeblklst) || + inodedep->id_mkdiradd != NULL || + inodedep->id_nlinkdelta != 0 || + inodedep->id_savedino1 != NULL) + return (0); + return (1); +} + +/* + * Try to free an inodedep structure. Return 1 if it could be freed. 
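+ *
+ * (Illustrative sketch, not from the original source; simplified,
+ * hypothetical types.)  Predicates such as check_inodedep_free() above
+ * are just "is every per-object dependency list drained and every counter
+ * back to zero" checks:
+ *
+ *    #include <sys/queue.h>
+ *
+ *    struct xdep { LIST_ENTRY(xdep) link; };
+ *    LIST_HEAD(xdeplist, xdep);
+ *
+ *    struct xnode {
+ *        struct xdeplist bufwait;
+ *        struct xdeplist inowait;
+ *        int             nlinkdelta;
+ *    };
+ *
+ *    // Freeable only when nothing still hangs off the node.
+ *    static int
+ *    xnode_is_quiescent(const struct xnode *n)
+ *    {
+ *        return (LIST_EMPTY(&n->bufwait) && LIST_EMPTY(&n->inowait) &&
+ *            n->nlinkdelta == 0);
+ *    }
+ *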
+ */ +static int +free_inodedep(inodedep) + struct inodedep *inodedep; +{ + + LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); + if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || + !check_inodedep_free(inodedep)) + return (0); + if (inodedep->id_state & ONDEPLIST) + LIST_REMOVE(inodedep, id_deps); + LIST_REMOVE(inodedep, id_hash); + WORKITEM_FREE(inodedep, D_INODEDEP); + return (1); +} + +/* + * Free the block referenced by a freework structure. The parent freeblks + * structure is released and completed when the final cg bitmap reaches + * the disk. This routine may be freeing a jnewblk which never made it to + * disk in which case we do not have to wait as the operation is undone + * in memory immediately. + */ +static void +freework_freeblock(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct jnewblk *jnewblk; + struct ufsmount *ump; + struct workhead wkhd; + struct fs *fs; + int bsize; + int needj; + + ump = VFSTOUFS(freework->fw_list.wk_mp); + LOCK_OWNED(ump); + /* + * Handle partial truncate separately. + */ + if (freework->fw_indir) { + complete_trunc_indir(freework); + return; + } + freeblks = freework->fw_freeblks; + fs = ump->um_fs; + needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; + bsize = lfragtosize(fs, freework->fw_frags); + LIST_INIT(&wkhd); + /* + * DEPCOMPLETE is cleared in indirblk_insert() if the block lives + * on the indirblk hashtable and prevents premature freeing. + */ + freework->fw_state |= DEPCOMPLETE; + /* + * SUJ needs to wait for the segment referencing freed indirect + * blocks to expire so that we know the checker will not confuse + * a re-allocated indirect block with its old contents. + */ + if (needj && freework->fw_lbn <= -NDADDR) + indirblk_insert(freework); + /* + * If we are canceling an existing jnewblk pass it to the free + * routine, otherwise pass the freeblk which will ultimately + * release the freeblks. If we're not journaling, we can just + * free the freeblks immediately. + */ + jnewblk = freework->fw_jnewblk; + if (jnewblk != NULL) { + cancel_jnewblk(jnewblk, &wkhd); + needj = 0; + } else if (needj) { + freework->fw_state |= DELAYEDFREE; + freeblks->fb_cgwait++; + WORKLIST_INSERT(&wkhd, &freework->fw_list); + } + FREE_LOCK(ump); + freeblks_free(ump, freeblks, btodb(bsize)); + CTR4(KTR_SUJ, + "freework_freeblock: ino %d blkno %jd lbn %jd size %ld", + freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); + ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, + freeblks->fb_inum, freeblks->fb_vtype, &wkhd); + ACQUIRE_LOCK(ump); + /* + * The jnewblk will be discarded and the bits in the map never + * made it to disk. We can immediately free the freeblk. + */ + if (needj == 0) + handle_written_freework(freework); +} + +/* + * We enqueue freework items that need processing back on the freeblks and + * add the freeblks to the worklist. This makes it easier to find all work + * required to flush a truncation in process_truncates(). + */ +static void +freework_enqueue(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + + freeblks = freework->fw_freeblks; + if ((freework->fw_state & INPROGRESS) == 0) + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); + if ((freeblks->fb_state & + (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && + LIST_EMPTY(&freeblks->fb_jblkdephd)) + add_to_worklist(&freeblks->fb_list, WK_NODELAY); +} + +/* + * Start, continue, or finish the process of freeing an indirect block tree. 
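+ *
+ * (Illustrative sketch, not from the original source; hypothetical names.)
+ * The pause/resume flow control described below, where fw_off records how
+ * far the pointer array has been processed, is the usual "bounded work per
+ * pass, remember where to restart" loop:
+ *
+ *    // Process at most 'budget' slots, saving the restart point in *offp;
+ *    // returns nonzero once the whole array has been handled.
+ *    static int
+ *    drain_some(int *offp, int nslots, int budget, void (*one)(int))
+ *    {
+ *        int i = *offp;
+ *
+ *        while (i < nslots && budget-- > 0)
+ *            one(i++);
+ *        *offp = i;
+ *        return (i == nslots);
+ *    }
+ *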
+ * The free operation may be paused at any point with fw_off containing the + * offset to restart from. This enables us to implement some flow control + * for large truncates which may fan out and generate a huge number of + * dependencies. + */ +static void +handle_workitem_indirblk(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct ufsmount *ump; + struct fs *fs; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + if (freework->fw_state & DEPCOMPLETE) { + handle_written_freework(freework); + return; + } + if (freework->fw_off == NINDIR(fs)) { + freework_freeblock(freework); + return; + } + freework->fw_state |= INPROGRESS; + FREE_LOCK(ump); + indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), + freework->fw_lbn); + ACQUIRE_LOCK(ump); +} + +/* + * Called when a freework structure attached to a cg buf is written. The + * ref on either the parent or the freeblks structure is released and + * the freeblks is added back to the worklist if there is more work to do. + */ +static void +handle_written_freework(freework) + struct freework *freework; +{ + struct freeblks *freeblks; + struct freework *parent; + + freeblks = freework->fw_freeblks; + parent = freework->fw_parent; + if (freework->fw_state & DELAYEDFREE) + freeblks->fb_cgwait--; + freework->fw_state |= COMPLETE; + if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) + WORKITEM_FREE(freework, D_FREEWORK); + if (parent) { + if (--parent->fw_ref == 0) + freework_enqueue(parent); + return; + } + if (--freeblks->fb_ref != 0) + return; + if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == + ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) + add_to_worklist(&freeblks->fb_list, WK_NODELAY); +} + +/* + * This workitem routine performs the block de-allocation. + * The workitem is added to the pending list after the updated + * inode block has been written to disk. As mentioned above, + * checks regarding the number of blocks de-allocated (compared + * to the number of blocks allocated for the file) are also + * performed in this function. 
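+ *
+ * (Illustrative sketch, not from the original source; hypothetical,
+ * simplified names.)  The parent/child accounting used by
+ * handle_written_freework() above is a plain completion refcount: the
+ * parent is only scheduled once its last outstanding child reports in.
+ *
+ *    struct xparent { int ref; };
+ *
+ *    // Called as each child finishes.
+ *    static void
+ *    xchild_done(struct xparent *p, void (*schedule)(struct xparent *))
+ *    {
+ *        if (--p->ref == 0)
+ *            schedule(p);
+ *    }
+ *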
+ */ +static int +handle_workitem_freeblocks(freeblks, flags) + struct freeblks *freeblks; + int flags; +{ + struct freework *freework; + struct newblk *newblk; + struct allocindir *aip; + struct ufsmount *ump; + struct worklist *wk; + + KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), + ("handle_workitem_freeblocks: Journal entries not written.")); + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + ACQUIRE_LOCK(ump); + while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk, 0); + continue; + + case D_ALLOCDIRECT: + free_newblk(WK_NEWBLK(wk)); + continue; + + case D_ALLOCINDIR: + aip = WK_ALLOCINDIR(wk); + freework = NULL; + if (aip->ai_state & DELAYEDFREE) { + FREE_LOCK(ump); + freework = newfreework(ump, freeblks, NULL, + aip->ai_lbn, aip->ai_newblkno, + ump->um_fs->fs_frag, 0, 0); + ACQUIRE_LOCK(ump); + } + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk) { + freework->fw_jnewblk = newblk->nb_jnewblk; + newblk->nb_jnewblk->jn_dep = &freework->fw_list; + newblk->nb_jnewblk = NULL; + } + free_newblk(newblk); + continue; + + case D_FREEWORK: + freework = WK_FREEWORK(wk); + if (freework->fw_lbn <= -NDADDR) + handle_workitem_indirblk(freework); + else + freework_freeblock(freework); + continue; + default: + panic("handle_workitem_freeblocks: Unknown type %s", + TYPENAME(wk->wk_type)); + } + } + if (freeblks->fb_ref != 0) { + freeblks->fb_state &= ~INPROGRESS; + wake_worklist(&freeblks->fb_list); + freeblks = NULL; + } + FREE_LOCK(ump); + if (freeblks) + return handle_complete_freeblocks(freeblks, flags); + return (0); +} + +/* + * Handle completion of block free via truncate. This allows fs_pending + * to track the actual free block count more closely than if we only updated + * it at the end. We must be careful to handle cases where the block count + * on free was incorrect. + */ +static void +freeblks_free(ump, freeblks, blocks) + struct ufsmount *ump; + struct freeblks *freeblks; + int blocks; +{ + struct fs *fs; + ufs2_daddr_t remain; + + UFS_LOCK(ump); + remain = -freeblks->fb_chkcnt; + freeblks->fb_chkcnt += blocks; + if (remain > 0) { + if (remain < blocks) + blocks = remain; + fs = ump->um_fs; + fs->fs_pendingblocks -= blocks; + } + UFS_UNLOCK(ump); +} + +/* + * Once all of the freework workitems are complete we can retire the + * freeblocks dependency and any journal work awaiting completion. This + * can not be called until all other dependencies are stable on disk. + */ +static int +handle_complete_freeblocks(freeblks, flags) + struct freeblks *freeblks; + int flags; +{ + struct inodedep *inodedep; + struct inode *ip; + struct vnode *vp; + struct fs *fs; + struct ufsmount *ump; + ufs2_daddr_t spare; + + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + flags = LK_EXCLUSIVE | flags; + spare = freeblks->fb_chkcnt; + + /* + * If we did not release the expected number of blocks we may have + * to adjust the inode block count here. Only do so if it wasn't + * a truncation to zero and the modrev still matches. + */ + if (spare && freeblks->fb_len != 0) { + if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, + flags, &vp, FFSV_FORCEINSMQ) != 0) + return (EBUSY); + ip = VTOI(vp); + if (DIP(ip, i_modrev) == freeblks->fb_modrev) { + DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); + ip->i_flag |= IN_CHANGE; + /* + * We must wait so this happens before the + * journal is reclaimed. 
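+ *
+ * (Illustrative sketch, not from the original source; hypothetical names.)
+ * The accounting done by freeblks_free() above credits the filesystem-wide
+ * pending counter with at most the amount still outstanding for this
+ * truncation, guarding against an over-estimate:
+ *
+ *    static void
+ *    credit_pending(long *pending, long *chkcnt, long blocks)
+ *    {
+ *        long remain = -*chkcnt;        // still expected to be freed
+ *
+ *        *chkcnt += blocks;
+ *        if (remain > 0)
+ *            *pending -= (remain < blocks) ? remain : blocks;
+ *    }
+ *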
+ */ + ffs_update(vp, 1); + } + vput(vp); + } + if (spare < 0) { + UFS_LOCK(ump); + fs->fs_pendingblocks += spare; + UFS_UNLOCK(ump); + } +#ifdef QUOTA + /* Handle spare. */ + if (spare) + quotaadj(freeblks->fb_quota, ump, -spare); + quotarele(freeblks->fb_quota); +#endif + ACQUIRE_LOCK(ump); + if (freeblks->fb_state & ONDEPLIST) { + inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, + 0, &inodedep); + TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); + freeblks->fb_state &= ~ONDEPLIST; + if (TAILQ_EMPTY(&inodedep->id_freeblklst)) + free_inodedep(inodedep); + } + /* + * All of the freeblock deps must be complete prior to this call + * so it's now safe to complete earlier outstanding journal entries. + */ + handle_jwork(&freeblks->fb_jwork); + WORKITEM_FREE(freeblks, D_FREEBLKS); + FREE_LOCK(ump); + return (0); +} + +/* + * Release blocks associated with the freeblks and stored in the indirect + * block dbn. If level is greater than SINGLE, the block is an indirect block + * and recursive calls to indirtrunc must be used to cleanse other indirect + * blocks. + * + * This handles partial and complete truncation of blocks. Partial is noted + * with goingaway == 0. In this case the freework is completed after the + * zero'd indirects are written to disk. For full truncation the freework + * is completed after the block is freed. + */ +static void +indir_trunc(freework, dbn, lbn) + struct freework *freework; + ufs2_daddr_t dbn; + ufs_lbn_t lbn; +{ + struct freework *nfreework; + struct workhead wkhd; + struct freeblks *freeblks; + struct buf *bp; + struct fs *fs; + struct indirdep *indirdep; + struct ufsmount *ump; + ufs1_daddr_t *bap1; + ufs2_daddr_t nb, nnb, *bap2; + ufs_lbn_t lbnadd, nlbn; + int i, nblocks, ufs1fmt; + int freedblocks; + int goingaway; + int freedeps; + int needj; + int level; + int cnt; + + freeblks = freework->fw_freeblks; + ump = VFSTOUFS(freeblks->fb_list.wk_mp); + fs = ump->um_fs; + /* + * Get buffer of block pointers to be freed. There are three cases: + * + * 1) Partial truncate caches the indirdep pointer in the freework + * which provides us a back copy to the save bp which holds the + * pointers we want to clear. When this completes the zero + * pointers are written to the real copy. + * 2) The indirect is being completely truncated, cancel_indirdep() + * eliminated the real copy and placed the indirdep on the saved + * copy. The indirdep and buf are discarded when this completes. + * 3) The indirect was not in memory, we read a copy off of the disk + * using the devvp and drop and invalidate the buffer when we're + * done. + */ + goingaway = 1; + indirdep = NULL; + if (freework->fw_indir != NULL) { + goingaway = 0; + indirdep = freework->fw_indir; + bp = indirdep->ir_savebp; + if (bp == NULL || bp->b_blkno != dbn) + panic("indir_trunc: Bad saved buf %p blkno %jd", + bp, (intmax_t)dbn); + } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { + /* + * The lock prevents the buf dep list from changing and + * indirects on devvp should only ever have one dependency. + */ + indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); + if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) + panic("indir_trunc: Bad indirdep %p from buf %p", + indirdep, bp); + } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, + NOCRED, &bp) != 0) { + brelse(bp); + return; + } + ACQUIRE_LOCK(ump); + /* Protects against a race with complete_trunc_indir(). 
*/ + freework->fw_state &= ~INPROGRESS; + /* + * If we have an indirdep we need to enforce the truncation order + * and discard it when it is complete. + */ + if (indirdep) { + if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && + !TAILQ_EMPTY(&indirdep->ir_trunc)) { + /* + * Add the complete truncate to the list on the + * indirdep to enforce in-order processing. + */ + if (freework->fw_indir == NULL) + TAILQ_INSERT_TAIL(&indirdep->ir_trunc, + freework, fw_next); + FREE_LOCK(ump); + return; + } + /* + * If we're goingaway, free the indirdep. Otherwise it will + * linger until the write completes. + */ + if (goingaway) + free_indirdep(indirdep); + } + FREE_LOCK(ump); + /* Initialize pointers depending on block size. */ + if (ump->um_fstype == UFS1) { + bap1 = (ufs1_daddr_t *)bp->b_data; + nb = bap1[freework->fw_off]; + ufs1fmt = 1; + bap2 = NULL; + } else { + bap2 = (ufs2_daddr_t *)bp->b_data; + nb = bap2[freework->fw_off]; + ufs1fmt = 0; + bap1 = NULL; + } + level = lbn_level(lbn); + needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; + lbnadd = lbn_offset(fs, level); + nblocks = btodb(fs->fs_bsize); + nfreework = freework; + freedeps = 0; + cnt = 0; + /* + * Reclaim blocks. Traverses into nested indirect levels and + * arranges for the current level to be freed when subordinates + * are free when journaling. + */ + for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { + if (i != NINDIR(fs) - 1) { + if (ufs1fmt) + nnb = bap1[i+1]; + else + nnb = bap2[i+1]; + } else + nnb = 0; + if (nb == 0) + continue; + cnt++; + if (level != 0) { + nlbn = (lbn + 1) - (i * lbnadd); + if (needj != 0) { + nfreework = newfreework(ump, freeblks, freework, + nlbn, nb, fs->fs_frag, 0, 0); + freedeps++; + } + indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); + } else { + struct freedep *freedep; + + /* + * Attempt to aggregate freedep dependencies for + * all blocks being released to the same CG. + */ + LIST_INIT(&wkhd); + if (needj != 0 && + (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { + freedep = newfreedep(freework); + WORKLIST_INSERT_UNLOCKED(&wkhd, + &freedep->fd_list); + freedeps++; + } + CTR3(KTR_SUJ, + "indir_trunc: ino %d blkno %jd size %ld", + freeblks->fb_inum, nb, fs->fs_bsize); + ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, + fs->fs_bsize, freeblks->fb_inum, + freeblks->fb_vtype, &wkhd); + } + } + if (goingaway) { + bp->b_flags |= B_INVAL | B_NOCACHE; + brelse(bp); + } + freedblocks = 0; + if (level == 0) + freedblocks = (nblocks * cnt); + if (needj == 0) + freedblocks += nblocks; + freeblks_free(ump, freeblks, freedblocks); + /* + * If we are journaling set up the ref counts and offset so this + * indirect can be completed when its children are free. + */ + if (needj) { + ACQUIRE_LOCK(ump); + freework->fw_off = i; + freework->fw_ref += freedeps; + freework->fw_ref -= NINDIR(fs) + 1; + if (level == 0) + freeblks->fb_cgwait += freedeps; + if (freework->fw_ref == 0) + freework_freeblock(freework); + FREE_LOCK(ump); + return; + } + /* + * If we're not journaling we can free the indirect now. + */ + dbn = dbtofsb(fs, dbn); + CTR3(KTR_SUJ, + "indir_trunc 2: ino %d blkno %jd size %ld", + freeblks->fb_inum, dbn, fs->fs_bsize); + ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, + freeblks->fb_inum, freeblks->fb_vtype, NULL); + /* Non SUJ softdep does single-threaded truncations. 
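+ *
+ * (Illustrative sketch, not from the original source; simplified,
+ * hypothetical names, and none of the journaling or freedep aggregation
+ * done above.)  Stripped to its core, the traversal in indir_trunc() walks
+ * an array of block pointers, recursing one level down for each non-zero
+ * entry until it reaches data blocks, and frees the indirect itself last:
+ *
+ *    #include <stdint.h>
+ *
+ *    static void
+ *    walk_indir(int64_t blkno, int level, int nptr,
+ *        const int64_t *ptrs,
+ *        const int64_t *(*readblk)(int64_t),
+ *        void (*release)(int64_t))
+ *    {
+ *        int i;
+ *
+ *        for (i = 0; i < nptr; i++) {
+ *            if (ptrs[i] == 0)
+ *                continue;
+ *            if (level > 0)
+ *                walk_indir(ptrs[i], level - 1, nptr,
+ *                    readblk(ptrs[i]), readblk, release);
+ *            else
+ *                release(ptrs[i]);
+ *        }
+ *        release(blkno);
+ *    }
+ *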
*/ + if (freework->fw_blkno == dbn) { + freework->fw_state |= ALLCOMPLETE; + ACQUIRE_LOCK(ump); + handle_written_freework(freework); + FREE_LOCK(ump); + } + return; +} + +/* + * Cancel an allocindir when it is removed via truncation. When bp is not + * NULL the indirect never appeared on disk and is scheduled to be freed + * independently of the indir so we can more easily track journal work. + */ +static void +cancel_allocindir(aip, bp, freeblks, trunc) + struct allocindir *aip; + struct buf *bp; + struct freeblks *freeblks; + int trunc; +{ + struct indirdep *indirdep; + struct freefrag *freefrag; + struct newblk *newblk; + + newblk = (struct newblk *)aip; + LIST_REMOVE(aip, ai_next); + /* + * We must eliminate the pointer in bp if it must be freed on its + * own due to partial truncate or pending journal work. + */ + if (bp && (trunc || newblk->nb_jnewblk)) { + /* + * Clear the pointer and mark the aip to be freed + * directly if it never existed on disk. + */ + aip->ai_state |= DELAYEDFREE; + indirdep = aip->ai_indirdep; + if (indirdep->ir_state & UFS1FMT) + ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; + else + ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; + } + /* + * When truncating the previous pointer will be freed via + * savedbp. Eliminate the freefrag which would dup free. + */ + if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { + newblk->nb_freefrag = NULL; + if (freefrag->ff_jdep) + cancel_jfreefrag( + WK_JFREEFRAG(freefrag->ff_jdep)); + jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); + WORKITEM_FREE(freefrag, D_FREEFRAG); + } + /* + * If the journal hasn't been written the jnewblk must be passed + * to the call to ffs_blkfree that reclaims the space. We accomplish + * this by leaving the journal dependency on the newblk to be freed + * when a freework is created in handle_workitem_freeblocks(). + */ + cancel_newblk(newblk, NULL, &freeblks->fb_jwork); + WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); +} + +/* + * Create the mkdir dependencies for . and .. in a new directory. Link them + * in to a newdirblk so any subsequent additions are tracked properly. The + * caller is responsible for adding the mkdir1 dependency to the journal + * and updating id_mkdiradd. This function returns with the per-filesystem + * lock held. + */ +static struct mkdir * +setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) + struct diradd *dap; + ino_t newinum; + ino_t dinum; + struct buf *newdirbp; + struct mkdir **mkdirp; +{ + struct newblk *newblk; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct newdirblk *newdirblk; + struct mkdir *mkdir1, *mkdir2; + struct worklist *wk; + struct jaddref *jaddref; + struct ufsmount *ump; + struct mount *mp; + + mp = dap->da_list.wk_mp; + ump = VFSTOUFS(mp); + newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, + M_SOFTDEP_FLAGS); + workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); + mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); + mkdir1->md_state = ATTACHED | MKDIR_BODY; + mkdir1->md_diradd = dap; + mkdir1->md_jaddref = NULL; + mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); + workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); + mkdir2->md_state = ATTACHED | MKDIR_PARENT; + mkdir2->md_diradd = dap; + mkdir2->md_jaddref = NULL; + if (MOUNTEDSUJ(mp) == 0) { + mkdir1->md_state |= DEPCOMPLETE; + mkdir2->md_state |= DEPCOMPLETE; + } + /* + * Dependency on "." and ".." being written to disk. 
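+ *
+ * (Illustrative sketch, not from the original source; hypothetical names.)
+ * Clearing a block pointer in a buffer, as cancel_allocindir() does above,
+ * has to honor the on-disk pointer width, which differs between UFS1
+ * (32-bit) and UFS2 (64-bit):
+ *
+ *    #include <stdint.h>
+ *
+ *    static void
+ *    clear_slot(void *data, int offset, int is_ufs1)
+ *    {
+ *        if (is_ufs1)
+ *            ((uint32_t *)data)[offset] = 0;
+ *        else
+ *            ((uint64_t *)data)[offset] = 0;
+ *    }
+ *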
+ */ + mkdir1->md_buf = newdirbp; + ACQUIRE_LOCK(VFSTOUFS(mp)); + LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); + /* + * We must link the pagedep, allocdirect, and newdirblk for + * the initial file page so the pointer to the new directory + * is not written until the directory contents are live and + * any subsequent additions are not marked live until the + * block is reachable via the inode. + */ + if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) + panic("setup_newdir: lost pagedep"); + LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) + if (wk->wk_type == D_ALLOCDIRECT) + break; + if (wk == NULL) + panic("setup_newdir: lost allocdirect"); + if (pagedep->pd_state & NEWBLOCK) + panic("setup_newdir: NEWBLOCK already set"); + newblk = WK_NEWBLK(wk); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; + newdirblk->db_pagedep = pagedep; + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); + WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); + /* + * Look up the inodedep for the parent directory so that we + * can link mkdir2 into the pending dotdot jaddref or + * the inode write if there is none. If the inode is + * ALLCOMPLETE and no jaddref is present all dependencies have + * been satisfied and mkdir2 can be freed. + */ + inodedep_lookup(mp, dinum, 0, &inodedep); + if (MOUNTEDSUJ(mp)) { + if (inodedep == NULL) + panic("setup_newdir: Lost parent."); + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && + (jaddref->ja_state & MKDIR_PARENT), + ("setup_newdir: bad dotdot jaddref %p", jaddref)); + LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); + mkdir2->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir2; + } else if (inodedep == NULL || + (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state &= ~MKDIR_PARENT; + WORKITEM_FREE(mkdir2, D_MKDIR); + mkdir2 = NULL; + } else { + LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); + WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); + } + *mkdirp = mkdir2; + + return (mkdir1); +} + +/* + * Directory entry addition dependencies. + * + * When adding a new directory entry, the inode (with its incremented link + * count) must be written to disk before the directory entry's pointer to it. + * Also, if the inode is newly allocated, the corresponding freemap must be + * updated (on disk) before the directory entry's pointer. These requirements + * are met via undo/redo on the directory entry's pointer, which consists + * simply of the inode number. + * + * As directory entries are added and deleted, the free space within a + * directory block can become fragmented. The ufs filesystem will compact + * a fragmented directory block to make space for a new entry. When this + * occurs, the offsets of previously added entries change. Any "diradd" + * dependency structures corresponding to these entries must be updated with + * the new offsets. + */ + +/* + * This routine is called after the in-memory inode's link + * count has been incremented, but before the directory entry's + * pointer to the inode has been set. 
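+ *
+ * (Illustrative sketch, not from the original source; hypothetical,
+ * simplified types.)  The undo/redo described above acts on nothing more
+ * than the entry's inode number: roll it back to zero before the directory
+ * block is written, roll it forward again once the dependencies are safe.
+ *
+ *    #include <stdint.h>
+ *
+ *    struct xdirent { uint32_t d_ino; };
+ *
+ *    // Before the block goes to disk: hide the uncommitted entry.
+ *    static uint32_t
+ *    roll_back(struct xdirent *de)
+ *    {
+ *        uint32_t saved = de->d_ino;
+ *
+ *        de->d_ino = 0;
+ *        return (saved);
+ *    }
+ *
+ *    // Once the inode and freemap dependencies are on disk: restore it.
+ *    static void
+ *    roll_forward(struct xdirent *de, uint32_t saved)
+ *    {
+ *        de->d_ino = saved;
+ *    }
+ *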
+ */ +int +softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for directory */ + off_t diroffset; /* offset of new entry in directory */ + ino_t newinum; /* inode referenced by new directory entry */ + struct buf *newdirbp; /* non-NULL => contents of new mkdir */ + int isnewblk; /* entry is in a newly allocated block */ +{ + int offset; /* offset of new entry within directory block */ + ufs_lbn_t lbn; /* block in directory containing new entry */ + struct fs *fs; + struct diradd *dap; + struct newblk *newblk; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct newdirblk *newdirblk; + struct mkdir *mkdir1, *mkdir2; + struct jaddref *jaddref; + struct ufsmount *ump; + struct mount *mp; + int isindir; + + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_directory_add called on non-softdep filesystem")); + /* + * Whiteouts have no dependencies. + */ + if (newinum == WINO) { + if (newdirbp != NULL) + bdwrite(newdirbp); + return (0); + } + jaddref = NULL; + mkdir1 = mkdir2 = NULL; + fs = ump->um_fs; + lbn = lblkno(fs, diroffset); + offset = blkoff(fs, diroffset); + dap = malloc(sizeof(struct diradd), M_DIRADD, + M_SOFTDEP_FLAGS|M_ZERO); + workitem_alloc(&dap->da_list, D_DIRADD, mp); + dap->da_offset = offset; + dap->da_newinum = newinum; + dap->da_state = ATTACHED; + LIST_INIT(&dap->da_jwork); + isindir = bp->b_lblkno >= NDADDR; + newdirblk = NULL; + if (isnewblk && + (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { + newdirblk = malloc(sizeof(struct newdirblk), + M_NEWDIRBLK, M_SOFTDEP_FLAGS); + workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); + LIST_INIT(&newdirblk->db_mkdir); + } + /* + * If we're creating a new directory setup the dependencies and set + * the dap state to wait for them. Otherwise it's COMPLETE and + * we can move on. + */ + if (newdirbp == NULL) { + dap->da_state |= DEPCOMPLETE; + ACQUIRE_LOCK(ump); + } else { + dap->da_state |= MKDIR_BODY | MKDIR_PARENT; + mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, + &mkdir2); + } + /* + * Link into parent directory pagedep to await its being written. + */ + pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); +#ifdef DEBUG + if (diradd_lookup(pagedep, offset) != NULL) + panic("softdep_setup_directory_add: %p already at off %d\n", + diradd_lookup(pagedep, offset), offset); +#endif + dap->da_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, + da_pdlist); + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); + /* + * If we're journaling, link the diradd into the jaddref so it + * may be completed after the journal entry is written. Otherwise, + * link the diradd into its inodedep. If the inode is not yet + * written place it on the bufwait list, otherwise do the post-inode + * write processing to put it on the id_pendinghd list. + */ + if (MOUNTEDSUJ(mp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_add: bad jaddref %p", jaddref)); + jaddref->ja_diroff = diroffset; + jaddref->ja_diradd = dap; + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) + diradd_inode_written(dap, inodedep); + else + WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); + /* + * Add the journal entries for . and .. 
links now that the primary + * link is written. + */ + if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { + jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, + inoreflst, if_deps); + KASSERT(jaddref != NULL && + jaddref->ja_ino == jaddref->ja_parent && + (jaddref->ja_state & MKDIR_BODY), + ("softdep_setup_directory_add: bad dot jaddref %p", + jaddref)); + mkdir1->md_jaddref = jaddref; + jaddref->ja_mkdir = mkdir1; + /* + * It is important that the dotdot journal entry + * is added prior to the dot entry since dot writes + * both the dot and dotdot links. These both must + * be added after the primary link for the journal + * to remain consistent. + */ + add_to_journal(&mkdir2->md_jaddref->ja_list); + add_to_journal(&jaddref->ja_list); + } + /* + * If we are adding a new directory remember this diradd so that if + * we rename it we can keep the dot and dotdot dependencies. If + * we are adding a new name for an inode that has a mkdiradd we + * must be in rename and we have to move the dot and dotdot + * dependencies to this new name. The old name is being orphaned + * soon. + */ + if (mkdir1 != NULL) { + if (inodedep->id_mkdiradd != NULL) + panic("softdep_setup_directory_add: Existing mkdir"); + inodedep->id_mkdiradd = dap; + } else if (inodedep->id_mkdiradd) + merge_diradd(inodedep, dap); + if (newdirblk != NULL) { + /* + * There is nothing to do if we are already tracking + * this block. + */ + if ((pagedep->pd_state & NEWBLOCK) != 0) { + WORKITEM_FREE(newdirblk, D_NEWDIRBLK); + FREE_LOCK(ump); + return (0); + } + if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) + == 0) + panic("softdep_setup_directory_add: lost entry"); + WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); + pagedep->pd_state |= NEWBLOCK; + pagedep->pd_newdirblk = newdirblk; + newdirblk->db_pagedep = pagedep; + FREE_LOCK(ump); + /* + * If we extended into an indirect signal direnter to sync. + */ + if (isindir) + return (1); + return (0); + } + FREE_LOCK(ump); + return (0); +} + +/* + * This procedure is called to change the offset of a directory + * entry when compacting a directory block which must be owned + * exclusively by the caller. Note that the actual entry movement + * must be done in this procedure to ensure that no I/O completions + * occur while the move is in progress. + */ +void +softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) + struct buf *bp; /* Buffer holding directory block. */ + struct inode *dp; /* inode for directory */ + caddr_t base; /* address of dp->i_offset */ + caddr_t oldloc; /* address of old directory location */ + caddr_t newloc; /* address of new directory location */ + int entrysize; /* size of directory entry */ +{ + int offset, oldoffset, newoffset; + struct pagedep *pagedep; + struct jmvref *jmvref; + struct diradd *dap; + struct direct *de; + struct mount *mp; + struct ufsmount *ump; + ufs_lbn_t lbn; + int flags; + + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_change_directoryentry_offset called on " + "non-softdep filesystem")); + de = (struct direct *)oldloc; + jmvref = NULL; + flags = 0; + /* + * Moves are always journaled as it would be too complex to + * determine if any affected adds or removes are present in the + * journal. 
+ */ + if (MOUNTEDSUJ(mp)) { + flags = DEPALLOC; + jmvref = newjmvref(dp, de->d_ino, + dp->i_offset + (oldloc - base), + dp->i_offset + (newloc - base)); + } + lbn = lblkno(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, dp->i_offset); + oldoffset = offset + (oldloc - base); + newoffset = offset + (newloc - base); + ACQUIRE_LOCK(ump); + if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) + goto done; + dap = diradd_lookup(pagedep, oldoffset); + if (dap) { + dap->da_offset = newoffset; + newoffset = DIRADDHASH(newoffset); + oldoffset = DIRADDHASH(oldoffset); + if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && + newoffset != oldoffset) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], + dap, da_pdlist); + } + } +done: + if (jmvref) { + jmvref->jm_pagedep = pagedep; + LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); + add_to_journal(&jmvref->jm_list); + } + bcopy(oldloc, newloc, entrysize); + FREE_LOCK(ump); +} + +/* + * Move the mkdir dependencies and journal work from one diradd to another + * when renaming a directory. The new name must depend on the mkdir deps + * completing as the old name did. Directories can only have one valid link + * at a time so one must be canonical. + */ +static void +merge_diradd(inodedep, newdap) + struct inodedep *inodedep; + struct diradd *newdap; +{ + struct diradd *olddap; + struct mkdir *mkdir, *nextmd; + struct ufsmount *ump; + short state; + + olddap = inodedep->id_mkdiradd; + inodedep->id_mkdiradd = newdap; + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + newdap->da_state &= ~DEPCOMPLETE; + ump = VFSTOUFS(inodedep->id_list.wk_mp); + for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; + mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != olddap) + continue; + mkdir->md_diradd = newdap; + state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); + newdap->da_state |= state; + olddap->da_state &= ~state; + if ((olddap->da_state & + (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("merge_diradd: unfound ref"); + } + /* + * Any mkdir related journal items are not safe to be freed until + * the new name is stable. + */ + jwork_move(&newdap->da_jwork, &olddap->da_jwork); + olddap->da_state |= DEPCOMPLETE; + complete_diradd(olddap); +} + +/* + * Move the diradd to the pending list when all diradd dependencies are + * complete. + */ +static void +complete_diradd(dap) + struct diradd *dap; +{ + struct pagedep *pagedep; + + if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + } +} + +/* + * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal + * add entries and conditonally journal the remove. + */ +static void +cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) + struct diradd *dap; + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct inoref *inoref; + struct ufsmount *ump; + struct mkdir *mkdir; + + /* + * If no remove references were allocated we're on a non-journaled + * filesystem and can skip the cancel step. 
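+ *
+ * (Illustrative sketch, not from the original source; hypothetical names
+ * and a toy hash.)  The re-filing that
+ * softdep_change_directoryentry_offset() performs above is the usual
+ * "key changed, move the item to its new hash bucket" step, so later
+ * lookups by the new offset still find the entry:
+ *
+ *    #include <sys/queue.h>
+ *
+ *    #define NBUCKET     8
+ *    #define BUCKET(off) ((off) & (NBUCKET - 1))
+ *
+ *    struct xent {
+ *        int off;
+ *        LIST_ENTRY(xent) link;
+ *    };
+ *    LIST_HEAD(xbucket, xent);
+ *
+ *    static void
+ *    rekey(struct xbucket tab[NBUCKET], struct xent *e, int newoff)
+ *    {
+ *        if (BUCKET(newoff) != BUCKET(e->off)) {
+ *            LIST_REMOVE(e, link);
+ *            LIST_INSERT_HEAD(&tab[BUCKET(newoff)], e, link);
+ *        }
+ *        e->off = newoff;
+ *    }
+ *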
+ */ + if (jremref == NULL) { + free_diradd(dap, NULL); + return; + } + /* + * Cancel the primary name an free it if it does not require + * journaling. + */ + if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, + 0, &inodedep) != 0) { + /* Abort the addref that reference this diradd. */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if (inoref->if_list.wk_type != D_JADDREF) + continue; + jaddref = (struct jaddref *)inoref; + if (jaddref->ja_diradd != dap) + continue; + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(jremref); + jremref = NULL; + } + break; + } + } + /* + * Cancel subordinate names and free them if they do not require + * journaling. + */ + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + ump = VFSTOUFS(dap->da_list.wk_mp); + LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { + if (mkdir->md_diradd != dap) + continue; + if ((jaddref = mkdir->md_jaddref) == NULL) + continue; + mkdir->md_jaddref = NULL; + if (mkdir->md_state & MKDIR_PARENT) { + if (cancel_jaddref(jaddref, NULL, + &dirrem->dm_jwork) == 0) { + free_jremref(dotdotremref); + dotdotremref = NULL; + } + } else { + if (cancel_jaddref(jaddref, inodedep, + &dirrem->dm_jwork) == 0) { + free_jremref(dotremref); + dotremref = NULL; + } + } + } + } + + if (jremref) + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); + jwork_move(&dirrem->dm_jwork, &dap->da_jwork); + free_diradd(dap, &dirrem->dm_jwork); +} + +/* + * Free a diradd dependency structure. This routine must be called + * with splbio interrupts blocked. + */ +static void +free_diradd(dap, wkhd) + struct diradd *dap; + struct workhead *wkhd; +{ + struct dirrem *dirrem; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct mkdir *mkdir, *nextmd; + struct ufsmount *ump; + + ump = VFSTOUFS(dap->da_list.wk_mp); + LOCK_OWNED(ump); + LIST_REMOVE(dap, da_pdlist); + if (dap->da_state & ONWORKLIST) + WORKLIST_REMOVE(&dap->da_list); + if ((dap->da_state & DIRCHG) == 0) { + pagedep = dap->da_pagedep; + } else { + dirrem = dap->da_previous; + pagedep = dirrem->dm_pagedep; + dirrem->dm_dirinum = pagedep->pd_ino; + dirrem->dm_state |= COMPLETE; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); + } + if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, + 0, &inodedep) != 0) + if (inodedep->id_mkdiradd == dap) + inodedep->id_mkdiradd = NULL; + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { + for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; + mkdir = nextmd) { + nextmd = LIST_NEXT(mkdir, md_mkdirs); + if (mkdir->md_diradd != dap) + continue; + dap->da_state &= + ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + LIST_REMOVE(mkdir, md_mkdirs); + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + if (mkdir->md_jaddref != NULL) + panic("free_diradd: Unexpected jaddref"); + WORKITEM_FREE(mkdir, D_MKDIR); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) + break; + } + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) + panic("free_diradd: unfound ref"); + } + if (inodedep) + free_inodedep(inodedep); + /* + * Free any journal segments waiting for the directory write. + */ + handle_jwork(&dap->da_jwork); + WORKITEM_FREE(dap, D_DIRADD); +} + +/* + * Directory entry removal dependencies. 
+ * + * When removing a directory entry, the entry's inode pointer must be + * zero'ed on disk before the corresponding inode's link count is decremented + * (possibly freeing the inode for re-use). This dependency is handled by + * updating the directory entry but delaying the inode count reduction until + * after the directory block has been written to disk. After this point, the + * inode count can be decremented whenever it is convenient. + */ + +/* + * This routine should be called immediately after removing + * a directory entry. The inode's link count should not be + * decremented by the calling procedure -- the soft updates + * code will do this task when it is safe. + */ +void +softdep_setup_remove(bp, dp, ip, isrmdir) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + int isrmdir; /* indicates if doing RMDIR */ +{ + struct dirrem *dirrem, *prevdirrem; + struct inodedep *inodedep; + struct ufsmount *ump; + int direct; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_setup_remove called on non-softdep filesystem")); + /* + * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want + * newdirrem() to setup the full directory remove which requires + * isrmdir > 1. + */ + dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. + */ + if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) + panic("softdep_setup_remove: Lost inodedep."); + KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + + /* + * If the COMPLETE flag is clear, then there were no active + * entries and we want to roll back to a zeroed entry until + * the new inode is committed to disk. If the COMPLETE flag is + * set then we have deleted an entry that never made it to + * disk. If the entry we deleted resulted from a name change, + * then the old name still resides on disk. We cannot delete + * its inode (returned to us in prevdirrem) until the zeroed + * directory entry gets to disk. The new inode has never been + * referenced on the disk, so can be deleted immediately. + */ + if ((dirrem->dm_state & COMPLETE) == 0) { + LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, + dm_next); + FREE_LOCK(ump); + } else { + if (prevdirrem != NULL) + LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, + prevdirrem, dm_next); + dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; + direct = LIST_EMPTY(&dirrem->dm_jremrefhd); + FREE_LOCK(ump); + if (direct) + handle_workitem_remove(dirrem, 0); + } +} + +/* + * Check for an entry matching 'offset' on both the pd_dirraddhd list and the + * pd_pendinghd list of a pagedep. + */ +static struct diradd * +diradd_lookup(pagedep, offset) + struct pagedep *pagedep; + int offset; +{ + struct diradd *dap; + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) + if (dap->da_offset == offset) + return (dap); + LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) + if (dap->da_offset == offset) + return (dap); + return (NULL); +} + +/* + * Search for a .. diradd dependency in a directory that is being removed. + * If the directory was renamed to a new parent we have a diradd rather + * than a mkdir for the .. entry. We need to cancel it now before + * it is found in truncate(). 
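+ *
+ * (Illustrative sketch, not from the original source; hypothetical,
+ * simplified names.)  A lookup such as diradd_lookup() above has to check
+ * both places a tracked entry can live: its hash bucket while dependencies
+ * are pending, and the pending list once it is ready to be committed.
+ *
+ *    #include <stddef.h>
+ *    #include <sys/queue.h>
+ *
+ *    struct xdad {
+ *        int off;
+ *        LIST_ENTRY(xdad) link;
+ *    };
+ *    LIST_HEAD(xdadlist, xdad);
+ *
+ *    static struct xdad *
+ *    lookup_by_offset(struct xdadlist *bucket, struct xdadlist *pending,
+ *        int off)
+ *    {
+ *        struct xdad *d;
+ *
+ *        LIST_FOREACH(d, bucket, link)
+ *            if (d->off == off)
+ *                return (d);
+ *        LIST_FOREACH(d, pending, link)
+ *            if (d->off == off)
+ *                return (d);
+ *        return (NULL);
+ *    }
+ *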
+ */ +static struct jremref * +cancel_diradd_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct pagedep *pagedep; + struct diradd *dap; + struct worklist *wk; + + if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) + return (jremref); + dap = diradd_lookup(pagedep, DOTDOT_OFFSET); + if (dap == NULL) + return (jremref); + cancel_diradd(dap, dirrem, jremref, NULL, NULL); + /* + * Mark any journal work as belonging to the parent so it is freed + * with the .. reference. + */ + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + wk->wk_state |= MKDIR_PARENT; + return (NULL); +} + +/* + * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to + * replace it with a dirrem/diradd pair as a result of re-parenting a + * directory. This ensures that we don't simultaneously have a mkdir and + * a diradd for the same .. entry. + */ +static struct jremref * +cancel_mkdir_dotdot(ip, dirrem, jremref) + struct inode *ip; + struct dirrem *dirrem; + struct jremref *jremref; +{ + struct inodedep *inodedep; + struct jaddref *jaddref; + struct ufsmount *ump; + struct mkdir *mkdir; + struct diradd *dap; + struct mount *mp; + + mp = ITOVFS(ip); + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + return (jremref); + dap = inodedep->id_mkdiradd; + if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) + return (jremref); + ump = VFSTOUFS(inodedep->id_list.wk_mp); + for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; + mkdir = LIST_NEXT(mkdir, md_mkdirs)) + if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) + break; + if (mkdir == NULL) + panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); + if ((jaddref = mkdir->md_jaddref) != NULL) { + mkdir->md_jaddref = NULL; + jaddref->ja_state &= ~MKDIR_PARENT; + if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) + panic("cancel_mkdir_dotdot: Lost parent inodedep"); + if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { + journal_jremref(dirrem, jremref, inodedep); + jremref = NULL; + } + } + if (mkdir->md_state & ONWORKLIST) + WORKLIST_REMOVE(&mkdir->md_list); + mkdir->md_state |= ALLCOMPLETE; + complete_mkdir(mkdir); + return (jremref); +} + +static void +journal_jremref(dirrem, jremref, inodedep) + struct dirrem *dirrem; + struct jremref *jremref; + struct inodedep *inodedep; +{ + + if (inodedep == NULL) + if (inodedep_lookup(jremref->jr_list.wk_mp, + jremref->jr_ref.if_ino, 0, &inodedep) == 0) + panic("journal_jremref: Lost inodedep"); + LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); + TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); + add_to_journal(&jremref->jr_list); +} + +static void +dirrem_journal(dirrem, jremref, dotremref, dotdotremref) + struct dirrem *dirrem; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; +{ + struct inodedep *inodedep; + + + if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, + &inodedep) == 0) + panic("dirrem_journal: Lost inodedep"); + journal_jremref(dirrem, jremref, inodedep); + if (dotremref) + journal_jremref(dirrem, dotremref, inodedep); + if (dotdotremref) + journal_jremref(dirrem, dotdotremref, NULL); +} + +/* + * Allocate a new dirrem if appropriate and return it along with + * its associated pagedep. Called without a lock, returns with lock. 
+ */ +static struct dirrem * +newdirrem(bp, dp, ip, isrmdir, prevdirremp) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + int isrmdir; /* indicates if doing RMDIR */ + struct dirrem **prevdirremp; /* previously referenced inode, if any */ +{ + int offset; + ufs_lbn_t lbn; + struct diradd *dap; + struct dirrem *dirrem; + struct pagedep *pagedep; + struct jremref *jremref; + struct jremref *dotremref; + struct jremref *dotdotremref; + struct vnode *dvp; + struct ufsmount *ump; + + /* + * Whiteouts have no deletion dependencies. + */ + if (ip == NULL) + panic("newdirrem: whiteout"); + dvp = ITOV(dp); + ump = ITOUMP(dp); + + /* + * If the system is over its limit and our filesystem is + * responsible for more than our share of that usage and + * we are not a snapshot, request some inodedep cleanup. + * Limiting the number of dirrem structures will also limit + * the number of freefile and freeblks structures. + */ + ACQUIRE_LOCK(ump); + if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) + schedule_cleanup(UFSTOVFS(ump)); + else + FREE_LOCK(ump); + dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | + M_ZERO); + workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); + LIST_INIT(&dirrem->dm_jremrefhd); + LIST_INIT(&dirrem->dm_jwork); + dirrem->dm_state = isrmdir ? RMDIR : 0; + dirrem->dm_oldinum = ip->i_number; + *prevdirremp = NULL; + /* + * Allocate remove reference structures to track journal write + * dependencies. We will always have one for the link and + * when doing directories we will always have one more for dot. + * When renaming a directory we skip the dotdot link change so + * this is not needed. + */ + jremref = dotremref = dotdotremref = NULL; + if (DOINGSUJ(dvp)) { + if (isrmdir) { + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 2); + dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, + ip->i_effnlink + 1); + dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, + dp->i_effnlink + 1); + dotdotremref->jr_state |= MKDIR_PARENT; + } else + jremref = newjremref(dirrem, dp, ip, dp->i_offset, + ip->i_effnlink + 1); + } + ACQUIRE_LOCK(ump); + lbn = lblkno(ump->um_fs, dp->i_offset); + offset = blkoff(ump->um_fs, dp->i_offset); + pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, + &pagedep); + dirrem->dm_pagedep = pagedep; + dirrem->dm_offset = offset; + /* + * If we're renaming a .. link to a new directory, cancel any + * existing MKDIR_PARENT mkdir. If it has already been canceled + * the jremref is preserved for any potential diradd in this + * location. This can not coincide with a rmdir. + */ + if (dp->i_offset == DOTDOT_OFFSET) { + if (isrmdir) + panic("newdirrem: .. directory change during remove?"); + jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); + } + /* + * If we're removing a directory search for the .. dependency now and + * cancel it. Any pending journal work will be added to the dirrem + * to be completed when the workitem remove completes. + */ + if (isrmdir) + dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); + /* + * Check for a diradd dependency for the same directory entry. + * If present, then both dependencies become obsolete and can + * be de-allocated. + */ + dap = diradd_lookup(pagedep, offset); + if (dap == NULL) { + /* + * Link the jremref structures into the dirrem so they are + * written prior to the pagedep. 
+ */ + if (jremref) + dirrem_journal(dirrem, jremref, dotremref, + dotdotremref); + return (dirrem); + } + /* + * Must be ATTACHED at this point. + */ + if ((dap->da_state & ATTACHED) == 0) + panic("newdirrem: not ATTACHED"); + if (dap->da_newinum != ip->i_number) + panic("newdirrem: inum %ju should be %ju", + (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); + /* + * If we are deleting a changed name that never made it to disk, + * then return the dirrem describing the previous inode (which + * represents the inode currently referenced from this entry on disk). + */ + if ((dap->da_state & DIRCHG) != 0) { + *prevdirremp = dap->da_previous; + dap->da_state &= ~DIRCHG; + dap->da_pagedep = pagedep; + } + /* + * We are deleting an entry that never made it to disk. + * Mark it COMPLETE so we can delete its inode immediately. + */ + dirrem->dm_state |= COMPLETE; + cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); +#ifdef SUJ_DEBUG + if (isrmdir == 0) { + struct worklist *wk; + + LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) + if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) + panic("bad wk %p (0x%X)\n", wk, wk->wk_state); + } +#endif + + return (dirrem); +} + +/* + * Directory entry change dependencies. + * + * Changing an existing directory entry requires that an add operation + * be completed first followed by a deletion. The semantics for the addition + * are identical to the description of adding a new entry above except + * that the rollback is to the old inode number rather than zero. Once + * the addition dependency is completed, the removal is done as described + * in the removal routine above. + */ + +/* + * This routine should be called immediately after changing + * a directory entry. The inode's link count should not be + * decremented by the calling procedure -- the soft updates + * code will perform this task when it is safe. + */ +void +softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) + struct buf *bp; /* buffer containing directory block */ + struct inode *dp; /* inode for the directory being modified */ + struct inode *ip; /* inode for directory entry being removed */ + ino_t newinum; /* new inode number for changed entry */ + int isrmdir; /* indicates if doing RMDIR */ +{ + int offset; + struct diradd *dap = NULL; + struct dirrem *dirrem, *prevdirrem; + struct pagedep *pagedep; + struct inodedep *inodedep; + struct jaddref *jaddref; + struct mount *mp; + struct ufsmount *ump; + + mp = ITOVFS(dp); + ump = VFSTOUFS(mp); + offset = blkoff(ump->um_fs, dp->i_offset); + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_directory_change called on non-softdep filesystem")); + + /* + * Whiteouts do not need diradd dependencies. + */ + if (newinum != WINO) { + dap = malloc(sizeof(struct diradd), + M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); + workitem_alloc(&dap->da_list, D_DIRADD, mp); + dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; + dap->da_offset = offset; + dap->da_newinum = newinum; + LIST_INIT(&dap->da_jwork); + } + + /* + * Allocate a new dirrem and ACQUIRE_LOCK. + */ + dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); + pagedep = dirrem->dm_pagedep; + /* + * The possible values for isrmdir: + * 0 - non-directory file rename + * 1 - directory rename within same directory + * inum - directory rename to new directory of given inode number + * When renaming to a new directory, we are both deleting and + * creating a new directory entry, so the link count on the new + * directory should not change. 
Thus we do not need the followup + * dirrem which is usually done in handle_workitem_remove. We set + * the DIRCHG flag to tell handle_workitem_remove to skip the + * followup dirrem. + */ + if (isrmdir > 1) + dirrem->dm_state |= DIRCHG; + + /* + * Whiteouts have no additional dependencies, + * so just put the dirrem on the correct list. + */ + if (newinum == WINO) { + if ((dirrem->dm_state & COMPLETE) == 0) { + LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, + dm_next); + } else { + dirrem->dm_dirinum = pagedep->pd_ino; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); + } + FREE_LOCK(ump); + return; + } + /* + * Add the dirrem to the inodedep's pending remove list for quick + * discovery later. A valid nlinkdelta ensures that this lookup + * will not fail. + */ + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + panic("softdep_setup_directory_change: Lost inodedep."); + dirrem->dm_state |= ONDEPLIST; + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + + /* + * If the COMPLETE flag is clear, then there were no active + * entries and we want to roll back to the previous inode until + * the new inode is committed to disk. If the COMPLETE flag is + * set, then we have deleted an entry that never made it to disk. + * If the entry we deleted resulted from a name change, then the old + * inode reference still resides on disk. Any rollback that we do + * needs to be to that old inode (returned to us in prevdirrem). If + * the entry we deleted resulted from a create, then there is + * no entry on the disk, so we want to roll back to zero rather + * than the uncommitted inode. In either of the COMPLETE cases we + * want to immediately free the unwritten and unreferenced inode. + */ + if ((dirrem->dm_state & COMPLETE) == 0) { + dap->da_previous = dirrem; + } else { + if (prevdirrem != NULL) { + dap->da_previous = prevdirrem; + } else { + dap->da_state &= ~DIRCHG; + dap->da_pagedep = pagedep; + } + dirrem->dm_dirinum = pagedep->pd_ino; + if (LIST_EMPTY(&dirrem->dm_jremrefhd)) + add_to_worklist(&dirrem->dm_list, 0); + } + /* + * Lookup the jaddref for this journal entry. We must finish + * initializing it and make the diradd write dependent on it. + * If we're not journaling, put it on the id_bufwait list if the + * inode is not yet written. If it is written, do the post-inode + * write processing to put it on the id_pendinghd list. + */ + inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); + if (MOUNTEDSUJ(mp)) { + jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, + inoreflst); + KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, + ("softdep_setup_directory_change: bad jaddref %p", + jaddref)); + jaddref->ja_diroff = dp->i_offset; + jaddref->ja_diradd = dap; + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + add_to_journal(&jaddref->ja_list); + } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { + dap->da_state |= COMPLETE; + LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); + WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); + } else { + LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], + dap, da_pdlist); + WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); + } + /* + * If we're making a new name for a directory that has not been + * committed when need to move the dot and dotdot references to + * this new name. 
+ */ + if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) + merge_diradd(inodedep, dap); + FREE_LOCK(ump); +} + +/* + * Called whenever the link count on an inode is changed. + * It creates an inode dependency so that the new reference(s) + * to the inode cannot be committed to disk until the updated + * inode has been written. + */ +void +softdep_change_linkcnt(ip) + struct inode *ip; /* the inode with the increased link count */ +{ + struct inodedep *inodedep; + struct ufsmount *ump; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_change_linkcnt called on non-softdep filesystem")); + ACQUIRE_LOCK(ump); + inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); + if (ip->i_nlink < ip->i_effnlink) + panic("softdep_change_linkcnt: bad delta"); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + FREE_LOCK(ump); +} + +/* + * Attach a sbdep dependency to the superblock buf so that we can keep + * track of the head of the linked list of referenced but unlinked inodes. + */ +void +softdep_setup_sbupdate(ump, fs, bp) + struct ufsmount *ump; + struct fs *fs; + struct buf *bp; +{ + struct sbdep *sbdep; + struct worklist *wk; + + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_setup_sbupdate called on non-softdep filesystem")); + LIST_FOREACH(wk, &bp->b_dep, wk_list) + if (wk->wk_type == D_SBDEP) + break; + if (wk != NULL) + return; + sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); + workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); + sbdep->sb_fs = fs; + sbdep->sb_ump = ump; + ACQUIRE_LOCK(ump); + WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); + FREE_LOCK(ump); +} + +/* + * Return the first unlinked inodedep which is ready to be the head of the + * list. The inodedep and all those after it must have valid next pointers. + */ +static struct inodedep * +first_unlinked_inodedep(ump) + struct ufsmount *ump; +{ + struct inodedep *inodedep; + struct inodedep *idp; + + LOCK_OWNED(ump); + for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); + inodedep; inodedep = idp) { + if ((inodedep->id_state & UNLINKNEXT) == 0) + return (NULL); + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) + break; + if ((inodedep->id_state & UNLINKPREV) == 0) + break; + } + return (inodedep); +} + +/* + * Set the sujfree unlinked head pointer prior to writing a superblock. + */ +static void +initiate_write_sbdep(sbdep) + struct sbdep *sbdep; +{ + struct inodedep *inodedep; + struct fs *bpfs; + struct fs *fs; + + bpfs = sbdep->sb_fs; + fs = sbdep->sb_ump->um_fs; + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if (inodedep) { + fs->fs_sujfree = inodedep->id_ino; + inodedep->id_state |= UNLINKPREV; + } else + fs->fs_sujfree = 0; + bpfs->fs_sujfree = fs->fs_sujfree; +} + +/* + * After a superblock is written determine whether it must be written again + * due to a changing unlinked list head. + */ +static int +handle_written_sbdep(sbdep, bp) + struct sbdep *sbdep; + struct buf *bp; +{ + struct inodedep *inodedep; + struct fs *fs; + + LOCK_OWNED(sbdep->sb_ump); + fs = sbdep->sb_fs; + /* + * If the superblock doesn't match the in-memory list start over. 
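+ *
+ * [Editor's note, not part of the original source: the head of the
+ * unlinked list can change between the time this superblock write
+ * was initiated and now, e.g. when another inode has since become
+ * eligible to be the head.  In that case the fs_sujfree value that
+ * just went to disk is stale, so the buffer is re-dirtied and the
+ * sbdep is kept around for another pass.]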
+ */ + inodedep = first_unlinked_inodedep(sbdep->sb_ump); + if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || + (inodedep == NULL && fs->fs_sujfree != 0)) { + bdirty(bp); + return (1); + } + WORKITEM_FREE(sbdep, D_SBDEP); + if (fs->fs_sujfree == 0) + return (0); + /* + * Now that we have a record of this inode in stable store allow it + * to be written to free up pending work. Inodes may see a lot of + * write activity after they are unlinked which we must not hold up. + */ + for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { + if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) + panic("handle_written_sbdep: Bad inodedep %p (0x%X)", + inodedep, inodedep->id_state); + if (inodedep->id_state & UNLINKONLIST) + break; + inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; + } + + return (0); +} + +/* + * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. + */ +static void +unlinked_inodedep(mp, inodedep) + struct mount *mp; + struct inodedep *inodedep; +{ + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + if (MOUNTEDSUJ(mp) == 0) + return; + ump->um_fs->fs_fmod = 1; + if (inodedep->id_state & UNLINKED) + panic("unlinked_inodedep: %p already unlinked\n", inodedep); + inodedep->id_state |= UNLINKED; + TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); +} + +/* + * Remove an inodedep from the unlinked inodedep list. This may require + * disk writes if the inode has made it that far. + */ +static void +clear_unlinked_inodedep(inodedep) + struct inodedep *inodedep; +{ + struct ufsmount *ump; + struct inodedep *idp; + struct inodedep *idn; + struct fs *fs; + struct buf *bp; + ino_t ino; + ino_t nino; + ino_t pino; + int error; + + ump = VFSTOUFS(inodedep->id_list.wk_mp); + fs = ump->um_fs; + ino = inodedep->id_ino; + error = 0; + for (;;) { + LOCK_OWNED(ump); + KASSERT((inodedep->id_state & UNLINKED) != 0, + ("clear_unlinked_inodedep: inodedep %p not unlinked", + inodedep)); + /* + * If nothing has yet been written simply remove us from + * the in memory list and return. This is the most common + * case where handle_workitem_remove() loses the final + * reference. + */ + if ((inodedep->id_state & UNLINKLINKS) == 0) + break; + /* + * If we have a NEXT pointer and no PREV pointer we can simply + * clear NEXT's PREV and remove ourselves from the list. Be + * careful not to clear PREV if the superblock points at + * next as well. + */ + idn = TAILQ_NEXT(inodedep, id_unlinked); + if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { + if (idn && fs->fs_sujfree != idn->id_ino) + idn->id_state &= ~UNLINKPREV; + break; + } + /* + * Here we have an inodedep which is actually linked into + * the list. We must remove it by forcing a write to the + * link before us, whether it be the superblock or an inode. + * Unfortunately the list may change while we're waiting + * on the buf lock for either resource so we must loop until + * we lock the right one. If both the superblock and an + * inode point to this inode we must clear the inode first + * followed by the superblock. 
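+ *
+ * [Editor's illustration, not part of the original source: the
+ * on-disk unlinked list is a singly linked chain rooted in the
+ * superblock, e.g. with hypothetical inode numbers
+ *
+ *     fs_sujfree -> 7 -> 12 -> 31 -> 0     (links are di_freelink)
+ *
+ * Removing 12 means reading 7's inode block and rewriting its
+ * di_freelink to 31; removing 7 means rewriting the superblock so
+ * that fs_sujfree points at 12 instead.]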
+ */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + pino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + pino = idp->id_ino; + FREE_LOCK(ump); + if (pino == 0) { + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + } else { + error = bread(ump->um_devvp, + fsbtodb(fs, ino_to_fsba(fs, pino)), + (int)fs->fs_bsize, NOCRED, &bp); + if (error) + brelse(bp); + } + ACQUIRE_LOCK(ump); + if (error) + break; + /* If the list has changed restart the loop. */ + idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); + nino = 0; + if (idp && (idp->id_state & UNLINKNEXT)) + nino = idp->id_ino; + if (nino != pino || + (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { + FREE_LOCK(ump); + brelse(bp); + ACQUIRE_LOCK(ump); + continue; + } + nino = 0; + idn = TAILQ_NEXT(inodedep, id_unlinked); + if (idn) + nino = idn->id_ino; + /* + * Remove us from the in memory list. After this we cannot + * access the inodedep. + */ + KASSERT((inodedep->id_state & UNLINKED) != 0, + ("clear_unlinked_inodedep: inodedep %p not unlinked", + inodedep)); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + FREE_LOCK(ump); + /* + * The predecessor's next pointer is manually updated here + * so that the NEXT flag is never cleared for an element + * that is in the list. + */ + if (pino == 0) { + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + } else if (fs->fs_magic == FS_UFS1_MAGIC) + ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + else + ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, pino))->di_freelink = nino; + /* + * If the bwrite fails we have no recourse to recover. The + * filesystem is corrupted already. + */ + bwrite(bp); + ACQUIRE_LOCK(ump); + /* + * If the superblock pointer still needs to be cleared force + * a write here. + */ + if (fs->fs_sujfree == ino) { + FREE_LOCK(ump); + bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, + bp); + bwrite(bp); + ACQUIRE_LOCK(ump); + } + + if (fs->fs_sujfree != ino) + return; + panic("clear_unlinked_inodedep: Failed to clear free head"); + } + if (inodedep->id_ino == fs->fs_sujfree) + panic("clear_unlinked_inodedep: Freeing head of free list"); + inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); + TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); + return; +} + +/* + * This workitem decrements the inode's link count. + * If the link count reaches zero, the file is removed. 
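+ *
+ * [Editor's worked example, not part of the original source: for a
+ * plain unlink(2) of a file with a single name, the dirrem handled
+ * here takes i_nlink from 1 to 0, places the inodedep on the
+ * unlinked list via unlinked_inodedep(), and leaves the actual
+ * inode de-allocation to the freefile machinery described further
+ * below.  For rmdir(2) the count drops by 2 (the entry and ".")
+ * and a follow-up dirrem is queued against the parent for the lost
+ * "..", unless DIRCHG marks this as a directory rename.]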
+ */ +static int +handle_workitem_remove(dirrem, flags) + struct dirrem *dirrem; + int flags; +{ + struct inodedep *inodedep; + struct workhead dotdotwk; + struct worklist *wk; + struct ufsmount *ump; + struct mount *mp; + struct vnode *vp; + struct inode *ip; + ino_t oldinum; + + if (dirrem->dm_state & ONWORKLIST) + panic("handle_workitem_remove: dirrem %p still on worklist", + dirrem); + oldinum = dirrem->dm_oldinum; + mp = dirrem->dm_list.wk_mp; + ump = VFSTOUFS(mp); + flags |= LK_EXCLUSIVE; + if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) + return (EBUSY); + ip = VTOI(vp); + ACQUIRE_LOCK(ump); + if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) + panic("handle_workitem_remove: lost inodedep"); + if (dirrem->dm_state & ONDEPLIST) + LIST_REMOVE(dirrem, dm_inonext); + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_workitem_remove: Journal entries not written.")); + + /* + * Move all dependencies waiting on the remove to complete + * from the dirrem to the inode inowait list to be completed + * after the inode has been updated and written to disk. Any + * marked MKDIR_PARENT are saved to be completed when the .. ref + * is removed. + */ + LIST_INIT(&dotdotwk); + while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { + WORKLIST_REMOVE(wk); + if (wk->wk_state & MKDIR_PARENT) { + wk->wk_state &= ~MKDIR_PARENT; + WORKLIST_INSERT(&dotdotwk, wk); + continue; + } + WORKLIST_INSERT(&inodedep->id_inowait, wk); + } + LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); + /* + * Normal file deletion. + */ + if ((dirrem->dm_state & RMDIR) == 0) { + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (ip->i_nlink < ip->i_effnlink) + panic("handle_workitem_remove: bad file delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: worklist not empty. %s", + TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); + WORKITEM_FREE(dirrem, D_DIRREM); + FREE_LOCK(ump); + goto out; + } + /* + * Directory deletion. Decrement reference count for both the + * just deleted parent directory entry and the reference for ".". + * Arrange to have the reference count on the parent decremented + * to account for the loss of "..". + */ + ip->i_nlink -= 2; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (ip->i_nlink < ip->i_effnlink) + panic("handle_workitem_remove: bad dir delta"); + if (ip->i_nlink == 0) + unlinked_inodedep(mp, inodedep); + inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; + /* + * Rename a directory to a new parent. Since, we are both deleting + * and creating a new directory entry, the link count on the new + * directory should not change. Thus we skip the followup dirrem. + */ + if (dirrem->dm_state & DIRCHG) { + KASSERT(LIST_EMPTY(&dirrem->dm_jwork), + ("handle_workitem_remove: DIRCHG and worklist not empty.")); + WORKITEM_FREE(dirrem, D_DIRREM); + FREE_LOCK(ump); + goto out; + } + dirrem->dm_state = ONDEPLIST; + dirrem->dm_oldinum = dirrem->dm_dirinum; + /* + * Place the dirrem on the parent's diremhd list. + */ + if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) + panic("handle_workitem_remove: lost dir inodedep"); + LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); + /* + * If the allocated inode has never been written to disk, then + * the on-disk inode is zero'ed and we can remove the file + * immediately. 
When journaling if the inode has been marked + * unlinked and not DEPCOMPLETE we know it can never be written. + */ + inodedep_lookup(mp, oldinum, 0, &inodedep); + if (inodedep == NULL || + (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || + check_inode_unwritten(inodedep)) { + FREE_LOCK(ump); + vput(vp); + return handle_workitem_remove(dirrem, flags); + } + WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); + FREE_LOCK(ump); + ip->i_flag |= IN_CHANGE; +out: + ffs_update(vp, 0); + vput(vp); + return (0); +} + +/* + * Inode de-allocation dependencies. + * + * When an inode's link count is reduced to zero, it can be de-allocated. We + * found it convenient to postpone de-allocation until after the inode is + * written to disk with its new link count (zero). At this point, all of the + * on-disk inode's block pointers are nullified and, with careful dependency + * list ordering, all dependencies related to the inode will be satisfied and + * the corresponding dependency structures de-allocated. So, if/when the + * inode is reused, there will be no mixing of old dependencies with new + * ones. This artificial dependency is set up by the block de-allocation + * procedure above (softdep_setup_freeblocks) and completed by the + * following procedure. + */ +static void +handle_workitem_freefile(freefile) + struct freefile *freefile; +{ + struct workhead wkhd; + struct fs *fs; + struct inodedep *idp; + struct ufsmount *ump; + int error; + + ump = VFSTOUFS(freefile->fx_list.wk_mp); + fs = ump->um_fs; +#ifdef DEBUG + ACQUIRE_LOCK(ump); + error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); + FREE_LOCK(ump); + if (error) + panic("handle_workitem_freefile: inodedep %p survived", idp); +#endif + UFS_LOCK(ump); + fs->fs_pendinginodes -= 1; + UFS_UNLOCK(ump); + LIST_INIT(&wkhd); + LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); + if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, + freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) + softdep_error("handle_workitem_freefile", error); + ACQUIRE_LOCK(ump); + WORKITEM_FREE(freefile, D_FREEFILE); + FREE_LOCK(ump); +} + + +/* + * Helper function which unlinks marker element from work list and returns + * the next element on the list. + */ +static __inline struct worklist * +markernext(struct worklist *marker) +{ + struct worklist *next; + + next = LIST_NEXT(marker, wk_list); + LIST_REMOVE(marker, wk_list); + return next; +} + +/* + * Disk writes. + * + * The dependency structures constructed above are most actively used when file + * system blocks are written to disk. No constraints are placed on when a + * block can be written, but unsatisfied update dependencies are made safe by + * modifying (or replacing) the source memory for the duration of the disk + * write. When the disk write completes, the memory block is again brought + * up-to-date. + * + * In-core inode structure reclamation. + * + * Because there are a finite number of "in-core" inode structures, they are + * reused regularly. By transferring all inode-related dependencies to the + * in-memory inode block and indexing them separately (via "inodedep"s), we + * can allow "in-core" inode structures to be reused at any time and avoid + * any increase in contention. + * + * Called just before entering the device driver to initiate a new disk I/O. + * The buffer must be locked, thus, no I/O completion operations can occur + * while we are manipulating its associated dependencies. 
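+ *
+ * [Editor's illustration, not part of the original source: a typical
+ * round trip for a directory page holding a not-yet-safe new entry
+ * looks like
+ *
+ *     softdep_disk_io_initiation() -> initiate_write_filepage()
+ *         rolls the entry's d_ino back to zero (or to the previous
+ *         inode for a DIRCHG rename) and marks the diradd UNDONE;
+ *     the write completes with only safe contents on disk;
+ *     softdep_disk_write_complete() -> handle_written_filepage()
+ *         rolls forward whatever has since become safe and
+ *         re-dirties the buffer if any rollback remains.]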
+ */ +static void +softdep_disk_io_initiation(bp) + struct buf *bp; /* structure describing disk write to occur */ +{ + struct worklist *wk; + struct worklist marker; + struct inodedep *inodedep; + struct freeblks *freeblks; + struct jblkdep *jblkdep; + struct newblk *newblk; + struct ufsmount *ump; + + /* + * We only care about write operations. There should never + * be dependencies for reads. + */ + if (bp->b_iocmd != BIO_WRITE) + panic("softdep_disk_io_initiation: not write"); + + if (bp->b_vflags & BV_BKGRDINPROG) + panic("softdep_disk_io_initiation: Writing buffer with " + "background write in progress: %p", bp); + + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + return; + + marker.wk_type = D_LAST + 1; /* Not a normal workitem */ + PHOLD(curproc); /* Don't swap out kernel stack */ + ACQUIRE_LOCK(ump); + /* + * Do any necessary pre-I/O processing. + */ + for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; + wk = markernext(&marker)) { + LIST_INSERT_AFTER(wk, &marker, wk_list); + switch (wk->wk_type) { + + case D_PAGEDEP: + initiate_write_filepage(WK_PAGEDEP(wk), bp); + continue; + + case D_INODEDEP: + inodedep = WK_INODEDEP(wk); + if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) + initiate_write_inodeblock_ufs1(inodedep, bp); + else + initiate_write_inodeblock_ufs2(inodedep, bp); + continue; + + case D_INDIRDEP: + initiate_write_indirdep(WK_INDIRDEP(wk), bp); + continue; + + case D_BMSAFEMAP: + initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); + continue; + + case D_JSEG: + WK_JSEG(wk)->js_buf = NULL; + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); + /* + * We have to wait for the freeblks to be journaled + * before we can write an inodeblock with updated + * pointers. Be careful to arrange the marker so + * we revisit the freeblks if it's not removed by + * the first jwait(). + */ + if (jblkdep != NULL) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&jblkdep->jb_list, MNT_WAIT); + } + continue; + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + /* + * We have to wait for the jnewblk to be journaled + * before we can write to a block if the contents + * may be confused with an earlier file's indirect + * at recovery time. Handle the marker as described + * above. + */ + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL && + indirblk_lookup(newblk->nb_list.wk_mp, + newblk->nb_newblkno)) { + LIST_REMOVE(&marker, wk_list); + LIST_INSERT_BEFORE(wk, &marker, wk_list); + jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); + } + continue; + + case D_SBDEP: + initiate_write_sbdep(WK_SBDEP(wk)); + continue; + + case D_MKDIR: + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: + continue; + + default: + panic("handle_disk_io_initiation: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + FREE_LOCK(ump); + PRELE(curproc); /* Allow swapout of kernel stack */ +} + +/* + * Called from within the procedure above to deal with unsatisfied + * allocation dependencies in a directory. The buffer must be locked, + * thus, no I/O completion operations can occur while we are + * manipulating its associated dependencies. + */ +static void +initiate_write_filepage(pagedep, bp) + struct pagedep *pagedep; + struct buf *bp; +{ + struct jremref *jremref; + struct jmvref *jmvref; + struct dirrem *dirrem; + struct diradd *dap; + struct direct *ep; + int i; + + if (pagedep->pd_state & IOSTARTED) { + /* + * This can only happen if there is a driver that does not + * understand chaining. 
Here biodone will reissue the call + * to strategy for the incomplete buffers. + */ + printf("initiate_write_filepage: already started\n"); + return; + } + pagedep->pd_state |= IOSTARTED; + /* + * Wait for all journal remove dependencies to hit the disk. + * We can not allow any potentially conflicting directory adds + * to be visible before removes and rollback is too difficult. + * The per-filesystem lock may be dropped and re-acquired, however + * we hold the buf locked so the dependency can not go away. + */ + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) + while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) + jwait(&jremref->jr_list, MNT_WAIT); + while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) + jwait(&jmvref->jm_list, MNT_WAIT); + for (i = 0; i < DAHASHSZ; i++) { + LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { + ep = (struct direct *) + ((char *)bp->b_data + dap->da_offset); + if (ep->d_ino != dap->da_newinum) + panic("%s: dir inum %ju != new %ju", + "initiate_write_filepage", + (uintmax_t)ep->d_ino, + (uintmax_t)dap->da_newinum); + if (dap->da_state & DIRCHG) + ep->d_ino = dap->da_previous->dm_oldinum; + else + ep->d_ino = 0; + dap->da_state &= ~ATTACHED; + dap->da_state |= UNDONE; + } + } +} + +/* + * Version of initiate_write_inodeblock that handles UFS1 dinodes. + * Note that any bug fixes made to this routine must be done in the + * version found below. + * + * Called from within the procedure above to deal with unsatisfied + * allocation dependencies in an inodeblock. The buffer must be + * locked, thus, no I/O completion operations can occur while we + * are manipulating its associated dependencies. + */ +static void +initiate_write_inodeblock_ufs1(inodedep, bp) + struct inodedep *inodedep; + struct buf *bp; /* The inode block */ +{ + struct allocdirect *adp, *lastadp; + struct ufs1_dinode *dp; + struct ufs1_dinode *sip; + struct inoref *inoref; + struct ufsmount *ump; + struct fs *fs; + ufs_lbn_t i; +#ifdef INVARIANTS + ufs_lbn_t prevlbn = 0; +#endif + int deplist; + + if (inodedep->id_state & IOSTARTED) + panic("initiate_write_inodeblock_ufs1: already started"); + inodedep->id_state |= IOSTARTED; + fs = inodedep->id_fs; + ump = VFSTOUFS(inodedep->id_list.wk_mp); + LOCK_OWNED(ump); + dp = (struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, inodedep->id_ino); + + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + dp->di_freelink = inon ? inon->id_ino : 0; + } + /* + * If the bitmap is not yet written, then the allocated + * inode cannot be written to disk. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + if (inodedep->id_savedino1 != NULL) + panic("initiate_write_inodeblock_ufs1: I/O underway"); + FREE_LOCK(ump); + sip = malloc(sizeof(struct ufs1_dinode), + M_SAVEDINO, M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(ump); + inodedep->id_savedino1 = sip; + *inodedep->id_savedino1 = *dp; + bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); + dp->di_gen = inodedep->id_savedino1->di_gen; + dp->di_freelink = inodedep->id_savedino1->di_freelink; + return; + } + /* + * If no dependencies, then there is nothing to roll back. 
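+ *
+ * [Editor's note, not part of the original source: "rolling back"
+ * below means writing the inode as it was before the uncommitted
+ * allocations: busy direct-block allocdirects revert di_db[] to
+ * ad_oldblkno, pending indirect pointers are zeroed, di_size is
+ * trimmed so no unwritten fragment is claimed, and di_nlink reverts
+ * to the count in the first unwritten journal entry.
+ * handle_written_inodeblock() rolls all of this forward again once
+ * the write completes.]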
+ */ + inodedep->id_savedsize = dp->di_size; + inodedep->id_savedextsize = 0; + inodedep->id_savednlink = dp->di_nlink; + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) + return; + /* + * Revert the link count to that of the first unwritten journal entry. + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + /* + * Set the dependencies to busy. + */ + for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { +#ifdef INVARIANTS + if (deplist != 0 && prevlbn >= adp->ad_offset) + panic("softdep_write_inodeblock: lbn order"); + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) + panic("%s: direct pointer #%jd mismatch %d != %jd", + "softdep_write_inodeblock", + (intmax_t)adp->ad_offset, + dp->di_db[adp->ad_offset], + (intmax_t)adp->ad_newblkno); + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) + panic("%s: indirect pointer #%jd mismatch %d != %jd", + "softdep_write_inodeblock", + (intmax_t)adp->ad_offset - NDADDR, + dp->di_ib[adp->ad_offset - NDADDR], + (intmax_t)adp->ad_newblkno); + deplist |= 1 << adp->ad_offset; + if ((adp->ad_state & ATTACHED) == 0) + panic("softdep_write_inodeblock: Unknown state 0x%x", + adp->ad_state); +#endif /* INVARIANTS */ + adp->ad_state &= ~ATTACHED; + adp->ad_state |= UNDONE; + } + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the file + * which would corrupt the filesystem. + */ + for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { + if (adp->ad_offset >= NDADDR) + break; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; + /* keep going until hitting a rollback to a frag */ + if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) + continue; + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { +#ifdef INVARIANTS + if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) + panic("softdep_write_inodeblock: lost dep1"); +#endif /* INVARIANTS */ + dp->di_db[i] = 0; + } + for (i = 0; i < NIADDR; i++) { +#ifdef INVARIANTS + if (dp->di_ib[i] != 0 && + (deplist & ((1 << NDADDR) << i)) == 0) + panic("softdep_write_inodeblock: lost dep2"); +#endif /* INVARIANTS */ + dp->di_ib[i] = 0; + } + return; + } + /* + * If we have zero'ed out the last allocated block of the file, + * roll back the size to the last currently allocated block. + * We know that this last allocated block is a full-sized as + * we already checked for fragments in the loop above. + */ + if (lastadp != NULL && + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) + if (dp->di_db[i] != 0) + break; + dp->di_size = (i + 1) * fs->fs_bsize; + } + /* + * The only dependencies are for indirect blocks. + * + * The file size for indirect block additions is not guaranteed. + * Such a guarantee would be non-trivial to achieve. The conventional + * synchronous write implementation also does not make this guarantee. + * Fsck should catch and fix discrepancies. Arguably, the file size + * can be over-estimated without destroying integrity when the file + * moves into the indirect blocks (i.e., is large). If we want to + * postpone fsck, we are stuck with this argument. 
+ */ + for (; adp; adp = TAILQ_NEXT(adp, ad_next)) + dp->di_ib[adp->ad_offset - NDADDR] = 0; +} + +/* + * Version of initiate_write_inodeblock that handles UFS2 dinodes. + * Note that any bug fixes made to this routine must be done in the + * version found above. + * + * Called from within the procedure above to deal with unsatisfied + * allocation dependencies in an inodeblock. The buffer must be + * locked, thus, no I/O completion operations can occur while we + * are manipulating its associated dependencies. + */ +static void +initiate_write_inodeblock_ufs2(inodedep, bp) + struct inodedep *inodedep; + struct buf *bp; /* The inode block */ +{ + struct allocdirect *adp, *lastadp; + struct ufs2_dinode *dp; + struct ufs2_dinode *sip; + struct inoref *inoref; + struct ufsmount *ump; + struct fs *fs; + ufs_lbn_t i; +#ifdef INVARIANTS + ufs_lbn_t prevlbn = 0; +#endif + int deplist; + + if (inodedep->id_state & IOSTARTED) + panic("initiate_write_inodeblock_ufs2: already started"); + inodedep->id_state |= IOSTARTED; + fs = inodedep->id_fs; + ump = VFSTOUFS(inodedep->id_list.wk_mp); + LOCK_OWNED(ump); + dp = (struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, inodedep->id_ino); + + /* + * If we're on the unlinked list but have not yet written our + * next pointer initialize it here. + */ + if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + dp->di_freelink = inon ? inon->id_ino : 0; + } + /* + * If the bitmap is not yet written, then the allocated + * inode cannot be written to disk. + */ + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + if (inodedep->id_savedino2 != NULL) + panic("initiate_write_inodeblock_ufs2: I/O underway"); + FREE_LOCK(ump); + sip = malloc(sizeof(struct ufs2_dinode), + M_SAVEDINO, M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(ump); + inodedep->id_savedino2 = sip; + *inodedep->id_savedino2 = *dp; + bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); + dp->di_gen = inodedep->id_savedino2->di_gen; + dp->di_freelink = inodedep->id_savedino2->di_freelink; + return; + } + /* + * If no dependencies, then there is nothing to roll back. + */ + inodedep->id_savedsize = dp->di_size; + inodedep->id_savedextsize = dp->di_extsize; + inodedep->id_savednlink = dp->di_nlink; + if (TAILQ_EMPTY(&inodedep->id_inoupdt) && + TAILQ_EMPTY(&inodedep->id_extupdt) && + TAILQ_EMPTY(&inodedep->id_inoreflst)) + return; + /* + * Revert the link count to that of the first unwritten journal entry. + */ + inoref = TAILQ_FIRST(&inodedep->id_inoreflst); + if (inoref) + dp->di_nlink = inoref->if_nlink; + + /* + * Set the ext data dependencies to busy. + */ + for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { +#ifdef INVARIANTS + if (deplist != 0 && prevlbn >= adp->ad_offset) + panic("softdep_write_inodeblock: lbn order"); + prevlbn = adp->ad_offset; + if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) + panic("%s: direct pointer #%jd mismatch %jd != %jd", + "softdep_write_inodeblock", + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_extb[adp->ad_offset], + (intmax_t)adp->ad_newblkno); + deplist |= 1 << adp->ad_offset; + if ((adp->ad_state & ATTACHED) == 0) + panic("softdep_write_inodeblock: Unknown state 0x%x", + adp->ad_state); +#endif /* INVARIANTS */ + adp->ad_state &= ~ATTACHED; + adp->ad_state |= UNDONE; + } + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. 
Otherwise, the on-disk inode + * might have fragments that were not the last block in the ext + * data which would corrupt the filesystem. + */ + for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; + lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { + dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; + /* keep going until hitting a rollback to a frag */ + if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) + continue; + dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NXADDR; i++) { +#ifdef INVARIANTS + if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) + panic("softdep_write_inodeblock: lost dep1"); +#endif /* INVARIANTS */ + dp->di_extb[i] = 0; + } + lastadp = NULL; + break; + } + /* + * If we have zero'ed out the last allocated block of the ext + * data, roll back the size to the last currently allocated block. + * We know that this last allocated block is a full-sized as + * we already checked for fragments in the loop above. + */ + if (lastadp != NULL && + dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) + if (dp->di_extb[i] != 0) + break; + dp->di_extsize = (i + 1) * fs->fs_bsize; + } + /* + * Set the file data dependencies to busy. + */ + for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + adp = TAILQ_NEXT(adp, ad_next)) { +#ifdef INVARIANTS + if (deplist != 0 && prevlbn >= adp->ad_offset) + panic("softdep_write_inodeblock: lbn order"); + if ((adp->ad_state & ATTACHED) == 0) + panic("inodedep %p and adp %p not attached", inodedep, adp); + prevlbn = adp->ad_offset; + if (adp->ad_offset < NDADDR && + dp->di_db[adp->ad_offset] != adp->ad_newblkno) + panic("%s: direct pointer #%jd mismatch %jd != %jd", + "softdep_write_inodeblock", + (intmax_t)adp->ad_offset, + (intmax_t)dp->di_db[adp->ad_offset], + (intmax_t)adp->ad_newblkno); + if (adp->ad_offset >= NDADDR && + dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) + panic("%s indirect pointer #%jd mismatch %jd != %jd", + "softdep_write_inodeblock:", + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], + (intmax_t)adp->ad_newblkno); + deplist |= 1 << adp->ad_offset; + if ((adp->ad_state & ATTACHED) == 0) + panic("softdep_write_inodeblock: Unknown state 0x%x", + adp->ad_state); +#endif /* INVARIANTS */ + adp->ad_state &= ~ATTACHED; + adp->ad_state |= UNDONE; + } + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the file + * which would corrupt the filesystem. 
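+ *
+ * [Editor's worked example, not part of the original source: with a
+ * hypothetical 32K block size, if direct block 2 rolls back to an
+ * old 8K fragment, di_db[2] reverts to that fragment's address and
+ * di_size is clipped to 2 * 32K + 8K = 72K; the remaining di_db[]
+ * slots beyond it and all di_ib[] slots are then zeroed so the
+ * on-disk inode never claims storage past that fragment.]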
+ */ + for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; + lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { + if (adp->ad_offset >= NDADDR) + break; + dp->di_db[adp->ad_offset] = adp->ad_oldblkno; + /* keep going until hitting a rollback to a frag */ + if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) + continue; + dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; + for (i = adp->ad_offset + 1; i < NDADDR; i++) { +#ifdef INVARIANTS + if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) + panic("softdep_write_inodeblock: lost dep2"); +#endif /* INVARIANTS */ + dp->di_db[i] = 0; + } + for (i = 0; i < NIADDR; i++) { +#ifdef INVARIANTS + if (dp->di_ib[i] != 0 && + (deplist & ((1 << NDADDR) << i)) == 0) + panic("softdep_write_inodeblock: lost dep3"); +#endif /* INVARIANTS */ + dp->di_ib[i] = 0; + } + return; + } + /* + * If we have zero'ed out the last allocated block of the file, + * roll back the size to the last currently allocated block. + * We know that this last allocated block is a full-sized as + * we already checked for fragments in the loop above. + */ + if (lastadp != NULL && + dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { + for (i = lastadp->ad_offset; i >= 0; i--) + if (dp->di_db[i] != 0) + break; + dp->di_size = (i + 1) * fs->fs_bsize; + } + /* + * The only dependencies are for indirect blocks. + * + * The file size for indirect block additions is not guaranteed. + * Such a guarantee would be non-trivial to achieve. The conventional + * synchronous write implementation also does not make this guarantee. + * Fsck should catch and fix discrepancies. Arguably, the file size + * can be over-estimated without destroying integrity when the file + * moves into the indirect blocks (i.e., is large). If we want to + * postpone fsck, we are stuck with this argument. + */ + for (; adp; adp = TAILQ_NEXT(adp, ad_next)) + dp->di_ib[adp->ad_offset - NDADDR] = 0; +} + +/* + * Cancel an indirdep as a result of truncation. Release all of the + * children allocindirs and place their journal work on the appropriate + * list. + */ +static void +cancel_indirdep(indirdep, bp, freeblks) + struct indirdep *indirdep; + struct buf *bp; + struct freeblks *freeblks; +{ + struct allocindir *aip; + + /* + * None of the indirect pointers will ever be visible, + * so they can simply be tossed. GOINGAWAY ensures + * that allocated pointers will be saved in the buffer + * cache until they are freed. Note that they will + * only be able to be found by their physical address + * since the inode mapping the logical address will + * be gone. The save buffer used for the safe copy + * was allocated in setup_allocindir_phase2 using + * the physical address so it could be used for this + * purpose. Hence we swap the safe copy with the real + * copy, allowing the safe copy to be freed and holding + * on to the real copy for later use in indir_trunc. + */ + if (indirdep->ir_state & GOINGAWAY) + panic("cancel_indirdep: already gone"); + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + indirdep->ir_state |= DEPCOMPLETE; + LIST_REMOVE(indirdep, ir_next); + } + indirdep->ir_state |= GOINGAWAY; + /* + * Pass in bp for blocks still have journal writes + * pending so we can cancel them on their own. 
+ */ + while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) + cancel_allocindir(aip, bp, freeblks, 0); + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) + cancel_allocindir(aip, NULL, freeblks, 0); + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) + cancel_allocindir(aip, NULL, freeblks, 0); + while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) + cancel_allocindir(aip, NULL, freeblks, 0); + /* + * If there are pending partial truncations we need to keep the + * old block copy around until they complete. This is because + * the current b_data is not a perfect superset of the available + * blocks. + */ + if (TAILQ_EMPTY(&indirdep->ir_trunc)) + bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); + else + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + WORKLIST_REMOVE(&indirdep->ir_list); + WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); + indirdep->ir_bp = NULL; + indirdep->ir_freeblks = freeblks; +} + +/* + * Free an indirdep once it no longer has new pointers to track. + */ +static void +free_indirdep(indirdep) + struct indirdep *indirdep; +{ + + KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), + ("free_indirdep: Indir trunc list not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_completehd), + ("free_indirdep: Complete head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_writehd), + ("free_indirdep: write head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_donehd), + ("free_indirdep: done head not empty.")); + KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), + ("free_indirdep: deplist head not empty.")); + KASSERT((indirdep->ir_state & DEPCOMPLETE), + ("free_indirdep: %p still on newblk list.", indirdep)); + KASSERT(indirdep->ir_saveddata == NULL, + ("free_indirdep: %p still has saved data.", indirdep)); + if (indirdep->ir_state & ONWORKLIST) + WORKLIST_REMOVE(&indirdep->ir_list); + WORKITEM_FREE(indirdep, D_INDIRDEP); +} + +/* + * Called before a write to an indirdep. This routine is responsible for + * rolling back pointers to a safe state which includes only those + * allocindirs which have been completed. + */ +static void +initiate_write_indirdep(indirdep, bp) + struct indirdep *indirdep; + struct buf *bp; +{ + struct ufsmount *ump; + + indirdep->ir_state |= IOSTARTED; + if (indirdep->ir_state & GOINGAWAY) + panic("disk_io_initiation: indirdep gone"); + /* + * If there are no remaining dependencies, this will be writing + * the real pointers. + */ + if (LIST_EMPTY(&indirdep->ir_deplisthd) && + TAILQ_EMPTY(&indirdep->ir_trunc)) + return; + /* + * Replace up-to-date version with safe version. + */ + if (indirdep->ir_saveddata == NULL) { + ump = VFSTOUFS(indirdep->ir_list.wk_mp); + LOCK_OWNED(ump); + FREE_LOCK(ump); + indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, + M_SOFTDEP_FLAGS); + ACQUIRE_LOCK(ump); + } + indirdep->ir_state &= ~ATTACHED; + indirdep->ir_state |= UNDONE; + bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); + bcopy(indirdep->ir_savebp->b_data, bp->b_data, + bp->b_bcount); +} + +/* + * Called when an inode has been cleared in a cg bitmap. 
This finally + * eliminates any canceled jaddrefs + */ +void +softdep_setup_inofree(mp, bp, ino, wkhd) + struct mount *mp; + struct buf *bp; + ino_t ino; + struct workhead *wkhd; +{ + struct worklist *wk, *wkn; + struct inodedep *inodedep; + struct ufsmount *ump; + uint8_t *inosused; + struct cg *cgp; + struct fs *fs; + + KASSERT(MOUNTEDSOFTDEP(mp) != 0, + ("softdep_setup_inofree called on non-softdep filesystem")); + ump = VFSTOUFS(mp); + ACQUIRE_LOCK(ump); + fs = ump->um_fs; + cgp = (struct cg *)bp->b_data; + inosused = cg_inosused(cgp); + if (isset(inosused, ino % fs->fs_ipg)) + panic("softdep_setup_inofree: inode %ju not freed.", + (uintmax_t)ino); + if (inodedep_lookup(mp, ino, 0, &inodedep)) + panic("softdep_setup_inofree: ino %ju has existing inodedep %p", + (uintmax_t)ino, inodedep); + if (wkhd) { + LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { + if (wk->wk_type != D_JADDREF) + continue; + WORKLIST_REMOVE(wk); + /* + * We can free immediately even if the jaddref + * isn't attached in a background write as now + * the bitmaps are reconciled. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jaddref(WK_JADDREF(wk)); + } + jwork_move(&bp->b_dep, wkhd); + } + FREE_LOCK(ump); +} + + +/* + * Called via ffs_blkfree() after a set of frags has been cleared from a cg + * map. Any dependencies waiting for the write to clear are added to the + * buf's list and any jnewblks that are being canceled are discarded + * immediately. + */ +void +softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) + struct mount *mp; + struct buf *bp; + ufs2_daddr_t blkno; + int frags; + struct workhead *wkhd; +{ + struct bmsafemap *bmsafemap; + struct jnewblk *jnewblk; + struct ufsmount *ump; + struct worklist *wk; + struct fs *fs; +#ifdef SUJ_DEBUG + uint8_t *blksfree; + struct cg *cgp; + ufs2_daddr_t jstart; + ufs2_daddr_t jend; + ufs2_daddr_t end; + long bno; + int i; +#endif + + CTR3(KTR_SUJ, + "softdep_setup_blkfree: blkno %jd frags %d wk head %p", + blkno, frags, wkhd); + + ump = VFSTOUFS(mp); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_setup_blkfree called on non-softdep filesystem")); + ACQUIRE_LOCK(ump); + /* Lookup the bmsafemap so we track when it is dirty. */ + fs = ump->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); + /* + * Detach any jnewblks which have been canceled. They must linger + * until the bitmap is cleared again by ffs_blkfree() to prevent + * an unjournaled allocation from hitting the disk. + */ + if (wkhd) { + while ((wk = LIST_FIRST(wkhd)) != NULL) { + CTR2(KTR_SUJ, + "softdep_setup_blkfree: blkno %jd wk type %d", + blkno, wk->wk_type); + WORKLIST_REMOVE(wk); + if (wk->wk_type != D_JNEWBLK) { + WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); + continue; + } + jnewblk = WK_JNEWBLK(wk); + KASSERT(jnewblk->jn_state & GOINGAWAY, + ("softdep_setup_blkfree: jnewblk not canceled.")); +#ifdef SUJ_DEBUG + /* + * Assert that this block is free in the bitmap + * before we discard the jnewblk. + */ + cgp = (struct cg *)bp->b_data; + blksfree = cg_blksfree(cgp); + bno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; + i < jnewblk->jn_frags; i++) { + if (isset(blksfree, bno + i)) + continue; + panic("softdep_setup_blkfree: not free"); + } +#endif + /* + * Even if it's not attached we can free immediately + * as the new bitmap is correct. + */ + wk->wk_state |= COMPLETE | ATTACHED; + free_jnewblk(jnewblk); + } + } + +#ifdef SUJ_DEBUG + /* + * Assert that we are not freeing a block which has an outstanding + * allocation dependency. 
+ */ + fs = VFSTOUFS(mp)->um_fs; + bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); + end = blkno + frags; + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + /* + * Don't match against blocks that will be freed when the + * background write is done. + */ + if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == + (COMPLETE | DEPCOMPLETE)) + continue; + jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; + jend = jnewblk->jn_blkno + jnewblk->jn_frags; + if ((blkno >= jstart && blkno < jend) || + (end > jstart && end <= jend)) { + printf("state 0x%X %jd - %d %d dep %p\n", + jnewblk->jn_state, jnewblk->jn_blkno, + jnewblk->jn_oldfrags, jnewblk->jn_frags, + jnewblk->jn_dep); + panic("softdep_setup_blkfree: " + "%jd-%jd(%d) overlaps with %jd-%jd", + blkno, end, frags, jstart, jend); + } + } +#endif + FREE_LOCK(ump); +} + +/* + * Revert a block allocation when the journal record that describes it + * is not yet written. + */ +static int +jnewblk_rollback(jnewblk, fs, cgp, blksfree) + struct jnewblk *jnewblk; + struct fs *fs; + struct cg *cgp; + uint8_t *blksfree; +{ + ufs1_daddr_t fragno; + long cgbno, bbase; + int frags, blk; + int i; + + frags = 0; + cgbno = dtogd(fs, jnewblk->jn_blkno); + /* + * We have to test which frags need to be rolled back. We may + * be operating on a stale copy when doing background writes. + */ + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) + if (isclr(blksfree, cgbno + i)) + frags++; + if (frags == 0) + return (0); + /* + * This is mostly ffs_blkfree() sans some validation and + * superblock updates. + */ + if (frags == fs->fs_frag) { + fragno = fragstoblks(fs, cgbno); + ffs_setblock(fs, blksfree, fragno); + ffs_clusteracct(fs, cgp, fragno, 1); + cgp->cg_cs.cs_nbfree++; + } else { + cgbno += jnewblk->jn_oldfrags; + bbase = cgbno - fragnum(fs, cgbno); + /* Decrement the old frags. */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); + /* Deallocate the fragment */ + for (i = 0; i < frags; i++) + setbit(blksfree, cgbno + i); + cgp->cg_cs.cs_nffree += frags; + /* Add back in counts associated with the new frags */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + /* If a complete block has been reassembled, account for it. */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + cgp->cg_cs.cs_nffree -= fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, 1); + cgp->cg_cs.cs_nbfree++; + } + } + stat_jnewblk++; + jnewblk->jn_state &= ~ATTACHED; + jnewblk->jn_state |= UNDONE; + + return (frags); +} + +static void +initiate_write_bmsafemap(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; /* The cg block. */ +{ + struct jaddref *jaddref; + struct jnewblk *jnewblk; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ino_t ino; + + /* + * If this is a background write, we did this at the time that + * the copy was made, so do not need to do it again. + */ + if (bmsafemap->sm_state & IOSTARTED) + return; + bmsafemap->sm_state |= IOSTARTED; + /* + * Clear any inode allocations which are pending journal writes. 
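+ *
+ * [Editor's note, not part of the original source: e.g. an inode
+ * allocated for a new file whose jaddref has not yet reached the
+ * journal is rolled back in the copy of the cg about to be written:
+ * its inosused bit is cleared, cs_nifree (and cs_ndir for
+ * directories) is adjusted, and the jaddref is marked UNDONE so the
+ * completion side can roll it forward after the write.]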
+ */ + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir--; + cgp->cg_cs.cs_nifree++; + clrbit(inosused, ino); + jaddref->ja_state &= ~ATTACHED; + jaddref->ja_state |= UNDONE; + stat_jaddref++; + } else + panic("initiate_write_bmsafemap: inode %ju " + "marked free", (uintmax_t)jaddref->ja_ino); + } + } + /* + * Clear any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { + if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) + continue; + panic("initiate_write_bmsafemap: block %jd " + "marked free", jnewblk->jn_blkno); + } + } + /* + * Move allocation lists to the written lists so they can be + * cleared once the block write is complete. + */ + LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, + inodedep, id_deps); + LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); + LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, + wk_list); +} + +/* + * This routine is called during the completion interrupt + * service routine for a disk write (from the procedure called + * by the device driver to inform the filesystem caches of + * a request completion). It should be called early in this + * procedure, before the block is made available to other + * processes or other routines are called. + * + */ +static void +softdep_disk_write_complete(bp) + struct buf *bp; /* describes the completed disk write */ +{ + struct worklist *wk; + struct worklist *owk; + struct ufsmount *ump; + struct workhead reattach; + struct freeblks *freeblks; + struct buf *sbp; + + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + return; + + /* + * If an error occurred while doing the write, then the data + * has not hit the disk and the dependencies cannot be processed. + * But we do have to go through and roll forward any dependencies + * that were rolled back before the disk write. + */ + ACQUIRE_LOCK(ump); + if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + switch (wk->wk_type) { + + case D_PAGEDEP: + handle_written_filepage(WK_PAGEDEP(wk), bp, 0); + continue; + + case D_INODEDEP: + handle_written_inodeblock(WK_INODEDEP(wk), + bp, 0); + continue; + + case D_BMSAFEMAP: + handle_written_bmsafemap(WK_BMSAFEMAP(wk), + bp, 0); + continue; + + case D_INDIRDEP: + handle_written_indirdep(WK_INDIRDEP(wk), + bp, &sbp, 0); + continue; + default: + /* nothing to roll forward */ + continue; + } + } + FREE_LOCK(ump); + return; + } + LIST_INIT(&reattach); + + /* + * Ump SU lock must not be released anywhere in this code segment. 
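+ *
+ * [Editor's note, not part of the original source: each handler
+ * below returns nonzero when it had to leave rolled-back state
+ * behind (for instance handle_written_inodeblock() restoring a
+ * saved dinode); such work items collect on the local "reattach"
+ * list and are hung back on bp->b_dep at the bottom so the
+ * re-dirtied buffer carries them through the next write.]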
+ */ + sbp = NULL; + owk = NULL; + while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { + WORKLIST_REMOVE(wk); + atomic_add_long(&dep_write[wk->wk_type], 1); + if (wk == owk) + panic("duplicate worklist: %p\n", wk); + owk = wk; + switch (wk->wk_type) { + + case D_PAGEDEP: + if (handle_written_filepage(WK_PAGEDEP(wk), bp, + WRITESUCCEEDED)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case D_INODEDEP: + if (handle_written_inodeblock(WK_INODEDEP(wk), bp, + WRITESUCCEEDED)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case D_BMSAFEMAP: + if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, + WRITESUCCEEDED)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); + continue; + + case D_ALLOCDIRECT: + wk->wk_state |= COMPLETE; + handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); + continue; + + case D_ALLOCINDIR: + wk->wk_state |= COMPLETE; + handle_allocindir_partdone(WK_ALLOCINDIR(wk)); + continue; + + case D_INDIRDEP: + if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, + WRITESUCCEEDED)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case D_FREEBLKS: + wk->wk_state |= COMPLETE; + freeblks = WK_FREEBLKS(wk); + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && + LIST_EMPTY(&freeblks->fb_jblkdephd)) + add_to_worklist(wk, WK_NODELAY); + continue; + + case D_FREEWORK: + handle_written_freework(WK_FREEWORK(wk)); + break; + + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + + case D_JSEG: + handle_written_jseg(WK_JSEG(wk), bp); + continue; + + case D_SBDEP: + if (handle_written_sbdep(WK_SBDEP(wk), bp)) + WORKLIST_INSERT(&reattach, wk); + continue; + + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + + default: + panic("handle_disk_write_complete: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + /* + * Reattach any requests that must be redone. + */ + while ((wk = LIST_FIRST(&reattach)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&bp->b_dep, wk); + } + FREE_LOCK(ump); + if (sbp) + brelse(sbp); +} + +/* + * Called from within softdep_disk_write_complete above. Note that + * this routine is always called from interrupt level with further + * splbio interrupts blocked. + */ +static void +handle_allocdirect_partdone(adp, wkhd) + struct allocdirect *adp; /* the completed allocdirect */ + struct workhead *wkhd; /* Work to do when inode is writtne. */ +{ + struct allocdirectlst *listhead; + struct allocdirect *listadp; + struct inodedep *inodedep; + long bsize; + + if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + /* + * The on-disk inode cannot claim to be any larger than the last + * fragment that has been written. Otherwise, the on-disk inode + * might have fragments that were not the last block in the file + * which would corrupt the filesystem. Thus, we cannot free any + * allocdirects after one whose ad_oldblkno claims a fragment as + * these blocks must be rolled back to zero before writing the inode. + * We check the currently active set of allocdirects in id_inoupdt + * or id_extupdt as appropriate. 
+ */ + inodedep = adp->ad_inodedep; + bsize = inodedep->id_fs->fs_bsize; + if (adp->ad_state & EXTDATA) + listhead = &inodedep->id_extupdt; + else + listhead = &inodedep->id_inoupdt; + TAILQ_FOREACH(listadp, listhead, ad_next) { + /* found our block */ + if (listadp == adp) + break; + /* continue if ad_oldlbn is not a fragment */ + if (listadp->ad_oldsize == 0 || + listadp->ad_oldsize == bsize) + continue; + /* hit a fragment */ + return; + } + /* + * If we have reached the end of the current list without + * finding the just finished dependency, then it must be + * on the future dependency list. Future dependencies cannot + * be freed until they are moved to the current list. + */ + if (listadp == NULL) { +#ifdef DEBUG + if (adp->ad_state & EXTDATA) + listhead = &inodedep->id_newextupdt; + else + listhead = &inodedep->id_newinoupdt; + TAILQ_FOREACH(listadp, listhead, ad_next) + /* found our block */ + if (listadp == adp) + break; + if (listadp == NULL) + panic("handle_allocdirect_partdone: lost dep"); +#endif /* DEBUG */ + return; + } + /* + * If we have found the just finished dependency, then queue + * it along with anything that follows it that is complete. + * Since the pointer has not yet been written in the inode + * as the dependency prevents it, place the allocdirect on the + * bufwait list where it will be freed once the pointer is + * valid. + */ + if (wkhd == NULL) + wkhd = &inodedep->id_bufwait; + for (; adp; adp = listadp) { + listadp = TAILQ_NEXT(adp, ad_next); + if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + TAILQ_REMOVE(listhead, adp, ad_next); + WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); + } +} + +/* + * Called from within softdep_disk_write_complete above. This routine + * completes successfully written allocindirs. + */ +static void +handle_allocindir_partdone(aip) + struct allocindir *aip; /* the completed allocindir */ +{ + struct indirdep *indirdep; + + if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + indirdep = aip->ai_indirdep; + LIST_REMOVE(aip, ai_next); + /* + * Don't set a pointer while the buffer is undergoing IO or while + * we have active truncations. + */ + if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { + LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); + return; + } + if (indirdep->ir_state & UFS1FMT) + ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = + aip->ai_newblkno; + else + ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = + aip->ai_newblkno; + /* + * Await the pointer write before freeing the allocindir. + */ + LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); +} + +/* + * Release segments held on a jwork list. + */ +static void +handle_jwork(wkhd) + struct workhead *wkhd; +{ + struct worklist *wk; + + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_JSEGDEP: + free_jsegdep(WK_JSEGDEP(wk)); + continue; + case D_FREEDEP: + free_freedep(WK_FREEDEP(wk)); + continue; + case D_FREEFRAG: + rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); + WORKITEM_FREE(wk, D_FREEFRAG); + continue; + case D_FREEWORK: + handle_written_freework(WK_FREEWORK(wk)); + continue; + default: + panic("handle_jwork: Unknown type %s\n", + TYPENAME(wk->wk_type)); + } + } +} + +/* + * Handle the bufwait list on an inode when it is safe to release items + * held there. This normally happens after an inode block is written but + * may be delayed and handled later if there are pending journal items that + * are not yet safe to be released. 
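+ *
+ * [Editor's note, not part of the original source: typical residents
+ * of the bufwait list are diradds waiting for this inode to reach
+ * the disk before their directory entry may be written, mkdir
+ * parent references, dirrems and freefrags that may now be queued,
+ * and at most one freefile, which is handed back to the caller so
+ * that it is queued only after everything else.]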
+ */ +static struct freefile * +handle_bufwait(inodedep, refhd) + struct inodedep *inodedep; + struct workhead *refhd; +{ + struct jaddref *jaddref; + struct freefile *freefile; + struct worklist *wk; + + freefile = NULL; + while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { + WORKLIST_REMOVE(wk); + switch (wk->wk_type) { + case D_FREEFILE: + /* + * We defer adding freefile to the worklist + * until all other additions have been made to + * ensure that it will be done after all the + * old blocks have been freed. + */ + if (freefile != NULL) + panic("handle_bufwait: freefile"); + freefile = WK_FREEFILE(wk); + continue; + + case D_MKDIR: + handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); + continue; + + case D_DIRADD: + diradd_inode_written(WK_DIRADD(wk), inodedep); + continue; + + case D_FREEFRAG: + wk->wk_state |= COMPLETE; + if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) + add_to_worklist(wk, 0); + continue; + + case D_DIRREM: + wk->wk_state |= COMPLETE; + add_to_worklist(wk, 0); + continue; + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + free_newblk(WK_NEWBLK(wk)); + continue; + + case D_JNEWBLK: + wk->wk_state |= COMPLETE; + free_jnewblk(WK_JNEWBLK(wk)); + continue; + + /* + * Save freed journal segments and add references on + * the supplied list which will delay their release + * until the cg bitmap is cleared on disk. + */ + case D_JSEGDEP: + if (refhd == NULL) + free_jsegdep(WK_JSEGDEP(wk)); + else + WORKLIST_INSERT(refhd, wk); + continue; + + case D_JADDREF: + jaddref = WK_JADDREF(wk); + TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, + if_deps); + /* + * Transfer any jaddrefs to the list to be freed with + * the bitmap if we're handling a removed file. + */ + if (refhd == NULL) { + wk->wk_state |= COMPLETE; + free_jaddref(jaddref); + } else + WORKLIST_INSERT(refhd, wk); + continue; + + default: + panic("handle_bufwait: Unknown type %p(%s)", + wk, TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } + return (freefile); +} +/* + * Called from within softdep_disk_write_complete above to restore + * in-memory inode block contents to their most up-to-date state. Note + * that this routine is always called from interrupt level with further + * interrupts from this device blocked. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. + */ +static int +handle_written_inodeblock(inodedep, bp, flags) + struct inodedep *inodedep; + struct buf *bp; /* buffer containing the inode block */ + int flags; +{ + struct freefile *freefile; + struct allocdirect *adp, *nextadp; + struct ufs1_dinode *dp1 = NULL; + struct ufs2_dinode *dp2 = NULL; + struct workhead wkhd; + int hadchanges, fstype; + ino_t freelink; + + LIST_INIT(&wkhd); + hadchanges = 0; + freefile = NULL; + if ((inodedep->id_state & IOSTARTED) == 0) + panic("handle_written_inodeblock: not started"); + inodedep->id_state &= ~IOSTARTED; + if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { + fstype = UFS1; + dp1 = (struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp1->di_freelink; + } else { + fstype = UFS2; + dp2 = (struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); + freelink = dp2->di_freelink; + } + /* + * Leave this inodeblock dirty until it's in the list. 
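+ *
+ * [Editor's note, not part of the original source: the check below
+ * verifies that the di_freelink value which just reached the disk
+ * matches this inodedep's in-memory successor; only then are
+ * UNLINKNEXT and the successor's UNLINKPREV set, which is what
+ * later lets first_unlinked_inodedep() treat this inode as part of
+ * the stable on-disk unlinked chain rooted at fs_sujfree.]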
+ */ + if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && + (flags & WRITESUCCEEDED)) { + struct inodedep *inon; + + inon = TAILQ_NEXT(inodedep, id_unlinked); + if ((inon == NULL && freelink == 0) || + (inon && inon->id_ino == freelink)) { + if (inon) + inon->id_state |= UNLINKPREV; + inodedep->id_state |= UNLINKNEXT; + } + hadchanges = 1; + } + /* + * If we had to rollback the inode allocation because of + * bitmaps being incomplete, then simply restore it. + * Keep the block dirty so that it will not be reclaimed until + * all associated dependencies have been cleared and the + * corresponding updates written to disk. + */ + if (inodedep->id_savedino1 != NULL) { + hadchanges = 1; + if (fstype == UFS1) + *dp1 = *inodedep->id_savedino1; + else + *dp2 = *inodedep->id_savedino2; + free(inodedep->id_savedino1, M_SAVEDINO); + inodedep->id_savedino1 = NULL; + if ((bp->b_flags & B_DELWRI) == 0) + stat_inode_bitmap++; + bdirty(bp); + /* + * If the inode is clear here and GOINGAWAY it will never + * be written. Process the bufwait and clear any pending + * work which may include the freefile. + */ + if (inodedep->id_state & GOINGAWAY) + goto bufwait; + return (1); + } + if (flags & WRITESUCCEEDED) + inodedep->id_state |= COMPLETE; + /* + * Roll forward anything that had to be rolled back before + * the inode could be updated. + */ + for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { + nextadp = TAILQ_NEXT(adp, ad_next); + if (adp->ad_state & ATTACHED) + panic("handle_written_inodeblock: new entry"); + if (fstype == UFS1) { + if (adp->ad_offset < NDADDR) { + if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) + panic("%s %s #%jd mismatch %d != %jd", + "handle_written_inodeblock:", + "direct pointer", + (intmax_t)adp->ad_offset, + dp1->di_db[adp->ad_offset], + (intmax_t)adp->ad_oldblkno); + dp1->di_db[adp->ad_offset] = adp->ad_newblkno; + } else { + if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) + panic("%s: %s #%jd allocated as %d", + "handle_written_inodeblock", + "indirect pointer", + (intmax_t)adp->ad_offset - NDADDR, + dp1->di_ib[adp->ad_offset - NDADDR]); + dp1->di_ib[adp->ad_offset - NDADDR] = + adp->ad_newblkno; + } + } else { + if (adp->ad_offset < NDADDR) { + if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) + panic("%s: %s #%jd %s %jd != %jd", + "handle_written_inodeblock", + "direct pointer", + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_db[adp->ad_offset], + (intmax_t)adp->ad_oldblkno); + dp2->di_db[adp->ad_offset] = adp->ad_newblkno; + } else { + if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) + panic("%s: %s #%jd allocated as %jd", + "handle_written_inodeblock", + "indirect pointer", + (intmax_t)adp->ad_offset - NDADDR, + (intmax_t) + dp2->di_ib[adp->ad_offset - NDADDR]); + dp2->di_ib[adp->ad_offset - NDADDR] = + adp->ad_newblkno; + } + } + adp->ad_state &= ~UNDONE; + adp->ad_state |= ATTACHED; + hadchanges = 1; + } + for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { + nextadp = TAILQ_NEXT(adp, ad_next); + if (adp->ad_state & ATTACHED) + panic("handle_written_inodeblock: new entry"); + if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) + panic("%s: direct pointers #%jd %s %jd != %jd", + "handle_written_inodeblock", + (intmax_t)adp->ad_offset, "mismatch", + (intmax_t)dp2->di_extb[adp->ad_offset], + (intmax_t)adp->ad_oldblkno); + dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; + adp->ad_state &= ~UNDONE; + adp->ad_state |= ATTACHED; + hadchanges = 1; + } + if (hadchanges && (bp->b_flags & B_DELWRI) == 0) + 
stat_direct_blk_ptrs++;
+ /*
+ * Reset the file size to its most up-to-date value.
+ */
+ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
+ panic("handle_written_inodeblock: bad size");
+ if (inodedep->id_savednlink > LINK_MAX)
+ panic("handle_written_inodeblock: Invalid link count "
+ "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
+ inodedep);
+ if (fstype == UFS1) {
+ if (dp1->di_nlink != inodedep->id_savednlink) {
+ dp1->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
+ if (dp1->di_size != inodedep->id_savedsize) {
+ dp1->di_size = inodedep->id_savedsize;
+ hadchanges = 1;
+ }
+ } else {
+ if (dp2->di_nlink != inodedep->id_savednlink) {
+ dp2->di_nlink = inodedep->id_savednlink;
+ hadchanges = 1;
+ }
+ if (dp2->di_size != inodedep->id_savedsize) {
+ dp2->di_size = inodedep->id_savedsize;
+ hadchanges = 1;
+ }
+ if (dp2->di_extsize != inodedep->id_savedextsize) {
+ dp2->di_extsize = inodedep->id_savedextsize;
+ hadchanges = 1;
+ }
+ }
+ inodedep->id_savedsize = -1;
+ inodedep->id_savedextsize = -1;
+ inodedep->id_savednlink = -1;
+ /*
+ * If there were any rollbacks in the inode block, then it must be
+ * marked dirty so that it will eventually get written back in
+ * its correct form.
+ */
+ if (hadchanges)
+ bdirty(bp);
+bufwait:
+ /*
+ * If the write did not succeed, we have done all the roll-forward
+ * operations, but we cannot take the actions that will allow its
+ * dependencies to be processed.
+ */
+ if ((flags & WRITESUCCEEDED) == 0)
+ return (hadchanges);
+ /*
+ * Process any allocdirects that completed during the update.
+ */
+ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
+ handle_allocdirect_partdone(adp, &wkhd);
+ if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
+ handle_allocdirect_partdone(adp, &wkhd);
+ /*
+ * Process deallocations that were held pending until the
+ * inode had been written to disk. Freeing of the inode
+ * is delayed until after all blocks have been freed to
+ * avoid creation of new <vfsid, inum, lbn> triples
+ * before the old ones have been deleted. Completely
+ * unlinked inodes are not processed until the unlinked
+ * inode list is written or the last reference is removed.
+ */
+ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
+ freefile = handle_bufwait(inodedep, NULL);
+ if (freefile && !LIST_EMPTY(&wkhd)) {
+ WORKLIST_INSERT(&wkhd, &freefile->fx_list);
+ freefile = NULL;
+ }
+ }
+ /*
+ * Move rolled forward dependency completions to the bufwait list
+ * now that those that were already written have been processed.
+ */
+ if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
+ panic("handle_written_inodeblock: bufwait but no changes");
+ jwork_move(&inodedep->id_bufwait, &wkhd);
+
+ if (freefile != NULL) {
+ /*
+ * If the inode is goingaway it was never written. Fake up
+ * the state here so free_inodedep() can succeed.
+ */
+ if (inodedep->id_state & GOINGAWAY)
+ inodedep->id_state |= COMPLETE | DEPCOMPLETE;
+ if (free_inodedep(inodedep) == 0)
+ panic("handle_written_inodeblock: live inodedep %p",
+ inodedep);
+ add_to_worklist(&freefile->fx_list, 0);
+ return (0);
+ }
+
+ /*
+ * If no outstanding dependencies, free it.
+ */
+ if (free_inodedep(inodedep) ||
+ (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
+ TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
+ TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
+ LIST_FIRST(&inodedep->id_bufwait) == 0))
+ return (0);
+ return (hadchanges);
+}
+
+/*
+ * Perform needed roll-forwards and kick off any dependencies that
+ * can now be processed.
+ * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. + */ +static int +handle_written_indirdep(indirdep, bp, bpp, flags) + struct indirdep *indirdep; + struct buf *bp; + struct buf **bpp; + int flags; +{ + struct allocindir *aip; + struct buf *sbp; + int chgs; + + if (indirdep->ir_state & GOINGAWAY) + panic("handle_written_indirdep: indirdep gone"); + if ((indirdep->ir_state & IOSTARTED) == 0) + panic("handle_written_indirdep: IO not started"); + chgs = 0; + /* + * If there were rollbacks revert them here. + */ + if (indirdep->ir_saveddata) { + bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); + if (TAILQ_EMPTY(&indirdep->ir_trunc)) { + free(indirdep->ir_saveddata, M_INDIRDEP); + indirdep->ir_saveddata = NULL; + } + chgs = 1; + } + indirdep->ir_state &= ~(UNDONE | IOSTARTED); + indirdep->ir_state |= ATTACHED; + /* + * If the write did not succeed, we have done all the roll-forward + * operations, but we cannot take the actions that will allow its + * dependencies to be processed. + */ + if ((flags & WRITESUCCEEDED) == 0) { + stat_indir_blk_ptrs++; + bdirty(bp); + return (1); + } + /* + * Move allocindirs with written pointers to the completehd if + * the indirdep's pointer is not yet written. Otherwise + * free them here. + */ + while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) { + LIST_REMOVE(aip, ai_next); + if ((indirdep->ir_state & DEPCOMPLETE) == 0) { + LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, + ai_next); + newblk_freefrag(&aip->ai_block); + continue; + } + free_newblk(&aip->ai_block); + } + /* + * Move allocindirs that have finished dependency processing from + * the done list to the write list after updating the pointers. + */ + if (TAILQ_EMPTY(&indirdep->ir_trunc)) { + while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) { + handle_allocindir_partdone(aip); + if (aip == LIST_FIRST(&indirdep->ir_donehd)) + panic("disk_write_complete: not gone"); + chgs = 1; + } + } + /* + * Preserve the indirdep if there were any changes or if it is not + * yet valid on disk. + */ + if (chgs) { + stat_indir_blk_ptrs++; + bdirty(bp); + return (1); + } + /* + * If there were no changes we can discard the savedbp and detach + * ourselves from the buf. We are only carrying completed pointers + * in this case. + */ + sbp = indirdep->ir_savebp; + sbp->b_flags |= B_INVAL | B_NOCACHE; + indirdep->ir_savebp = NULL; + indirdep->ir_bp = NULL; + if (*bpp != NULL) + panic("handle_written_indirdep: bp already exists."); + *bpp = sbp; + /* + * The indirdep may not be freed until its parent points at it. + */ + if (indirdep->ir_state & DEPCOMPLETE) + free_indirdep(indirdep); + + return (0); +} + +/* + * Process a diradd entry after its dependent inode has been written. + * This routine must be called with splbio interrupts blocked. + */ +static void +diradd_inode_written(dap, inodedep) + struct diradd *dap; + struct inodedep *inodedep; +{ + + dap->da_state |= COMPLETE; + complete_diradd(dap); + WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); +} + +/* + * Returns true if the bmsafemap will have rollbacks when written. Must only + * be called with the per-filesystem lock and the buf lock on the cg held. 
+ */ +static int +bmsafemap_backgroundwrite(bmsafemap, bp) + struct bmsafemap *bmsafemap; + struct buf *bp; +{ + int dirty; + + LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); + dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | + !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); + /* + * If we're initiating a background write we need to process the + * rollbacks as they exist now, not as they exist when IO starts. + * No other consumers will look at the contents of the shadowed + * buf so this is safe to do here. + */ + if (bp->b_xflags & BX_BKGRDMARKER) + initiate_write_bmsafemap(bmsafemap, bp); + + return (dirty); +} + +/* + * Re-apply an allocation when a cg write is complete. + */ +static int +jnewblk_rollforward(jnewblk, fs, cgp, blksfree) + struct jnewblk *jnewblk; + struct fs *fs; + struct cg *cgp; + uint8_t *blksfree; +{ + ufs1_daddr_t fragno; + ufs2_daddr_t blkno; + long cgbno, bbase; + int frags, blk; + int i; + + frags = 0; + cgbno = dtogd(fs, jnewblk->jn_blkno); + for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { + if (isclr(blksfree, cgbno + i)) + panic("jnewblk_rollforward: re-allocated fragment"); + frags++; + } + if (frags == fs->fs_frag) { + blkno = fragstoblks(fs, cgbno); + ffs_clrblock(fs, blksfree, (long)blkno); + ffs_clusteracct(fs, cgp, blkno, -1); + cgp->cg_cs.cs_nbfree--; + } else { + bbase = cgbno - fragnum(fs, cgbno); + cgbno += jnewblk->jn_oldfrags; + /* If a complete block had been reassembled, account for it. */ + fragno = fragstoblks(fs, bbase); + if (ffs_isblock(fs, blksfree, fragno)) { + cgp->cg_cs.cs_nffree += fs->fs_frag; + ffs_clusteracct(fs, cgp, fragno, -1); + cgp->cg_cs.cs_nbfree--; + } + /* Decrement the old frags. */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, -1); + /* Allocate the fragment */ + for (i = 0; i < frags; i++) + clrbit(blksfree, cgbno + i); + cgp->cg_cs.cs_nffree -= frags; + /* Add back in counts associated with the new frags */ + blk = blkmap(fs, blksfree, bbase); + ffs_fragacct(fs, blk, cgp->cg_frsum, 1); + } + return (frags); +} + +/* + * Complete a write to a bmsafemap structure. Roll forward any bitmap + * changes if it's not a background write. Set all written dependencies + * to DEPCOMPLETE and free the structure if possible. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. + */ +static int +handle_written_bmsafemap(bmsafemap, bp, flags) + struct bmsafemap *bmsafemap; + struct buf *bp; + int flags; +{ + struct newblk *newblk; + struct inodedep *inodedep; + struct jaddref *jaddref, *jatmp; + struct jnewblk *jnewblk, *jntmp; + struct ufsmount *ump; + uint8_t *inosused; + uint8_t *blksfree; + struct cg *cgp; + struct fs *fs; + ino_t ino; + int foreground; + int chgs; + + if ((bmsafemap->sm_state & IOSTARTED) == 0) + panic("handle_written_bmsafemap: Not started\n"); + ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); + chgs = 0; + bmsafemap->sm_state &= ~IOSTARTED; + foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; + /* + * If write was successful, release journal work that was waiting + * on the write. Otherwise move the work back. + */ + if (flags & WRITESUCCEEDED) + handle_jwork(&bmsafemap->sm_freewr); + else + LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, + worklist, wk_list); + + /* + * Restore unwritten inode allocation pending jaddref writes. 
+ */ + if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + inosused = cg_inosused(cgp); + LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, + ja_bmdeps, jatmp) { + if ((jaddref->ja_state & UNDONE) == 0) + continue; + ino = jaddref->ja_ino % fs->fs_ipg; + if (isset(inosused, ino)) + panic("handle_written_bmsafemap: " + "re-allocated inode"); + /* Do the roll-forward only if it's a real copy. */ + if (foreground) { + if ((jaddref->ja_mode & IFMT) == IFDIR) + cgp->cg_cs.cs_ndir++; + cgp->cg_cs.cs_nifree--; + setbit(inosused, ino); + chgs = 1; + } + jaddref->ja_state &= ~UNDONE; + jaddref->ja_state |= ATTACHED; + free_jaddref(jaddref); + } + } + /* + * Restore any block allocations which are pending journal writes. + */ + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { + cgp = (struct cg *)bp->b_data; + fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; + blksfree = cg_blksfree(cgp); + LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, + jntmp) { + if ((jnewblk->jn_state & UNDONE) == 0) + continue; + /* Do the roll-forward only if it's a real copy. */ + if (foreground && + jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) + chgs = 1; + jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); + jnewblk->jn_state |= ATTACHED; + free_jnewblk(jnewblk); + } + } + /* + * If the write did not succeed, we have done all the roll-forward + * operations, but we cannot take the actions that will allow its + * dependencies to be processed. + */ + if ((flags & WRITESUCCEEDED) == 0) { + LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, + newblk, nb_deps); + LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, + worklist, wk_list); + if (foreground) + bdirty(bp); + return (1); + } + while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { + newblk->nb_state |= DEPCOMPLETE; + newblk->nb_state &= ~ONDEPLIST; + newblk->nb_bmsafemap = NULL; + LIST_REMOVE(newblk, nb_deps); + if (newblk->nb_list.wk_type == D_ALLOCDIRECT) + handle_allocdirect_partdone( + WK_ALLOCDIRECT(&newblk->nb_list), NULL); + else if (newblk->nb_list.wk_type == D_ALLOCINDIR) + handle_allocindir_partdone( + WK_ALLOCINDIR(&newblk->nb_list)); + else if (newblk->nb_list.wk_type != D_NEWBLK) + panic("handle_written_bmsafemap: Unexpected type: %s", + TYPENAME(newblk->nb_list.wk_type)); + } + while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { + inodedep->id_state |= DEPCOMPLETE; + inodedep->id_state &= ~ONDEPLIST; + LIST_REMOVE(inodedep, id_deps); + inodedep->id_bmsafemap = NULL; + } + LIST_REMOVE(bmsafemap, sm_next); + if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && + LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && + LIST_EMPTY(&bmsafemap->sm_newblkhd) && + LIST_EMPTY(&bmsafemap->sm_inodedephd) && + LIST_EMPTY(&bmsafemap->sm_freehd)) { + LIST_REMOVE(bmsafemap, sm_hash); + WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); + return (0); + } + LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); + if (foreground) + bdirty(bp); + return (1); +} + +/* + * Try to free a mkdir dependency. 
+ */ +static void +complete_mkdir(mkdir) + struct mkdir *mkdir; +{ + struct diradd *dap; + + if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) + return; + LIST_REMOVE(mkdir, md_mkdirs); + dap = mkdir->md_diradd; + dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); + if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { + dap->da_state |= DEPCOMPLETE; + complete_diradd(dap); + } + WORKITEM_FREE(mkdir, D_MKDIR); +} + +/* + * Handle the completion of a mkdir dependency. + */ +static void +handle_written_mkdir(mkdir, type) + struct mkdir *mkdir; + int type; +{ + + if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) + panic("handle_written_mkdir: bad type"); + mkdir->md_state |= COMPLETE; + complete_mkdir(mkdir); +} + +static int +free_pagedep(pagedep) + struct pagedep *pagedep; +{ + int i; + + if (pagedep->pd_state & NEWBLOCK) + return (0); + if (!LIST_EMPTY(&pagedep->pd_dirremhd)) + return (0); + for (i = 0; i < DAHASHSZ; i++) + if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) + return (0); + if (!LIST_EMPTY(&pagedep->pd_pendinghd)) + return (0); + if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) + return (0); + if (pagedep->pd_state & ONWORKLIST) + WORKLIST_REMOVE(&pagedep->pd_list); + LIST_REMOVE(pagedep, pd_hash); + WORKITEM_FREE(pagedep, D_PAGEDEP); + + return (1); +} + +/* + * Called from within softdep_disk_write_complete above. + * A write operation was just completed. Removed inodes can + * now be freed and associated block pointers may be committed. + * Note that this routine is always called from interrupt level + * with further interrupts from this device blocked. + * + * If the write did not succeed, we will do all the roll-forward + * operations, but we will not take the actions that will allow its + * dependencies to be processed. + */ +static int +handle_written_filepage(pagedep, bp, flags) + struct pagedep *pagedep; + struct buf *bp; /* buffer containing the written page */ + int flags; +{ + struct dirrem *dirrem; + struct diradd *dap, *nextdap; + struct direct *ep; + int i, chgs; + + if ((pagedep->pd_state & IOSTARTED) == 0) + panic("handle_written_filepage: not started"); + pagedep->pd_state &= ~IOSTARTED; + if ((flags & WRITESUCCEEDED) == 0) + goto rollforward; + /* + * Process any directory removals that have been committed. + */ + while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { + LIST_REMOVE(dirrem, dm_next); + dirrem->dm_state |= COMPLETE; + dirrem->dm_dirinum = pagedep->pd_ino; + KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), + ("handle_written_filepage: Journal entries not written.")); + add_to_worklist(&dirrem->dm_list, 0); + } + /* + * Free any directory additions that have been committed. + * If it is a newly allocated block, we have to wait until + * the on-disk directory inode claims the new block. + */ + if ((pagedep->pd_state & NEWBLOCK) == 0) + while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) + free_diradd(dap, NULL); +rollforward: + /* + * Uncommitted directory entries must be restored. + */ + for (chgs = 0, i = 0; i < DAHASHSZ; i++) { + for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; + dap = nextdap) { + nextdap = LIST_NEXT(dap, da_pdlist); + if (dap->da_state & ATTACHED) + panic("handle_written_filepage: attached"); + ep = (struct direct *) + ((char *)bp->b_data + dap->da_offset); + ep->d_ino = dap->da_newinum; + dap->da_state &= ~UNDONE; + dap->da_state |= ATTACHED; + chgs = 1; + /* + * If the inode referenced by the directory has + * been written out, then the dependency can be + * moved to the pending list. 
+ */
+ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
+ LIST_REMOVE(dap, da_pdlist);
+ LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
+ da_pdlist);
+ }
+ }
+ }
+ /*
+ * If there were any rollbacks in the directory, then it must be
+ * marked dirty so that it will eventually get written back in
+ * its correct form.
+ */
+ if (chgs || (flags & WRITESUCCEEDED) == 0) {
+ if ((bp->b_flags & B_DELWRI) == 0)
+ stat_dir_entry++;
+ bdirty(bp);
+ return (1);
+ }
+ /*
+ * If we are not waiting for a new directory block to be
+ * claimed by its inode, then the pagedep will be freed.
+ * Otherwise it will remain to track any new entries on
+ * the page in case they are fsync'ed.
+ */
+ free_pagedep(pagedep);
+ return (0);
+}
+
+/*
+ * Writing back in-core inode structures.
+ *
+ * The filesystem only accesses an inode's contents when it occupies an
+ * "in-core" inode structure. These "in-core" structures are separate from
+ * the page frames used to cache inode blocks. Only the latter are
+ * transferred to/from the disk. So, when the updated contents of the
+ * "in-core" inode structure are copied to the corresponding in-memory inode
+ * block, the dependencies are also transferred. The following procedure is
+ * called when copying a dirty "in-core" inode to a cached inode block.
+ */
+
+/*
+ * Called when an inode is loaded from disk. If the effective link count
+ * differed from the actual link count when it was last flushed, then we
+ * need to ensure that the correct effective link count is put back.
+ */
+void
+softdep_load_inodeblock(ip)
+ struct inode *ip; /* the "in_core" copy of the inode */
+{
+ struct inodedep *inodedep;
+ struct ufsmount *ump;
+
+ ump = ITOUMP(ip);
+ KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
+ ("softdep_load_inodeblock called on non-softdep filesystem"));
+ /*
+ * Check for alternate nlink count.
+ */
+ ip->i_effnlink = ip->i_nlink;
+ ACQUIRE_LOCK(ump);
+ if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
+ FREE_LOCK(ump);
+ return;
+ }
+ ip->i_effnlink -= inodedep->id_nlinkdelta;
+ FREE_LOCK(ump);
+}
+
+/*
+ * This routine is called just before the "in-core" inode
+ * information is to be copied to the in-memory inode block.
+ * Recall that an inode block contains several inodes. If
+ * the force flag is set, then the dependencies will be
+ * cleared so that the update can always be made. Note that
+ * the buffer is locked when this routine is called, so we
+ * will never be in the middle of writing the inode block
+ * to disk.
+ */
+void
+softdep_update_inodeblock(ip, bp, waitfor)
+ struct inode *ip; /* the "in_core" copy of the inode */
+ struct buf *bp; /* the buffer containing the inode block */
+ int waitfor; /* nonzero => update must be allowed */
+{
+ struct inodedep *inodedep;
+ struct inoref *inoref;
+ struct ufsmount *ump;
+ struct worklist *wk;
+ struct mount *mp;
+ struct buf *ibp;
+ struct fs *fs;
+ int error;
+
+ ump = ITOUMP(ip);
+ mp = UFSTOVFS(ump);
+ KASSERT(MOUNTEDSOFTDEP(mp) != 0,
+ ("softdep_update_inodeblock called on non-softdep filesystem"));
+ fs = ump->um_fs;
+ /*
+ * Preserve the freelink that is on disk. clear_unlinked_inodedep()
+ * does not have access to the in-core ip so must write directly into
+ * the inode block buffer when setting freelink.
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC) + DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number))->di_freelink); + else + DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + + ino_to_fsbo(fs, ip->i_number))->di_freelink); + /* + * If the effective link count is not equal to the actual link + * count, then we must track the difference in an inodedep while + * the inode is (potentially) tossed out of the cache. Otherwise, + * if there is no existing inodedep, then there are no dependencies + * to track. + */ + ACQUIRE_LOCK(ump); +again: + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { + FREE_LOCK(ump); + if (ip->i_effnlink != ip->i_nlink) + panic("softdep_update_inodeblock: bad link count"); + return; + } + if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) + panic("softdep_update_inodeblock: bad delta"); + /* + * If we're flushing all dependencies we must also move any waiting + * for journal writes onto the bufwait list prior to I/O. + */ + if (waitfor) { + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list, MNT_WAIT); + goto again; + } + } + } + /* + * Changes have been initiated. Anything depending on these + * changes cannot occur until this inode has been written. + */ + inodedep->id_state &= ~COMPLETE; + if ((inodedep->id_state & ONWORKLIST) == 0) + WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); + /* + * Any new dependencies associated with the incore inode must + * now be moved to the list associated with the buffer holding + * the in-memory copy of the inode. Once merged process any + * allocdirects that are completed by the merger. + */ + merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); + if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), + NULL); + merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); + if (!TAILQ_EMPTY(&inodedep->id_extupdt)) + handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), + NULL); + /* + * Now that the inode has been pushed into the buffer, the + * operations dependent on the inode being written to disk + * can be moved to the id_bufwait so that they will be + * processed when the buffer I/O completes. + */ + while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&inodedep->id_bufwait, wk); + } + /* + * Newly allocated inodes cannot be written until the bitmap + * that allocates them have been written (indicated by + * DEPCOMPLETE being set in id_state). If we are doing a + * forced sync (e.g., an fsync on a file), we force the bitmap + * to be written so that the update can be done. + */ + if (waitfor == 0) { + FREE_LOCK(ump); + return; + } +retry: + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { + FREE_LOCK(ump); + return; + } + ibp = inodedep->id_bmsafemap->sm_buf; + ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); + if (ibp == NULL) { + /* + * If ibp came back as NULL, the dependency could have been + * freed while we slept. Look it up again, and check to see + * that it has completed. 
+ */
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
+ goto retry;
+ FREE_LOCK(ump);
+ return;
+ }
+ FREE_LOCK(ump);
+ if ((error = bwrite(ibp)) != 0)
+ softdep_error("softdep_update_inodeblock: bwrite", error);
+}
+
+/*
+ * Merge a new inode dependency list (such as id_newinoupdt) into an
+ * old inode dependency list (such as id_inoupdt). This routine must be
+ * called with splbio interrupts blocked.
+ */
+static void
+merge_inode_lists(newlisthead, oldlisthead)
+ struct allocdirectlst *newlisthead;
+ struct allocdirectlst *oldlisthead;
+{
+ struct allocdirect *listadp, *newadp;
+
+ newadp = TAILQ_FIRST(newlisthead);
+ for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
+ if (listadp->ad_offset < newadp->ad_offset) {
+ listadp = TAILQ_NEXT(listadp, ad_next);
+ continue;
+ }
+ TAILQ_REMOVE(newlisthead, newadp, ad_next);
+ TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
+ if (listadp->ad_offset == newadp->ad_offset) {
+ allocdirect_merge(oldlisthead, newadp,
+ listadp);
+ listadp = newadp;
+ }
+ newadp = TAILQ_FIRST(newlisthead);
+ }
+ while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
+ TAILQ_REMOVE(newlisthead, newadp, ad_next);
+ TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
+ }
+}
+
+/*
+ * If we are doing an fsync, then we must ensure that any directory
+ * entries for the inode have been written after the inode gets to disk.
+ */
+int
+softdep_fsync(vp)
+ struct vnode *vp; /* the "in_core" copy of the inode */
+{
+ struct inodedep *inodedep;
+ struct pagedep *pagedep;
+ struct inoref *inoref;
+ struct ufsmount *ump;
+ struct worklist *wk;
+ struct diradd *dap;
+ struct mount *mp;
+ struct vnode *pvp;
+ struct inode *ip;
+ struct buf *bp;
+ struct fs *fs;
+ struct thread *td = curthread;
+ int error, flushparent, pagedep_new_block;
+ ino_t parentino;
+ ufs_lbn_t lbn;
+
+ ip = VTOI(vp);
+ mp = vp->v_mount;
+ ump = VFSTOUFS(mp);
+ fs = ump->um_fs;
+ if (MOUNTEDSOFTDEP(mp) == 0)
+ return (0);
+ ACQUIRE_LOCK(ump);
+restart:
+ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
+ FREE_LOCK(ump);
+ return (0);
+ }
+ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
+ if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
+ == DEPCOMPLETE) {
+ jwait(&inoref->if_list, MNT_WAIT);
+ goto restart;
+ }
+ }
+ if (!LIST_EMPTY(&inodedep->id_inowait) ||
+ !TAILQ_EMPTY(&inodedep->id_extupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
+ !TAILQ_EMPTY(&inodedep->id_newinoupdt))
+ panic("softdep_fsync: pending ops %p", inodedep);
+ for (error = 0, flushparent = 0; ; ) {
+ if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
+ break;
+ if (wk->wk_type != D_DIRADD)
+ panic("softdep_fsync: Unexpected type %s",
+ TYPENAME(wk->wk_type));
+ dap = WK_DIRADD(wk);
+ /*
+ * Flush our parent if this directory entry has a MKDIR_PARENT
+ * dependency or is contained in a newly allocated block.
+ */
+ if (dap->da_state & DIRCHG)
+ pagedep = dap->da_previous->dm_pagedep;
+ else
+ pagedep = dap->da_pagedep;
+ parentino = pagedep->pd_ino;
+ lbn = pagedep->pd_lbn;
+ if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
+ panic("softdep_fsync: dirty");
+ if ((dap->da_state & MKDIR_PARENT) ||
+ (pagedep->pd_state & NEWBLOCK))
+ flushparent = 1;
+ else
+ flushparent = 0;
+ /*
+ * If we are being fsync'ed as part of vgone'ing this vnode,
+ * then we will not be able to release and recover the
+ * vnode below, so we just have to give up on writing its
+ * directory entry out.
It will eventually be written, just + * not now, but then the user was not asking to have it + * written, so we are not breaking any promises. + */ + if (vp->v_iflag & VI_DOOMED) + break; + /* + * We prevent deadlock by always fetching inodes from the + * root, moving down the directory tree. Thus, when fetching + * our parent directory, we first try to get the lock. If + * that fails, we must unlock ourselves before requesting + * the lock on our parent. See the comment in ufs_lookup + * for details on possible races. + */ + FREE_LOCK(ump); + if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, + FFSV_FORCEINSMQ)) { + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) { + vfs_ref(mp); + VOP_UNLOCK(vp, 0); + error = vfs_busy(mp, 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vfs_rel(mp); + if (error != 0) + return (ENOENT); + if (vp->v_iflag & VI_DOOMED) { + vfs_unbusy(mp); + return (ENOENT); + } + } + VOP_UNLOCK(vp, 0); + error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, + &pvp, FFSV_FORCEINSMQ); + vfs_unbusy(mp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + if (vp->v_iflag & VI_DOOMED) { + if (error == 0) + vput(pvp); + error = ENOENT; + } + if (error != 0) + return (error); + } + /* + * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps + * that are contained in direct blocks will be resolved by + * doing a ffs_update. Pagedeps contained in indirect blocks + * may require a complete sync'ing of the directory. So, we + * try the cheap and fast ffs_update first, and if that fails, + * then we do the slower ffs_syncvnode of the directory. + */ + if (flushparent) { + int locked; + + if ((error = ffs_update(pvp, 1)) != 0) { + vput(pvp); + return (error); + } + ACQUIRE_LOCK(ump); + locked = 1; + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { + if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { + if (wk->wk_type != D_DIRADD) + panic("softdep_fsync: Unexpected type %s", + TYPENAME(wk->wk_type)); + dap = WK_DIRADD(wk); + if (dap->da_state & DIRCHG) + pagedep = dap->da_previous->dm_pagedep; + else + pagedep = dap->da_pagedep; + pagedep_new_block = pagedep->pd_state & NEWBLOCK; + FREE_LOCK(ump); + locked = 0; + if (pagedep_new_block && (error = + ffs_syncvnode(pvp, MNT_WAIT, 0))) { + vput(pvp); + return (error); + } + } + } + if (locked) + FREE_LOCK(ump); + } + /* + * Flush directory page containing the inode's name. + */ + error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, + &bp); + if (error == 0) + error = bwrite(bp); + else + brelse(bp); + vput(pvp); + if (error != 0) + return (error); + ACQUIRE_LOCK(ump); + if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) + break; + } + FREE_LOCK(ump); + return (0); +} + +/* + * Flush all the dirty bitmaps associated with the block device + * before flushing the rest of the dirty blocks so as to reduce + * the number of dependencies that will have to be rolled back. + * + * XXX Unused? + */ +void +softdep_fsync_mountdev(vp) + struct vnode *vp; +{ + struct buf *bp, *nbp; + struct worklist *wk; + struct bufobj *bo; + + if (!vn_isdisk(vp, NULL)) + panic("softdep_fsync_mountdev: vnode not a disk"); + bo = &vp->v_bufobj; +restart: + BO_LOCK(bo); + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + /* + * If it is already scheduled, skip to the next buffer. + */ + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) + continue; + + if ((bp->b_flags & B_DELWRI) == 0) + panic("softdep_fsync_mountdev: not dirty"); + /* + * We are only interested in bitmaps with outstanding + * dependencies. 
+ */
+ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
+ wk->wk_type != D_BMSAFEMAP ||
+ (bp->b_vflags & BV_BKGRDINPROG)) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
+ BO_UNLOCK(bo);
+ bremfree(bp);
+ (void) bawrite(bp);
+ goto restart;
+ }
+ drain_output(vp);
+ BO_UNLOCK(bo);
+}
+
+/*
+ * Sync all cylinder groups that were dirty at the time this function is
+ * called. Newly dirtied cgs will be inserted before the sentinel. This
+ * is used to flush freedep activity that may be holding up writes to an
+ * indirect block.
+ */
+static int
+sync_cgs(mp, waitfor)
+ struct mount *mp;
+ int waitfor;
+{
+ struct bmsafemap *bmsafemap;
+ struct bmsafemap *sentinel;
+ struct ufsmount *ump;
+ struct buf *bp;
+ int error;
+
+ sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
+ sentinel->sm_cg = -1;
+ ump = VFSTOUFS(mp);
+ error = 0;
+ ACQUIRE_LOCK(ump);
+ LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
+ for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
+ bmsafemap = LIST_NEXT(sentinel, sm_next)) {
+ /* Skip sentinels and cgs with no work to release. */
+ if (bmsafemap->sm_cg == -1 ||
+ (LIST_EMPTY(&bmsafemap->sm_freehd) &&
+ LIST_EMPTY(&bmsafemap->sm_freewr))) {
+ LIST_REMOVE(sentinel, sm_next);
+ LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
+ continue;
+ }
+ /*
+ * If we don't get the lock and we're waiting try again, if
+ * not move on to the next buf and try to sync it.
+ */
+ bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
+ if (bp == NULL && waitfor == MNT_WAIT)
+ continue;
+ LIST_REMOVE(sentinel, sm_next);
+ LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
+ if (bp == NULL)
+ continue;
+ FREE_LOCK(ump);
+ if (waitfor == MNT_NOWAIT)
+ bawrite(bp);
+ else
+ error = bwrite(bp);
+ ACQUIRE_LOCK(ump);
+ if (error)
+ break;
+ }
+ LIST_REMOVE(sentinel, sm_next);
+ FREE_LOCK(ump);
+ free(sentinel, M_BMSAFEMAP);
+ return (error);
+}
+
+/*
+ * This routine is called when we are trying to synchronously flush a
+ * file. This routine must eliminate any filesystem metadata dependencies
+ * so that the syncing routine can succeed.
+ */
+int
+softdep_sync_metadata(struct vnode *vp)
+{
+ struct inode *ip;
+ int error;
+
+ ip = VTOI(vp);
+ KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
+ ("softdep_sync_metadata called on non-softdep filesystem"));
+ /*
+ * Ensure that any direct block dependencies have been cleared,
+ * truncations are started, and inode references are journaled.
+ */
+ ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
+ /*
+ * Write all journal records to prevent rollbacks on devvp.
+ */
+ if (vp->v_type == VCHR)
+ softdep_flushjournal(vp->v_mount);
+ error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
+ /*
+ * Ensure that all truncates are written so we won't find deps on
+ * indirect blocks.
+ */
+ process_truncates(vp);
+ FREE_LOCK(VFSTOUFS(vp->v_mount));
+
+ return (error);
+}
+
+/*
+ * This routine is called when we are attempting to sync a buf with
+ * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any
+ * other IO it can but returns EBUSY if the buffer is not yet able to
+ * be written. Dependencies which will not cause rollbacks will always
+ * return 0.
+ */ +int +softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) +{ + struct indirdep *indirdep; + struct pagedep *pagedep; + struct allocindir *aip; + struct newblk *newblk; + struct ufsmount *ump; + struct buf *nbp; + struct worklist *wk; + int i, error; + + KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, + ("softdep_sync_buf called on non-softdep filesystem")); + /* + * For VCHR we just don't want to force flush any dependencies that + * will cause rollbacks. + */ + if (vp->v_type == VCHR) { + if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) + return (EBUSY); + return (0); + } + ump = VFSTOUFS(vp->v_mount); + ACQUIRE_LOCK(ump); + /* + * As we hold the buffer locked, none of its dependencies + * will disappear. + */ + error = 0; +top: + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + switch (wk->wk_type) { + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk != NULL) { + if (waitfor == MNT_NOWAIT) { + error = EBUSY; + goto out_unlock; + } + jwait(&newblk->nb_jnewblk->jn_list, waitfor); + goto top; + } + if (newblk->nb_state & DEPCOMPLETE || + waitfor == MNT_NOWAIT) + continue; + nbp = newblk->nb_bmsafemap->sm_buf; + nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); + if (nbp == NULL) + goto top; + FREE_LOCK(ump); + if ((error = bwrite(nbp)) != 0) + goto out; + ACQUIRE_LOCK(ump); + continue; + + case D_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + if (waitfor == MNT_NOWAIT) { + if (!TAILQ_EMPTY(&indirdep->ir_trunc) || + !LIST_EMPTY(&indirdep->ir_deplisthd)) { + error = EBUSY; + goto out_unlock; + } + } + if (!TAILQ_EMPTY(&indirdep->ir_trunc)) + panic("softdep_sync_buf: truncation pending."); + restart: + LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { + newblk = (struct newblk *)aip; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list, + waitfor); + goto restart; + } + if (newblk->nb_state & DEPCOMPLETE) + continue; + nbp = newblk->nb_bmsafemap->sm_buf; + nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); + if (nbp == NULL) + goto restart; + FREE_LOCK(ump); + if ((error = bwrite(nbp)) != 0) + goto out; + ACQUIRE_LOCK(ump); + goto restart; + } + continue; + + case D_PAGEDEP: + /* + * Only flush directory entries in synchronous passes. + */ + if (waitfor != MNT_WAIT) { + error = EBUSY; + goto out_unlock; + } + /* + * While syncing snapshots, we must allow recursive + * lookups. + */ + BUF_AREC(bp); + /* + * We are trying to sync a directory that may + * have dependencies on both its own metadata + * and/or dependencies on the inodes of any + * recently allocated files. We walk its diradd + * lists pushing out the associated inode. + */ + pagedep = WK_PAGEDEP(wk); + for (i = 0; i < DAHASHSZ; i++) { + if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) + continue; + if ((error = flush_pagedep_deps(vp, wk->wk_mp, + &pagedep->pd_diraddhd[i]))) { + BUF_NOREC(bp); + goto out_unlock; + } + } + BUF_NOREC(bp); + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: + case D_JNEWBLK: + continue; + + default: + panic("softdep_sync_buf: Unknown type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } +out_unlock: + FREE_LOCK(ump); +out: + return (error); +} + +/* + * Flush the dependencies associated with an inodedep. + * Called with splbio blocked. + */ +static int +flush_inodedep_deps(vp, mp, ino) + struct vnode *vp; + struct mount *mp; + ino_t ino; +{ + struct inodedep *inodedep; + struct inoref *inoref; + struct ufsmount *ump; + int error, waitfor; + + /* + * This work is done in two passes. 
The first pass grabs most + * of the buffers and begins asynchronously writing them. The + * only way to wait for these asynchronous writes is to sleep + * on the filesystem vnode which may stay busy for a long time + * if the filesystem is active. So, instead, we make a second + * pass over the dependencies blocking on each write. In the + * usual case we will be blocking against a write that we + * initiated, so when it is done the dependency will have been + * resolved. Thus the second pass is expected to end quickly. + * We give a brief window at the top of the loop to allow + * any pending I/O to complete. + */ + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + for (error = 0, waitfor = MNT_NOWAIT; ; ) { + if (error) + return (error); + FREE_LOCK(ump); + ACQUIRE_LOCK(ump); +restart: + if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) + return (0); + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list, MNT_WAIT); + goto restart; + } + } + if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || + flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || + flush_deplist(&inodedep->id_extupdt, waitfor, &error) || + flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) + continue; + /* + * If pass2, we are done, otherwise do pass 2. + */ + if (waitfor == MNT_WAIT) + break; + waitfor = MNT_WAIT; + } + /* + * Try freeing inodedep in case all dependencies have been removed. + */ + if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) + (void) free_inodedep(inodedep); + return (0); +} + +/* + * Flush an inode dependency list. + * Called with splbio blocked. + */ +static int +flush_deplist(listhead, waitfor, errorp) + struct allocdirectlst *listhead; + int waitfor; + int *errorp; +{ + struct allocdirect *adp; + struct newblk *newblk; + struct ufsmount *ump; + struct buf *bp; + + if ((adp = TAILQ_FIRST(listhead)) == NULL) + return (0); + ump = VFSTOUFS(adp->ad_list.wk_mp); + LOCK_OWNED(ump); + TAILQ_FOREACH(adp, listhead, ad_next) { + newblk = (struct newblk *)adp; + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); + return (1); + } + if (newblk->nb_state & DEPCOMPLETE) + continue; + bp = newblk->nb_bmsafemap->sm_buf; + bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); + if (bp == NULL) { + if (waitfor == MNT_NOWAIT) + continue; + return (1); + } + FREE_LOCK(ump); + if (waitfor == MNT_NOWAIT) + bawrite(bp); + else + *errorp = bwrite(bp); + ACQUIRE_LOCK(ump); + return (1); + } + return (0); +} + +/* + * Flush dependencies associated with an allocdirect block. + */ +static int +flush_newblk_dep(vp, mp, lbn) + struct vnode *vp; + struct mount *mp; + ufs_lbn_t lbn; +{ + struct newblk *newblk; + struct ufsmount *ump; + struct bufobj *bo; + struct inode *ip; + struct buf *bp; + ufs2_daddr_t blkno; + int error; + + error = 0; + bo = &vp->v_bufobj; + ip = VTOI(vp); + blkno = DIP(ip, i_db[lbn]); + if (blkno == 0) + panic("flush_newblk_dep: Missing block"); + ump = VFSTOUFS(mp); + ACQUIRE_LOCK(ump); + /* + * Loop until all dependencies related to this block are satisfied. + * We must be careful to restart after each sleep in case a write + * completes some part of this process for us. + */ + for (;;) { + if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { + FREE_LOCK(ump); + break; + } + if (newblk->nb_list.wk_type != D_ALLOCDIRECT) + panic("flush_newblk_deps: Bad newblk %p", newblk); + /* + * Flush the journal. 
+ */ + if (newblk->nb_jnewblk != NULL) { + jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); + continue; + } + /* + * Write the bitmap dependency. + */ + if ((newblk->nb_state & DEPCOMPLETE) == 0) { + bp = newblk->nb_bmsafemap->sm_buf; + bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); + if (bp == NULL) + continue; + FREE_LOCK(ump); + error = bwrite(bp); + if (error) + break; + ACQUIRE_LOCK(ump); + continue; + } + /* + * Write the buffer. + */ + FREE_LOCK(ump); + BO_LOCK(bo); + bp = gbincore(bo, lbn); + if (bp != NULL) { + error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | + LK_INTERLOCK, BO_LOCKPTR(bo)); + if (error == ENOLCK) { + ACQUIRE_LOCK(ump); + error = 0; + continue; /* Slept, retry */ + } + if (error != 0) + break; /* Failed */ + if (bp->b_flags & B_DELWRI) { + bremfree(bp); + error = bwrite(bp); + if (error) + break; + } else + BUF_UNLOCK(bp); + } else + BO_UNLOCK(bo); + /* + * We have to wait for the direct pointers to + * point at the newdirblk before the dependency + * will go away. + */ + error = ffs_update(vp, 1); + if (error) + break; + ACQUIRE_LOCK(ump); + } + return (error); +} + +/* + * Eliminate a pagedep dependency by flushing out all its diradd dependencies. + * Called with splbio blocked. + */ +static int +flush_pagedep_deps(pvp, mp, diraddhdp) + struct vnode *pvp; + struct mount *mp; + struct diraddhd *diraddhdp; +{ + struct inodedep *inodedep; + struct inoref *inoref; + struct ufsmount *ump; + struct diradd *dap; + struct vnode *vp; + int error = 0; + struct buf *bp; + ino_t inum; + struct diraddhd unfinished; + + LIST_INIT(&unfinished); + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); +restart: + while ((dap = LIST_FIRST(diraddhdp)) != NULL) { + /* + * Flush ourselves if this directory entry + * has a MKDIR_PARENT dependency. + */ + if (dap->da_state & MKDIR_PARENT) { + FREE_LOCK(ump); + if ((error = ffs_update(pvp, 1)) != 0) + break; + ACQUIRE_LOCK(ump); + /* + * If that cleared dependencies, go on to next. + */ + if (dap != LIST_FIRST(diraddhdp)) + continue; + /* + * All MKDIR_PARENT dependencies and all the + * NEWBLOCK pagedeps that are contained in direct + * blocks were resolved by doing above ffs_update. + * Pagedeps contained in indirect blocks may + * require a complete sync'ing of the directory. + * We are in the midst of doing a complete sync, + * so if they are not resolved in this pass we + * defer them for now as they will be sync'ed by + * our caller shortly. + */ + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); + continue; + } + /* + * A newly allocated directory must have its "." and + * ".." entries written out before its name can be + * committed in its parent. + */ + inum = dap->da_newinum; + if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode1"); + /* + * Wait for any pending journal adds to complete so we don't + * cause rollbacks while syncing. + */ + TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { + if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) + == DEPCOMPLETE) { + jwait(&inoref->if_list, MNT_WAIT); + goto restart; + } + } + if (dap->da_state & MKDIR_BODY) { + FREE_LOCK(ump); + if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ))) + break; + error = flush_newblk_dep(vp, mp, 0); + /* + * If we still have the dependency we might need to + * update the vnode to sync the new link count to + * disk. 
+ */ + if (error == 0 && dap == LIST_FIRST(diraddhdp)) + error = ffs_update(vp, 1); + vput(vp); + if (error != 0) + break; + ACQUIRE_LOCK(ump); + /* + * If that cleared dependencies, go on to next. + */ + if (dap != LIST_FIRST(diraddhdp)) + continue; + if (dap->da_state & MKDIR_BODY) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, + &inodedep); + panic("flush_pagedep_deps: MKDIR_BODY " + "inodedep %p dap %p vp %p", + inodedep, dap, vp); + } + } + /* + * Flush the inode on which the directory entry depends. + * Having accounted for MKDIR_PARENT and MKDIR_BODY above, + * the only remaining dependency is that the updated inode + * count must get pushed to disk. The inode has already + * been pushed into its inode buffer (via VOP_UPDATE) at + * the time of the reference count change. So we need only + * locate that buffer, ensure that there will be no rollback + * caused by a bitmap dependency, then write the inode buffer. + */ +retry: + if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) + panic("flush_pagedep_deps: lost inode"); + /* + * If the inode still has bitmap dependencies, + * push them to disk. + */ + if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { + bp = inodedep->id_bmsafemap->sm_buf; + bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); + if (bp == NULL) + goto retry; + FREE_LOCK(ump); + if ((error = bwrite(bp)) != 0) + break; + ACQUIRE_LOCK(ump); + if (dap != LIST_FIRST(diraddhdp)) + continue; + } + /* + * If the inode is still sitting in a buffer waiting + * to be written or waiting for the link count to be + * adjusted update it here to flush it to disk. + */ + if (dap == LIST_FIRST(diraddhdp)) { + FREE_LOCK(ump); + if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ))) + break; + error = ffs_update(vp, 1); + vput(vp); + if (error) + break; + ACQUIRE_LOCK(ump); + } + /* + * If we have failed to get rid of all the dependencies + * then something is seriously wrong. + */ + if (dap == LIST_FIRST(diraddhdp)) { + inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); + panic("flush_pagedep_deps: failed to flush " + "inodedep %p ino %ju dap %p", + inodedep, (uintmax_t)inum, dap); + } + } + if (error) + ACQUIRE_LOCK(ump); + while ((dap = LIST_FIRST(&unfinished)) != NULL) { + LIST_REMOVE(dap, da_pdlist); + LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); + } + return (error); +} + +/* + * A large burst of file addition or deletion activity can drive the + * memory load excessively high. First attempt to slow things down + * using the techniques below. If that fails, this routine requests + * the offending operations to fall back to running synchronously + * until the memory load returns to a reasonable level. + */ +int +softdep_slowdown(vp) + struct vnode *vp; +{ + struct ufsmount *ump; + int jlow; + int max_softdeps_hard; + + KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, + ("softdep_slowdown called on non-softdep filesystem")); + ump = VFSTOUFS(vp->v_mount); + ACQUIRE_LOCK(ump); + jlow = 0; + /* + * Check for journal space if needed. + */ + if (DOINGSUJ(vp)) { + if (journal_space(ump, 0) == 0) + jlow = 1; + } + /* + * If the system is under its limits and our filesystem is + * not responsible for more than our share of the usage and + * we are not low on journal space, then no need to slow down. 
+ */ + max_softdeps_hard = max_softdeps * 11 / 10; + if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && + dep_current[D_INODEDEP] < max_softdeps_hard && + dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && + dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && + ump->softdep_curdeps[D_DIRREM] < + (max_softdeps_hard / 2) / stat_flush_threads && + ump->softdep_curdeps[D_INODEDEP] < + max_softdeps_hard / stat_flush_threads && + ump->softdep_curdeps[D_INDIRDEP] < + (max_softdeps_hard / 1000) / stat_flush_threads && + ump->softdep_curdeps[D_FREEBLKS] < + max_softdeps_hard / stat_flush_threads) { + FREE_LOCK(ump); + return (0); + } + /* + * If the journal is low or our filesystem is over its limit + * then speedup the cleanup. + */ + if (ump->softdep_curdeps[D_INDIRDEP] < + (max_softdeps_hard / 1000) / stat_flush_threads || jlow) + softdep_speedup(ump); + stat_sync_limit_hit += 1; + FREE_LOCK(ump); + /* + * We only slow down the rate at which new dependencies are + * generated if we are not using journaling. With journaling, + * the cleanup should always be sufficient to keep things + * under control. + */ + if (DOINGSUJ(vp)) + return (0); + return (1); +} + +/* + * Called by the allocation routines when they are about to fail + * in the hope that we can free up the requested resource (inodes + * or disk space). + * + * First check to see if the work list has anything on it. If it has, + * clean up entries until we successfully free the requested resource. + * Because this process holds inodes locked, we cannot handle any remove + * requests that might block on a locked inode as that could lead to + * deadlock. If the worklist yields none of the requested resource, + * start syncing out vnodes to free up the needed space. + */ +int +softdep_request_cleanup(fs, vp, cred, resource) + struct fs *fs; + struct vnode *vp; + struct ucred *cred; + int resource; +{ + struct ufsmount *ump; + struct mount *mp; + long starttime; + ufs2_daddr_t needed; + int error, failed_vnode; + + /* + * If we are being called because of a process doing a + * copy-on-write, then it is not safe to process any + * worklist items as we will recurse into the copyonwrite + * routine. This will result in an incoherent snapshot. + * If the vnode that we hold is a snapshot, we must avoid + * handling other resources that could cause deadlock. + */ + if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) + return (0); + + if (resource == FLUSH_BLOCKS_WAIT) + stat_cleanup_blkrequests += 1; + else + stat_cleanup_inorequests += 1; + + mp = vp->v_mount; + ump = VFSTOUFS(mp); + mtx_assert(UFS_MTX(ump), MA_OWNED); + UFS_UNLOCK(ump); + error = ffs_update(vp, 1); + if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { + UFS_LOCK(ump); + return (0); + } + /* + * If we are in need of resources, start by cleaning up + * any block removals associated with our inode. + */ + ACQUIRE_LOCK(ump); + process_removes(vp); + process_truncates(vp); + FREE_LOCK(ump); + /* + * Now clean up at least as many resources as we will need. + * + * When requested to clean up inodes, the number that are needed + * is set by the number of simultaneous writers (mnt_writeopcount) + * plus a bit of slop (2) in case some more writers show up while + * we are cleaning. + * + * When requested to free up space, the amount of space that + * we need is enough blocks to allocate a full-sized segment + * (fs_contigsumsize). 
The number of such segments that will
+ * be needed is set by the number of simultaneous writers
+ * (mnt_writeopcount) plus a bit of slop (2) in case some more
+ * writers show up while we are cleaning.
+ *
+ * Additionally, if we are unprivileged and allocating space,
+ * we need to ensure that we clean up enough blocks to get the
+ * needed number of blocks over the threshold of the minimum
+ * number of blocks required to be kept free by the filesystem
+ * (fs_minfree).
+ */
+ if (resource == FLUSH_INODES_WAIT) {
+ needed = vp->v_mount->mnt_writeopcount + 2;
+ } else if (resource == FLUSH_BLOCKS_WAIT) {
+ needed = (vp->v_mount->mnt_writeopcount + 2) *
+ fs->fs_contigsumsize;
+ if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
+ needed += fragstoblks(fs,
+ roundup((fs->fs_dsize * fs->fs_minfree / 100) -
+ fs->fs_cstotal.cs_nffree, fs->fs_frag));
+ } else {
+ UFS_LOCK(ump);
+ printf("softdep_request_cleanup: Unknown resource type %d\n",
+ resource);
+ return (0);
+ }
+ starttime = time_second;
+retry:
+ if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
+ fs->fs_cstotal.cs_nbfree <= needed) ||
+ (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
+ fs->fs_cstotal.cs_nifree <= needed)) {
+ ACQUIRE_LOCK(ump);
+ if (ump->softdep_on_worklist > 0 &&
+ process_worklist_item(UFSTOVFS(ump),
+ ump->softdep_on_worklist, LK_NOWAIT) != 0)
+ stat_worklist_push += 1;
+ FREE_LOCK(ump);
+ }
+ /*
+ * If we still need resources and there are no more worklist
+ * entries to process to obtain them, we have to start flushing
+ * the dirty vnodes to force the release of additional requests
+ * to the worklist that we can then process to reap additional
+ * resources. We walk the vnodes associated with the mount point
+ * until we get the needed worklist requests that we can reap.
+ *
+ * If there are several threads all needing to clean the same
+ * mount point, only one is allowed to walk the mount list.
+ * When several threads all try to walk the same mount list,
+ * they end up competing with each other and often end up in
+ * livelock. This approach ensures that forward progress is
+ * made at the cost of occasional ENOSPC errors being returned
+ * that might otherwise have been avoided.
+ */
+ error = 1;
+ if ((resource == FLUSH_BLOCKS_WAIT &&
+ fs->fs_cstotal.cs_nbfree <= needed) ||
+ (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
+ fs->fs_cstotal.cs_nifree <= needed)) {
+ ACQUIRE_LOCK(ump);
+ if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
+ ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
+ FREE_LOCK(ump);
+ failed_vnode = softdep_request_cleanup_flush(mp, ump);
+ ACQUIRE_LOCK(ump);
+ ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
+ FREE_LOCK(ump);
+ if (ump->softdep_on_worklist > 0) {
+ stat_cleanup_retries += 1;
+ if (!failed_vnode)
+ goto retry;
+ }
+ } else {
+ FREE_LOCK(ump);
+ error = 0;
+ }
+ stat_cleanup_failures += 1;
+ }
+ if (time_second - starttime > stat_cleanup_high_delay)
+ stat_cleanup_high_delay = time_second - starttime;
+ UFS_LOCK(ump);
+ return (error);
+}
+
+/*
+ * Scan the vnodes for the specified mount point flushing out any
+ * vnodes that can be locked without waiting. Finally, try to flush
+ * the device associated with the mount point if it can be locked
+ * without waiting.
+ *
+ * We return 0 if we were able to lock every vnode in our scan.
+ * If we had to skip one or more vnodes, we return 1.
+ */ +static int +softdep_request_cleanup_flush(mp, ump) + struct mount *mp; + struct ufsmount *ump; +{ + struct thread *td; + struct vnode *lvp, *mvp; + int failed_vnode; + + failed_vnode = 0; + td = curthread; + MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { + if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { + VI_UNLOCK(lvp); + continue; + } + if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, + td) != 0) { + failed_vnode = 1; + continue; + } + if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ + vput(lvp); + continue; + } + (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); + vput(lvp); + } + lvp = ump->um_devvp; + if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { + VOP_FSYNC(lvp, MNT_NOWAIT, td); + VOP_UNLOCK(lvp, 0); + } + return (failed_vnode); +} + +static bool +softdep_excess_items(struct ufsmount *ump, int item) +{ + + KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); + return (dep_current[item] > max_softdeps && + ump->softdep_curdeps[item] > max_softdeps / + stat_flush_threads); +} + +static void +schedule_cleanup(struct mount *mp) +{ + struct ufsmount *ump; + struct thread *td; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + FREE_LOCK(ump); + td = curthread; + if ((td->td_pflags & TDP_KTHREAD) != 0 && + (td->td_proc->p_flag2 & P2_AST_SU) == 0) { + /* + * No ast is delivered to kernel threads, so nobody + * would deref the mp. Some kernel threads + * explicitely check for AST, e.g. NFS daemon does + * this in the serving loop. + */ + return; + } + if (td->td_su != NULL) + vfs_rel(td->td_su); + vfs_ref(mp); + td->td_su = mp; + thread_lock(td); + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); +} + +static void +softdep_ast_cleanup_proc(struct thread *td) +{ + struct mount *mp; + struct ufsmount *ump; + int error; + bool req; + + while ((mp = td->td_su) != NULL) { + td->td_su = NULL; + error = vfs_busy(mp, MBF_NOWAIT); + vfs_rel(mp); + if (error != 0) + return; + if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { + ump = VFSTOUFS(mp); + for (;;) { + req = false; + ACQUIRE_LOCK(ump); + if (softdep_excess_items(ump, D_INODEDEP)) { + req = true; + request_cleanup(mp, FLUSH_INODES); + } + if (softdep_excess_items(ump, D_DIRREM)) { + req = true; + request_cleanup(mp, FLUSH_BLOCKS); + } + FREE_LOCK(ump); + if (softdep_excess_items(ump, D_NEWBLK) || + softdep_excess_items(ump, D_ALLOCDIRECT) || + softdep_excess_items(ump, D_ALLOCINDIR)) { + error = vn_start_write(NULL, &mp, + V_WAIT); + if (error == 0) { + req = true; + VFS_SYNC(mp, MNT_WAIT); + vn_finished_write(mp); + } + } + if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) + break; + } + } + vfs_unbusy(mp); + } + if ((mp = td->td_su) != NULL) { + td->td_su = NULL; + vfs_rel(mp); + } +} + +/* + * If memory utilization has gotten too high, deliberately slow things + * down and speed up the I/O processing. + */ +static int +request_cleanup(mp, resource) + struct mount *mp; + int resource; +{ + struct thread *td = curthread; + struct ufsmount *ump; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + /* + * We never hold up the filesystem syncer or buf daemon. + */ + if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) + return (0); + /* + * First check to see if the work list has gotten backlogged. + * If it has, co-opt this process to help clean up two entries. + * Because this process may hold inodes locked, we cannot + * handle any remove requests that might block on a locked + * inode as that could lead to deadlock. We set TDP_SOFTDEP + * to avoid recursively processing the worklist. 
+ */ + if (ump->softdep_on_worklist > max_softdeps / 10) { + td->td_pflags |= TDP_SOFTDEP; + process_worklist_item(mp, 2, LK_NOWAIT); + td->td_pflags &= ~TDP_SOFTDEP; + stat_worklist_push += 2; + return(1); + } + /* + * Next, we attempt to speed up the syncer process. If that + * is successful, then we allow the process to continue. + */ + if (softdep_speedup(ump) && + resource != FLUSH_BLOCKS_WAIT && + resource != FLUSH_INODES_WAIT) + return(0); + /* + * If we are resource constrained on inode dependencies, try + * flushing some dirty inodes. Otherwise, we are constrained + * by file deletions, so try accelerating flushes of directories + * with removal dependencies. We would like to do the cleanup + * here, but we probably hold an inode locked at this point and + * that might deadlock against one that we try to clean. So, + * the best that we can do is request the syncer daemon to do + * the cleanup for us. + */ + switch (resource) { + + case FLUSH_INODES: + case FLUSH_INODES_WAIT: + ACQUIRE_GBLLOCK(&lk); + stat_ino_limit_push += 1; + req_clear_inodedeps += 1; + FREE_GBLLOCK(&lk); + stat_countp = &stat_ino_limit_hit; + break; + + case FLUSH_BLOCKS: + case FLUSH_BLOCKS_WAIT: + ACQUIRE_GBLLOCK(&lk); + stat_blk_limit_push += 1; + req_clear_remove += 1; + FREE_GBLLOCK(&lk); + stat_countp = &stat_blk_limit_hit; + break; + + default: + panic("request_cleanup: unknown type"); + } + /* + * Hopefully the syncer daemon will catch up and awaken us. + * We wait at most tickdelay before proceeding in any case. + */ + ACQUIRE_GBLLOCK(&lk); + FREE_LOCK(ump); + proc_waiting += 1; + if (callout_pending(&softdep_callout) == FALSE) + callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, + pause_timer, 0); + + if ((td->td_pflags & TDP_KTHREAD) == 0) + msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); + proc_waiting -= 1; + FREE_GBLLOCK(&lk); + ACQUIRE_LOCK(ump); + return (1); +} + +/* + * Awaken processes pausing in request_cleanup and clear proc_waiting + * to indicate that there is no longer a timer running. Pause_timer + * will be called with the global softdep mutex (&lk) locked. + */ +static void +pause_timer(arg) + void *arg; +{ + + GBLLOCK_OWNED(&lk); + /* + * The callout_ API has acquired mtx and will hold it around this + * function call. + */ + *stat_countp += proc_waiting; + wakeup(&proc_waiting); +} + +/* + * If requested, try removing inode or removal dependencies. + */ +static void +check_clear_deps(mp) + struct mount *mp; +{ + + /* + * If we are suspended, it may be because of our using + * too many inodedeps, so help clear them out. + */ + if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended) + clear_inodedeps(mp); + /* + * General requests for cleanup of backed up dependencies + */ + ACQUIRE_GBLLOCK(&lk); + if (req_clear_inodedeps) { + req_clear_inodedeps -= 1; + FREE_GBLLOCK(&lk); + clear_inodedeps(mp); + ACQUIRE_GBLLOCK(&lk); + wakeup(&proc_waiting); + } + if (req_clear_remove) { + req_clear_remove -= 1; + FREE_GBLLOCK(&lk); + clear_remove(mp); + ACQUIRE_GBLLOCK(&lk); + wakeup(&proc_waiting); + } + FREE_GBLLOCK(&lk); +} + +/* + * Flush out a directory with at least one removal dependency in an effort to + * reduce the number of dirrem, freefile, and freeblks dependency structures. 
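+ * The scan position (pagedep_nextclean) lives in the ufsmount, so + * successive calls rotate through the pagedep hash buckets instead of + * repeatedly flushing the same one.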
+ */ +static void +clear_remove(mp) + struct mount *mp; +{ + struct pagedep_hashhead *pagedephd; + struct pagedep *pagedep; + struct ufsmount *ump; + struct vnode *vp; + struct bufobj *bo; + int error, cnt; + ino_t ino; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + + for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) { + pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++]; + if (ump->pagedep_nextclean > ump->pagedep_hash_size) + ump->pagedep_nextclean = 0; + LIST_FOREACH(pagedep, pagedephd, pd_hash) { + if (LIST_EMPTY(&pagedep->pd_dirremhd)) + continue; + ino = pagedep->pd_ino; + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) + continue; + FREE_LOCK(ump); + + /* + * Let unmount clear deps + */ + error = vfs_busy(mp, MBF_NOWAIT); + if (error != 0) + goto finish_write; + error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ); + vfs_unbusy(mp); + if (error != 0) { + softdep_error("clear_remove: vget", error); + goto finish_write; + } + if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) + softdep_error("clear_remove: fsync", error); + bo = &vp->v_bufobj; + BO_LOCK(bo); + drain_output(vp); + BO_UNLOCK(bo); + vput(vp); + finish_write: + vn_finished_write(mp); + ACQUIRE_LOCK(ump); + return; + } + } +} + +/* + * Clear out a block of dirty inodes in an effort to reduce + * the number of inodedep dependency structures. + */ +static void +clear_inodedeps(mp) + struct mount *mp; +{ + struct inodedep_hashhead *inodedephd; + struct inodedep *inodedep; + struct ufsmount *ump; + struct vnode *vp; + struct fs *fs; + int error, cnt; + ino_t firstino, lastino, ino; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + LOCK_OWNED(ump); + /* + * Pick a random inode dependency to be cleared. + * We will then gather up all the inodes in its block + * that have dependencies and flush them out. + */ + for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) { + inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++]; + if (ump->inodedep_nextclean > ump->inodedep_hash_size) + ump->inodedep_nextclean = 0; + if ((inodedep = LIST_FIRST(inodedephd)) != NULL) + break; + } + if (inodedep == NULL) + return; + /* + * Find the last inode in the block with dependencies. + */ + firstino = rounddown2(inodedep->id_ino, INOPB(fs)); + for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) + if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) + break; + /* + * Asynchronously push all but the last inode with dependencies. + * Synchronously push the last inode with dependencies to ensure + * that the inode block gets written to free up the inodedeps. 
+ */ + for (ino = firstino; ino <= lastino; ino++) { + if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) + continue; + if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) + continue; + FREE_LOCK(ump); + error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ + if (error != 0) { + vn_finished_write(mp); + ACQUIRE_LOCK(ump); + return; + } + if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, + FFSV_FORCEINSMQ)) != 0) { + softdep_error("clear_inodedeps: vget", error); + vfs_unbusy(mp); + vn_finished_write(mp); + ACQUIRE_LOCK(ump); + return; + } + vfs_unbusy(mp); + if (ino == lastino) { + if ((error = ffs_syncvnode(vp, MNT_WAIT, 0))) + softdep_error("clear_inodedeps: fsync1", error); + } else { + if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) + softdep_error("clear_inodedeps: fsync2", error); + BO_LOCK(&vp->v_bufobj); + drain_output(vp); + BO_UNLOCK(&vp->v_bufobj); + } + vput(vp); + vn_finished_write(mp); + ACQUIRE_LOCK(ump); + } +} + +void +softdep_buf_append(bp, wkhd) + struct buf *bp; + struct workhead *wkhd; +{ + struct worklist *wk; + struct ufsmount *ump; + + if ((wk = LIST_FIRST(wkhd)) == NULL) + return; + KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, + ("softdep_buf_append called on non-softdep filesystem")); + ump = VFSTOUFS(wk->wk_mp); + ACQUIRE_LOCK(ump); + while ((wk = LIST_FIRST(wkhd)) != NULL) { + WORKLIST_REMOVE(wk); + WORKLIST_INSERT(&bp->b_dep, wk); + } + FREE_LOCK(ump); + +} + +void +softdep_inode_append(ip, cred, wkhd) + struct inode *ip; + struct ucred *cred; + struct workhead *wkhd; +{ + struct buf *bp; + struct fs *fs; + struct ufsmount *ump; + int error; + + ump = ITOUMP(ip); + KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, + ("softdep_inode_append called on non-softdep filesystem")); + fs = ump->um_fs; + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, cred, &bp); + if (error) { + bqrelse(bp); + softdep_freework(wkhd); + return; + } + softdep_buf_append(bp, wkhd); + bqrelse(bp); +} + +void +softdep_freework(wkhd) + struct workhead *wkhd; +{ + struct worklist *wk; + struct ufsmount *ump; + + if ((wk = LIST_FIRST(wkhd)) == NULL) + return; + KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, + ("softdep_freework called on non-softdep filesystem")); + ump = VFSTOUFS(wk->wk_mp); + ACQUIRE_LOCK(ump); + handle_jwork(wkhd); + FREE_LOCK(ump); +} + +static struct ufsmount * +softdep_bp_to_mp(bp) + struct buf *bp; +{ + struct mount *mp; + struct vnode *vp; + + if (LIST_EMPTY(&bp->b_dep)) + return (NULL); + vp = bp->b_vp; + + /* + * The ump mount point is stable after we get a correct + * pointer, since bp is locked and this prevents unmount from + * proceeding. But to get to it, we cannot dereference bp->b_dep + * head wk_mp, because we do not yet own SU ump lock and + * workitem might be freed while dereferenced. + */ +retry: + if (vp->v_type == VCHR) { + VI_LOCK(vp); + mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL; + VI_UNLOCK(vp); + if (mp == NULL) + goto retry; + } else if (vp->v_type == VREG || vp->v_type == VDIR || + vp->v_type == VLNK) { + mp = vp->v_mount; + } else { + return (NULL); + } + return (VFSTOUFS(mp)); +} + +/* + * Function to determine if the buffer has outstanding dependencies + * that will cause a roll-back if the buffer is written. If wantcount + * is set, return number of dependencies, otherwise just yes or no. 
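+ * The b_dep list is walked with the per-mount softdep lock held; a + * buffer with nothing on its b_dep list trivially reports zero.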
+ */ +static int +softdep_count_dependencies(bp, wantcount) + struct buf *bp; + int wantcount; +{ + struct worklist *wk; + struct ufsmount *ump; + struct bmsafemap *bmsafemap; + struct freework *freework; + struct inodedep *inodedep; + struct indirdep *indirdep; + struct freeblks *freeblks; + struct allocindir *aip; + struct pagedep *pagedep; + struct dirrem *dirrem; + struct newblk *newblk; + struct mkdir *mkdir; + struct diradd *dap; + int i, retval; + + ump = softdep_bp_to_mp(bp); + if (ump == NULL) + return (0); + retval = 0; + ACQUIRE_LOCK(ump); + LIST_FOREACH(wk, &bp->b_dep, wk_list) { + switch (wk->wk_type) { + + case D_INODEDEP: + inodedep = WK_INODEDEP(wk); + if ((inodedep->id_state & DEPCOMPLETE) == 0) { + /* bitmap allocation dependency */ + retval += 1; + if (!wantcount) + goto out; + } + if (TAILQ_FIRST(&inodedep->id_inoupdt)) { + /* direct block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } + if (TAILQ_FIRST(&inodedep->id_extupdt)) { + /* direct block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } + if (TAILQ_FIRST(&inodedep->id_inoreflst)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_INDIRDEP: + indirdep = WK_INDIRDEP(wk); + + TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { + /* indirect truncation dependency */ + retval += 1; + if (!wantcount) + goto out; + } + + LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { + /* indirect block pointer dependency */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_PAGEDEP: + pagedep = WK_PAGEDEP(wk); + LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { + if (LIST_FIRST(&dirrem->dm_jremrefhd)) { + /* Journal remove ref dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + } + for (i = 0; i < DAHASHSZ; i++) { + + LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { + /* directory entry dependency */ + retval += 1; + if (!wantcount) + goto out; + } + } + continue; + + case D_BMSAFEMAP: + bmsafemap = WK_BMSAFEMAP(wk); + if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { + /* Add reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { + /* Allocate block dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEBLKS: + freeblks = WK_FREEBLKS(wk); + if (LIST_FIRST(&freeblks->fb_jblkdephd)) { + /* Freeblk journal dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_ALLOCDIRECT: + case D_ALLOCINDIR: + newblk = WK_NEWBLK(wk); + if (newblk->nb_jnewblk) { + /* Journal allocate dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_MKDIR: + mkdir = WK_MKDIR(wk); + if (mkdir->md_jaddref) { + /* Journal reference dependency. */ + retval += 1; + if (!wantcount) + goto out; + } + continue; + + case D_FREEWORK: + case D_FREEDEP: + case D_JSEGDEP: + case D_JSEG: + case D_SBDEP: + /* never a dependency on these blocks */ + continue; + + default: + panic("softdep_count_dependencies: Unexpected type %s", + TYPENAME(wk->wk_type)); + /* NOTREACHED */ + } + } +out: + FREE_LOCK(ump); + return (retval); +} + +/* + * Acquire exclusive access to a buffer. + * Must be called with a locked mtx parameter. + * Return acquired buffer or NULL on failure. 
+ */ +static struct buf * +getdirtybuf(bp, lock, waitfor) + struct buf *bp; + struct rwlock *lock; + int waitfor; +{ + int error; + + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { + if (waitfor != MNT_WAIT) + return (NULL); + error = BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); + /* + * Even if we successfully acquire bp here, we have dropped + * lock, which may violates our guarantee. + */ + if (error == 0) + BUF_UNLOCK(bp); + else if (error != ENOLCK) + panic("getdirtybuf: inconsistent lock: %d", error); + rw_wlock(lock); + return (NULL); + } + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) { + rw_wunlock(lock); + BO_LOCK(bp->b_bufobj); + BUF_UNLOCK(bp); + if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { + bp->b_vflags |= BV_BKGRDWAIT; + msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), + PRIBIO | PDROP, "getbuf", 0); + } else + BO_UNLOCK(bp->b_bufobj); + rw_wlock(lock); + return (NULL); + } + BUF_UNLOCK(bp); + if (waitfor != MNT_WAIT) + return (NULL); +#ifdef DEBUG_VFS_LOCKS + if (bp->b_vp->v_type != VCHR) + ASSERT_BO_WLOCKED(bp->b_bufobj); +#endif + bp->b_vflags |= BV_BKGRDWAIT; + rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0); + return (NULL); + } + if ((bp->b_flags & B_DELWRI) == 0) { + BUF_UNLOCK(bp); + return (NULL); + } + bremfree(bp); + return (bp); +} + + +/* + * Check if it is safe to suspend the file system now. On entry, + * the vnode interlock for devvp should be held. Return 0 with + * the mount interlock held if the file system can be suspended now, + * otherwise return EAGAIN with the mount interlock held. + */ +int +softdep_check_suspend(struct mount *mp, + struct vnode *devvp, + int softdep_depcnt, + int softdep_accdepcnt, + int secondary_writes, + int secondary_accwrites) +{ + struct bufobj *bo; + struct ufsmount *ump; + struct inodedep *inodedep; + int error, unlinked; + + bo = &devvp->v_bufobj; + ASSERT_BO_WLOCKED(bo); + + /* + * If we are not running with soft updates, then we need only + * deal with secondary writes as we try to suspend. + */ + if (MOUNTEDSOFTDEP(mp) == 0) { + MNT_ILOCK(mp); + while (mp->mnt_secondary_writes != 0) { + BO_UNLOCK(bo); + msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), + (PUSER - 1) | PDROP, "secwr", 0); + BO_LOCK(bo); + MNT_ILOCK(mp); + } + + /* + * Reasons for needing more work before suspend: + * - Dirty buffers on devvp. + * - Secondary writes occurred after start of vnode sync loop + */ + error = 0; + if (bo->bo_numoutput > 0 || + bo->bo_dirty.bv_cnt > 0 || + secondary_writes != 0 || + mp->mnt_secondary_writes != 0 || + secondary_accwrites != mp->mnt_secondary_accwrites) + error = EAGAIN; + BO_UNLOCK(bo); + return (error); + } + + /* + * If we are running with soft updates, then we need to coordinate + * with them as we try to suspend. 
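+ * The loop below retries until the softdep lock can be taken without + * sleeping while no secondary writes are in progress.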
+ */ + ump = VFSTOUFS(mp); + for (;;) { + if (!TRY_ACQUIRE_LOCK(ump)) { + BO_UNLOCK(bo); + ACQUIRE_LOCK(ump); + FREE_LOCK(ump); + BO_LOCK(bo); + continue; + } + MNT_ILOCK(mp); + if (mp->mnt_secondary_writes != 0) { + FREE_LOCK(ump); + BO_UNLOCK(bo); + msleep(&mp->mnt_secondary_writes, + MNT_MTX(mp), + (PUSER - 1) | PDROP, "secwr", 0); + BO_LOCK(bo); + continue; + } + break; + } + + unlinked = 0; + if (MOUNTEDSUJ(mp)) { + for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked); + inodedep != NULL; + inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { + if ((inodedep->id_state & (UNLINKED | UNLINKLINKS | + UNLINKONLIST)) != (UNLINKED | UNLINKLINKS | + UNLINKONLIST) || + !check_inodedep_free(inodedep)) + continue; + unlinked++; + } + } + + /* + * Reasons for needing more work before suspend: + * - Dirty buffers on devvp. + * - Softdep activity occurred after start of vnode sync loop + * - Secondary writes occurred after start of vnode sync loop + */ + error = 0; + if (bo->bo_numoutput > 0 || + bo->bo_dirty.bv_cnt > 0 || + softdep_depcnt != unlinked || + ump->softdep_deps != unlinked || + softdep_accdepcnt != ump->softdep_accdeps || + secondary_writes != 0 || + mp->mnt_secondary_writes != 0 || + secondary_accwrites != mp->mnt_secondary_accwrites) + error = EAGAIN; + FREE_LOCK(ump); + BO_UNLOCK(bo); + return (error); +} + + +/* + * Get the number of dependency structures for the file system, both + * the current number and the total number allocated. These will + * later be used to detect that softdep processing has occurred. + */ +void +softdep_get_depcounts(struct mount *mp, + int *softdep_depsp, + int *softdep_accdepsp) +{ + struct ufsmount *ump; + + if (MOUNTEDSOFTDEP(mp) == 0) { + *softdep_depsp = 0; + *softdep_accdepsp = 0; + return; + } + ump = VFSTOUFS(mp); + ACQUIRE_LOCK(ump); + *softdep_depsp = ump->softdep_deps; + *softdep_accdepsp = ump->softdep_accdeps; + FREE_LOCK(ump); +} + +/* + * Wait for pending output on a vnode to complete. + */ +static void +drain_output(vp) + struct vnode *vp; +{ + + ASSERT_VOP_LOCKED(vp, "drain_output"); + (void)bufobj_wwait(&vp->v_bufobj, 0, 0); +} + +/* + * Called whenever a buffer that is being invalidated or reallocated + * contains dependencies. This should only happen if an I/O error has + * occurred. The routine is called with the buffer locked. + */ +static void +softdep_deallocate_dependencies(bp) + struct buf *bp; +{ + + if ((bp->b_ioflags & BIO_ERROR) == 0) + panic("softdep_deallocate_dependencies: dangling deps"); + if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL) + softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); + else + printf("softdep_deallocate_dependencies: " + "got error %d while accessing filesystem\n", bp->b_error); + if (bp->b_error != ENXIO) + panic("softdep_deallocate_dependencies: unrecovered I/O error"); +} + +/* + * Function to handle asynchronous write errors in the filesystem. + */ +static void +softdep_error(func, error) + char *func; + int error; +{ + + /* XXX should do something better! 
*/ + printf("%s: got error %d while accessing filesystem\n", func, error); +} + +#ifdef DDB + +static void +inodedep_print(struct inodedep *inodedep, int verbose) +{ + db_printf("%p fs %p st %x ino %jd inoblk %jd delta %jd nlink %jd" + " saveino %p\n", + inodedep, inodedep->id_fs, inodedep->id_state, + (intmax_t)inodedep->id_ino, + (intmax_t)fsbtodb(inodedep->id_fs, + ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), + (intmax_t)inodedep->id_nlinkdelta, + (intmax_t)inodedep->id_savednlink, + inodedep->id_savedino1); + + if (verbose == 0) + return; + + db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " + "mkdiradd %p\n", + LIST_FIRST(&inodedep->id_pendinghd), + LIST_FIRST(&inodedep->id_bufwait), + LIST_FIRST(&inodedep->id_inowait), + TAILQ_FIRST(&inodedep->id_inoreflst), + inodedep->id_mkdiradd); + db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", + TAILQ_FIRST(&inodedep->id_inoupdt), + TAILQ_FIRST(&inodedep->id_newinoupdt), + TAILQ_FIRST(&inodedep->id_extupdt), + TAILQ_FIRST(&inodedep->id_newextupdt)); +} + +DB_SHOW_COMMAND(inodedep, db_show_inodedep) +{ + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + inodedep_print((struct inodedep*)addr, 1); +} + +DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) +{ + struct inodedep_hashhead *inodedephd; + struct inodedep *inodedep; + struct ufsmount *ump; + int cnt; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + ump = (struct ufsmount *)addr; + for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) { + inodedephd = &ump->inodedep_hashtbl[cnt]; + LIST_FOREACH(inodedep, inodedephd, id_hash) { + inodedep_print(inodedep, 0); + } + } +} + +DB_SHOW_COMMAND(worklist, db_show_worklist) +{ + struct worklist *wk; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wk = (struct worklist *)addr; + printf("worklist: %p type %s state 0x%X\n", + wk, TYPENAME(wk->wk_type), wk->wk_state); +} + +DB_SHOW_COMMAND(workhead, db_show_workhead) +{ + struct workhead *wkhd; + struct worklist *wk; + int i; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + wkhd = (struct workhead *)addr; + wk = LIST_FIRST(wkhd); + for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) + db_printf("worklist: %p type %s state 0x%X", + wk, TYPENAME(wk->wk_type), wk->wk_state); + if (i == 100) + db_printf("workhead overflow"); + printf("\n"); +} + + +DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) +{ + struct mkdirlist *mkdirlisthd; + struct jaddref *jaddref; + struct diradd *diradd; + struct mkdir *mkdir; + + if (have_addr == 0) { + db_printf("Address required\n"); + return; + } + mkdirlisthd = (struct mkdirlist *)addr; + LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) { + diradd = mkdir->md_diradd; + db_printf("mkdir: %p state 0x%X dap %p state 0x%X", + mkdir, mkdir->md_state, diradd, diradd->da_state); + if ((jaddref = mkdir->md_jaddref) != NULL) + db_printf(" jaddref %p jaddref state 0x%X", + jaddref, jaddref->ja_state); + db_printf("\n"); + } +} + +/* exported to ffs_vfsops.c */ +extern void db_print_ffs(struct ufsmount *ump); +void +db_print_ffs(struct ufsmount *ump) +{ + db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n", + ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, + ump->um_devvp, ump->um_fs, ump->softdep_on_worklist, + ump->softdep_deps, ump->softdep_req); +} + +#endif /* DDB */ + +#endif /* SOFTUPDATES */ diff --git a/Dump/ufs/ffs/ffs_subr.c b/Dump/ufs/ffs/ffs_subr.c new file mode 100644 index 0000000..e75b89f --- /dev/null 
+++ b/Dump/ufs/ffs/ffs_subr.c @@ -0,0 +1,379 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_subr.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include + +#ifndef _KERNEL +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* + * Return buffer with the contents of block "offset" from the beginning of + * directory "ip". If "res" is non-zero, fill it in with a pointer to the + * remaining space in the directory. + */ +int +ffs_blkatoff(vp, offset, res, bpp) + struct vnode *vp; + off_t offset; + char **res; + struct buf **bpp; +{ + struct inode *ip; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn; + int bsize, error; + + ip = VTOI(vp); + fs = ITOFS(ip); + lbn = lblkno(fs, offset); + bsize = blksize(fs, ip, lbn); + + *bpp = NULL; + error = bread(vp, lbn, bsize, NOCRED, &bp); + if (error) { + brelse(bp); + return (error); + } + if (res) + *res = (char *)bp->b_data + blkoff(fs, offset); + *bpp = bp; + return (0); +} + +/* + * Load up the contents of an inode and copy the appropriate pieces + * to the incore copy. 
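+ * Only a handful of commonly used fields (mode, nlink, size, flags, + * gen, uid and gid) are mirrored into the in-core inode; the complete + * on-disk inode is copied into i_din1 or i_din2.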
+ */ +void +ffs_load_inode(bp, ip, fs, ino) + struct buf *bp; + struct inode *ip; + struct fs *fs; + ino_t ino; +{ + + if (I_IS_UFS1(ip)) { + *ip->i_din1 = + *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + ip->i_mode = ip->i_din1->di_mode; + ip->i_nlink = ip->i_din1->di_nlink; + ip->i_size = ip->i_din1->di_size; + ip->i_flags = ip->i_din1->di_flags; + ip->i_gen = ip->i_din1->di_gen; + ip->i_uid = ip->i_din1->di_uid; + ip->i_gid = ip->i_din1->di_gid; + } else { + *ip->i_din2 = + *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); + ip->i_mode = ip->i_din2->di_mode; + ip->i_nlink = ip->i_din2->di_nlink; + ip->i_size = ip->i_din2->di_size; + ip->i_flags = ip->i_din2->di_flags; + ip->i_gen = ip->i_din2->di_gen; + ip->i_uid = ip->i_din2->di_uid; + ip->i_gid = ip->i_din2->di_gid; + } +} +#endif /* KERNEL */ + +/* + * Update the frsum fields to reflect addition or deletion + * of some frags. + */ +void +ffs_fragacct(fs, fragmap, fraglist, cnt) + struct fs *fs; + int fragmap; + int32_t fraglist[]; + int cnt; +{ + int inblk; + int field, subfield; + int siz, pos; + + inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1; + fragmap <<= 1; + for (siz = 1; siz < fs->fs_frag; siz++) { + if ((inblk & (1 << (siz + (fs->fs_frag % NBBY)))) == 0) + continue; + field = around[siz]; + subfield = inside[siz]; + for (pos = siz; pos <= fs->fs_frag; pos++) { + if ((fragmap & field) == subfield) { + fraglist[siz] += cnt; + pos += siz; + field <<= siz; + subfield <<= siz; + } + field <<= 1; + subfield <<= 1; + } + } +} + +/* + * block operations + * + * check if a block is available + */ +int +ffs_isblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + ufs1_daddr_t h; +{ + unsigned char mask; + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0xff); + case 4: + mask = 0x0f << ((h & 0x1) << 2); + return ((cp[h >> 1] & mask) == mask); + case 2: + mask = 0x03 << ((h & 0x3) << 1); + return ((cp[h >> 2] & mask) == mask); + case 1: + mask = 0x01 << (h & 0x7); + return ((cp[h >> 3] & mask) == mask); + default: +#ifdef _KERNEL + panic("ffs_isblock"); +#endif + break; + } + return (0); +} + +/* + * check if a block is free + */ +int +ffs_isfreeblock(fs, cp, h) + struct fs *fs; + u_char *cp; + ufs1_daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + return (cp[h] == 0); + case 4: + return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0); + case 2: + return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0); + case 1: + return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0); + default: +#ifdef _KERNEL + panic("ffs_isfreeblock"); +#endif + break; + } + return (0); +} + +/* + * take a block out of the map + */ +void +ffs_clrblock(fs, cp, h) + struct fs *fs; + u_char *cp; + ufs1_daddr_t h; +{ + + switch ((int)fs->fs_frag) { + case 8: + cp[h] = 0; + return; + case 4: + cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] &= ~(0x01 << (h & 0x7)); + return; + default: +#ifdef _KERNEL + panic("ffs_clrblock"); +#endif + break; + } +} + +/* + * put a block into the map + */ +void +ffs_setblock(fs, cp, h) + struct fs *fs; + unsigned char *cp; + ufs1_daddr_t h; +{ + + switch ((int)fs->fs_frag) { + + case 8: + cp[h] = 0xff; + return; + case 4: + cp[h >> 1] |= (0x0f << ((h & 0x1) << 2)); + return; + case 2: + cp[h >> 2] |= (0x03 << ((h & 0x3) << 1)); + return; + case 1: + cp[h >> 3] |= (0x01 << (h & 0x7)); + return; + default: +#ifdef _KERNEL + panic("ffs_setblock"); +#endif + break; + } +} + +/* + * Update the cluster map 
because of an allocation or free. + * + * Cnt == 1 means free; cnt == -1 means allocating. + */ +void +ffs_clusteracct(fs, cgp, blkno, cnt) + struct fs *fs; + struct cg *cgp; + ufs1_daddr_t blkno; + int cnt; +{ + int32_t *sump; + int32_t *lp; + u_char *freemapp, *mapp; + int i, start, end, forw, back, map, bit; + + if (fs->fs_contigsumsize <= 0) + return; + freemapp = cg_clustersfree(cgp); + sump = cg_clustersum(cgp); + /* + * Allocate or clear the actual block. + */ + if (cnt > 0) + setbit(freemapp, blkno); + else + clrbit(freemapp, blkno); + /* + * Find the size of the cluster going forward. + */ + start = blkno + 1; + end = start + fs->fs_contigsumsize; + if (end >= cgp->cg_nclusterblks) + end = cgp->cg_nclusterblks; + mapp = &freemapp[start / NBBY]; + map = *mapp++; + bit = 1 << (start % NBBY); + for (i = start; i < end; i++) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != (NBBY - 1)) { + bit <<= 1; + } else { + map = *mapp++; + bit = 1; + } + } + forw = i - start; + /* + * Find the size of the cluster going backward. + */ + start = blkno - 1; + end = start - fs->fs_contigsumsize; + if (end < 0) + end = -1; + mapp = &freemapp[start / NBBY]; + map = *mapp--; + bit = 1 << (start % NBBY); + for (i = start; i > end; i--) { + if ((map & bit) == 0) + break; + if ((i & (NBBY - 1)) != 0) { + bit >>= 1; + } else { + map = *mapp--; + bit = 1 << (NBBY - 1); + } + } + back = start - i; + /* + * Account for old cluster and the possibly new forward and + * back clusters. + */ + i = back + forw + 1; + if (i > fs->fs_contigsumsize) + i = fs->fs_contigsumsize; + sump[i] += cnt; + if (back > 0) + sump[back] -= cnt; + if (forw > 0) + sump[forw] -= cnt; + /* + * Update cluster summary information. + */ + lp = &sump[fs->fs_contigsumsize]; + for (i = fs->fs_contigsumsize; i > 0; i--) + if (*lp-- > 0) + break; + fs->fs_maxcluster[cgp->cg_cgx] = i; +} diff --git a/Dump/ufs/ffs/ffs_suspend.c b/Dump/ufs/ffs/ffs_suspend.c new file mode 100644 index 0000000..a714c1f --- /dev/null +++ b/Dump/ufs/ffs/ffs_suspend.c @@ -0,0 +1,337 @@ +/*- + * Copyright (c) 2012 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed by Edward Tomasz Napierala under sponsorship + * from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD: releng/11.2/sys/ufs/ffs/ffs_suspend.c 306165 2016-09-22 08:56:54Z kib $ + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_suspend.c 306165 2016-09-22 08:56:54Z kib $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include +#include + +static d_open_t ffs_susp_open; +static d_write_t ffs_susp_rdwr; +static d_ioctl_t ffs_susp_ioctl; + +static struct cdevsw ffs_susp_cdevsw = { + .d_version = D_VERSION, + .d_open = ffs_susp_open, + .d_read = ffs_susp_rdwr, + .d_write = ffs_susp_rdwr, + .d_ioctl = ffs_susp_ioctl, + .d_name = "ffs_susp", +}; + +static struct cdev *ffs_susp_dev; +static struct sx ffs_susp_lock; + +static int +ffs_susp_suspended(struct mount *mp) +{ + struct ufsmount *ump; + + sx_assert(&ffs_susp_lock, SA_LOCKED); + + ump = VFSTOUFS(mp); + if (ump->um_writesuspended) + return (1); + return (0); +} + +static int +ffs_susp_open(struct cdev *dev __unused, int flags __unused, + int fmt __unused, struct thread *td __unused) +{ + + return (0); +} + +static int +ffs_susp_rdwr(struct cdev *dev, struct uio *uio, int ioflag) +{ + int error, i; + struct vnode *devvp; + struct mount *mp; + struct ufsmount *ump; + struct buf *bp; + void *base; + size_t len; + ssize_t cnt; + struct fs *fs; + + sx_slock(&ffs_susp_lock); + + error = devfs_get_cdevpriv((void **)&mp); + if (error != 0) { + sx_sunlock(&ffs_susp_lock); + return (ENXIO); + } + + ump = VFSTOUFS(mp); + devvp = ump->um_devvp; + fs = ump->um_fs; + + if (ffs_susp_suspended(mp) == 0) { + sx_sunlock(&ffs_susp_lock); + return (ENXIO); + } + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("neither UIO_READ or UIO_WRITE")); + KASSERT(uio->uio_segflg == UIO_USERSPACE, + ("uio->uio_segflg != UIO_USERSPACE")); + + cnt = uio->uio_resid; + + for (i = 0; i < uio->uio_iovcnt; i++) { + while (uio->uio_iov[i].iov_len) { + base = uio->uio_iov[i].iov_base; + len = uio->uio_iov[i].iov_len; + if (len > fs->fs_bsize) + len = fs->fs_bsize; + if (fragoff(fs, uio->uio_offset) != 0 || + fragoff(fs, len) != 0) { + error = EINVAL; + goto out; + } + error = bread(devvp, btodb(uio->uio_offset), len, + NOCRED, &bp); + if (error != 0) + goto out; + if (uio->uio_rw == UIO_WRITE) { + error = copyin(base, bp->b_data, len); + if (error != 0) { + bp->b_flags |= B_INVAL | B_NOCACHE; + brelse(bp); + goto out; + } + error = bwrite(bp); + if (error != 0) + goto out; + } else { + error = copyout(bp->b_data, base, len); + brelse(bp); + if (error != 0) + goto out; + } + uio->uio_iov[i].iov_base = + (char *)uio->uio_iov[i].iov_base + len; + uio->uio_iov[i].iov_len -= len; + uio->uio_resid -= len; + uio->uio_offset += len; + } + } + +out: + sx_sunlock(&ffs_susp_lock); + + if (uio->uio_resid < cnt) + return (0); + + return (error); +} + +static int +ffs_susp_suspend(struct mount *mp) +{ + struct ufsmount *ump; + int error; + + sx_assert(&ffs_susp_lock, SA_XLOCKED); + + if (!ffs_own_mount(mp)) + return (EINVAL); + if (ffs_susp_suspended(mp)) + return (EBUSY); + + ump = VFSTOUFS(mp); + + /* + * Make sure the calling thread is permitted to access the mounted + * device. The permissions can change after we unlock the vnode; + * it's harmless. 
+ */ + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ACCESS(ump->um_devvp, VREAD | VWRITE, + curthread->td_ucred, curthread); + VOP_UNLOCK(ump->um_devvp, 0); + if (error != 0) + return (error); +#ifdef MAC + if (mac_mount_check_stat(curthread->td_ucred, mp) != 0) + return (EPERM); +#endif + + if ((error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT)) != 0) + return (error); + + ump->um_writesuspended = 1; + + return (0); +} + +static void +ffs_susp_dtor(void *data) +{ + struct fs *fs; + struct ufsmount *ump; + struct mount *mp; + int error; + + sx_xlock(&ffs_susp_lock); + + mp = (struct mount *)data; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + + if (ffs_susp_suspended(mp) == 0) { + sx_xunlock(&ffs_susp_lock); + return; + } + + KASSERT((mp->mnt_kern_flag & MNTK_SUSPEND) != 0, + ("MNTK_SUSPEND not set")); + + error = ffs_reload(mp, curthread, FFSR_FORCE | FFSR_UNSUSPEND); + if (error != 0) + panic("failed to unsuspend writes on %s", fs->fs_fsmnt); + + /* + * XXX: The status is kept per-process; the vfs_write_resume() routine + * asserts that the resuming thread is the same one that called + * vfs_write_suspend(). The cdevpriv data, however, is attached + * to the file descriptor, e.g. is inherited during fork. Thus, + * it's possible that the resuming process will be different from + * the one that started the suspension. + * + * Work around by fooling the check in vfs_write_resume(). + */ + mp->mnt_susp_owner = curthread; + + vfs_write_resume(mp, 0); + vfs_unbusy(mp); + ump->um_writesuspended = 0; + + sx_xunlock(&ffs_susp_lock); +} + +static int +ffs_susp_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, + struct thread *td) +{ + struct mount *mp; + fsid_t *fsidp; + int error; + + /* + * No suspend inside the jail. Allowing it would require making + * sure that e.g. the devfs ruleset for that jail permits access + * to the devvp. + */ + if (jailed(td->td_ucred)) + return (EPERM); + + sx_xlock(&ffs_susp_lock); + + switch (cmd) { + case UFSSUSPEND: + fsidp = (fsid_t *)addr; + mp = vfs_getvfs(fsidp); + if (mp == NULL) { + error = ENOENT; + break; + } + error = vfs_busy(mp, 0); + vfs_rel(mp); + if (error != 0) + break; + error = ffs_susp_suspend(mp); + if (error != 0) { + vfs_unbusy(mp); + break; + } + error = devfs_set_cdevpriv(mp, ffs_susp_dtor); + KASSERT(error == 0, ("devfs_set_cdevpriv failed")); + break; + case UFSRESUME: + error = devfs_get_cdevpriv((void **)&mp); + if (error != 0) + break; + /* + * This calls ffs_susp_dtor, which in turn unsuspends the fs. + * The dtor expects to be called without lock held, because + * sometimes it's called from here, and sometimes due to the + * file being closed or process exiting. + */ + sx_xunlock(&ffs_susp_lock); + devfs_clear_cdevpriv(); + return (0); + default: + error = ENXIO; + break; + } + + sx_xunlock(&ffs_susp_lock); + + return (error); +} + +void +ffs_susp_initialize(void) +{ + + sx_init(&ffs_susp_lock, "ffs_susp"); + ffs_susp_dev = make_dev(&ffs_susp_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, + "ufssuspend"); +} + +void +ffs_susp_uninitialize(void) +{ + + destroy_dev(ffs_susp_dev); + sx_destroy(&ffs_susp_lock); +} diff --git a/Dump/ufs/ffs/ffs_tables.c b/Dump/ufs/ffs/ffs_tables.c new file mode 100644 index 0000000..ea4b15b --- /dev/null +++ b/Dump/ufs/ffs/ffs_tables.c @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ffs_tables.c 8.1 (Berkeley) 6/11/93 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_tables.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include +#include +#include + +/* + * Bit patterns for identifying fragments in the block map + * used as ((map & around) == inside) + */ +int around[9] = { + 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff +}; +int inside[9] = { + 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe +}; + +/* + * Given a block map bit pattern, the frag tables tell whether a + * particular size fragment is available. + * + * used as: + * if ((1 << (size - 1)) & fragtbl[fs->fs_frag][map] { + * at least one fragment of the indicated size is available + * } + * + * These tables are used by the scanc instruction on the VAX to + * quickly find an appropriate fragment. 
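+ * For example, with fs_frag == 8 a map byte of 0x07 (three adjacent + * free fragments) yields fragtbl8[0x07] == 0x04, so only the size-3 + * test (1 << 2) succeeds for that byte.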
+ */ +static u_char fragtbl124[256] = { + 0x00, 0x16, 0x16, 0x2a, 0x16, 0x16, 0x26, 0x4e, + 0x16, 0x16, 0x16, 0x3e, 0x2a, 0x3e, 0x4e, 0x8a, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x26, 0x36, 0x36, 0x2e, 0x36, 0x36, 0x26, 0x6e, + 0x36, 0x36, 0x36, 0x3e, 0x2e, 0x3e, 0x6e, 0xae, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x16, 0x16, 0x16, 0x3e, 0x16, 0x16, 0x36, 0x5e, + 0x16, 0x16, 0x16, 0x3e, 0x3e, 0x3e, 0x5e, 0x9e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x2a, 0x3e, 0x3e, 0x2a, 0x3e, 0x3e, 0x2e, 0x6e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x2a, 0x3e, 0x6e, 0xaa, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, + 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x3e, 0x7e, 0xbe, + 0x4e, 0x5e, 0x5e, 0x6e, 0x5e, 0x5e, 0x6e, 0x4e, + 0x5e, 0x5e, 0x5e, 0x7e, 0x6e, 0x7e, 0x4e, 0xce, + 0x8a, 0x9e, 0x9e, 0xaa, 0x9e, 0x9e, 0xae, 0xce, + 0x9e, 0x9e, 0x9e, 0xbe, 0xaa, 0xbe, 0xce, 0x8a, +}; + +static u_char fragtbl8[256] = { + 0x00, 0x01, 0x01, 0x02, 0x01, 0x01, 0x02, 0x04, + 0x01, 0x01, 0x01, 0x03, 0x02, 0x03, 0x04, 0x08, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x02, 0x03, 0x03, 0x02, 0x04, 0x05, 0x08, 0x10, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x04, 0x05, 0x05, 0x06, 0x08, 0x09, 0x10, 0x20, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x08, 0x09, 0x09, 0x0a, 0x10, 0x11, 0x20, 0x40, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x03, 0x03, 0x03, 0x03, 0x05, 0x05, 0x09, 0x11, + 0x01, 0x01, 0x01, 0x03, 0x01, 0x01, 0x03, 0x05, + 0x01, 0x01, 0x01, 0x03, 0x03, 0x03, 0x05, 0x09, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x05, 0x05, 0x05, 0x07, 0x09, 0x09, 0x11, 0x21, + 0x02, 0x03, 0x03, 0x02, 0x03, 0x03, 0x02, 0x06, + 0x03, 0x03, 0x03, 0x03, 0x02, 0x03, 0x06, 0x0a, + 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x07, + 0x02, 0x03, 0x03, 0x02, 0x06, 0x07, 0x0a, 0x12, + 0x04, 0x05, 0x05, 0x06, 0x05, 0x05, 0x06, 0x04, + 0x05, 0x05, 0x05, 0x07, 0x06, 0x07, 0x04, 0x0c, + 0x08, 0x09, 0x09, 0x0a, 0x09, 0x09, 0x0a, 0x0c, + 0x10, 0x11, 0x11, 0x12, 0x20, 0x21, 0x40, 0x80, +}; + +/* + * The actual fragtbl array. 
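+ * It is indexed by fs_frag: the supported values 1, 2 and 4 share + * fragtbl124, 8 uses fragtbl8, and the remaining slots are never + * consulted.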
+ */ +u_char *fragtbl[MAXFRAG + 1] = { + 0, fragtbl124, fragtbl124, 0, fragtbl124, 0, 0, 0, fragtbl8, +}; diff --git a/Dump/ufs/ffs/ffs_vfsops.c b/Dump/ufs/ffs/ffs_vfsops.c new file mode 100644 index 0000000..b3d822a --- /dev/null +++ b/Dump/ufs/ffs/ffs_vfsops.c @@ -0,0 +1,2289 @@ +/*- + * Copyright (c) 1989, 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_vfsops.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_quota.h" +#include "opt_ufs.h" +#include "opt_ffs.h" +#include "opt_ddb.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include + +static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; + +static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); +static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, + ufs2_daddr_t); +static void ffs_ifree(struct ufsmount *ump, struct inode *ip); +static int ffs_sync_lazy(struct mount *mp); + +static vfs_init_t ffs_init; +static vfs_uninit_t ffs_uninit; +static vfs_extattrctl_t ffs_extattrctl; +static vfs_cmount_t ffs_cmount; +static vfs_unmount_t ffs_unmount; +static vfs_mount_t ffs_mount; +static vfs_statfs_t ffs_statfs; +static vfs_fhtovp_t ffs_fhtovp; +static vfs_sync_t ffs_sync; + +static struct vfsops ufs_vfsops = { + .vfs_extattrctl = ffs_extattrctl, + .vfs_fhtovp = ffs_fhtovp, + .vfs_init = ffs_init, + .vfs_mount = ffs_mount, + .vfs_cmount = ffs_cmount, + .vfs_quotactl = ufs_quotactl, + .vfs_root = ufs_root, + .vfs_statfs = ffs_statfs, + .vfs_sync = ffs_sync, + .vfs_uninit = ffs_uninit, + .vfs_unmount = ffs_unmount, + .vfs_vget = ffs_vget, + .vfs_susp_clean = process_deferred_inactive, +}; + +VFS_SET(ufs_vfsops, ufs, 0); +MODULE_VERSION(ufs, 1); + +static b_strategy_t ffs_geom_strategy; +static b_write_t ffs_bufwrite; + +static struct buf_ops ffs_ops = { + .bop_name = "FFS", + .bop_write = ffs_bufwrite, + .bop_strategy = ffs_geom_strategy, + .bop_sync = bufsync, +#ifdef NO_FFS_SNAPSHOT + .bop_bdflush = bufbdflush, +#else + .bop_bdflush = ffs_bdflush, +#endif +}; + +/* + * Note that userquota and groupquota options are not currently used + * by UFS/FFS code and generally mount(8) does not pass those options + * from userland, but they can be passed by loader(8) via + * vfs.root.mountfrom.options. 
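+ * ffs_mount() simply deletes "userquota" and "groupquota" from the + * option list near the top of the function.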
+ */ +static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", + "noclusterw", "noexec", "export", "force", "from", "groupquota", + "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", + "nosymfollow", "sync", "union", "userquota", NULL }; + +static int +ffs_mount(struct mount *mp) +{ + struct vnode *devvp; + struct thread *td; + struct ufsmount *ump = NULL; + struct fs *fs; + pid_t fsckpid = 0; + int error, error1, flags; + uint64_t mntorflags; + accmode_t accmode; + struct nameidata ndp; + char *fspec; + + td = curthread; + if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) + return (EINVAL); + if (uma_inode == NULL) { + uma_inode = uma_zcreate("FFS inode", + sizeof(struct inode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + uma_ufs1 = uma_zcreate("FFS1 dinode", + sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + uma_ufs2 = uma_zcreate("FFS2 dinode", + sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + } + + vfs_deleteopt(mp->mnt_optnew, "groupquota"); + vfs_deleteopt(mp->mnt_optnew, "userquota"); + + fspec = vfs_getopts(mp->mnt_optnew, "from", &error); + if (error) + return (error); + + mntorflags = 0; + if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) + mntorflags |= MNT_ACLS; + + if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { + mntorflags |= MNT_SNAPSHOT; + /* + * Once we have set the MNT_SNAPSHOT flag, do not + * persist "snapshot" in the options list. + */ + vfs_deleteopt(mp->mnt_optnew, "snapshot"); + vfs_deleteopt(mp->mnt_opt, "snapshot"); + } + + if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && + vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { + /* + * Once we have set the restricted PID, do not + * persist "fsckpid" in the options list. + */ + vfs_deleteopt(mp->mnt_optnew, "fsckpid"); + vfs_deleteopt(mp->mnt_opt, "fsckpid"); + if (mp->mnt_flag & MNT_UPDATE) { + if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && + vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + vfs_mount_error(mp, + "Checker enable: Must be read-only"); + return (EINVAL); + } + } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + vfs_mount_error(mp, + "Checker enable: Must be read-only"); + return (EINVAL); + } + /* Set to -1 if we are done */ + if (fsckpid == 0) + fsckpid = -1; + } + + if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { + if (mntorflags & MNT_ACLS) { + vfs_mount_error(mp, + "\"acls\" and \"nfsv4acls\" options " + "are mutually exclusive"); + return (EINVAL); + } + mntorflags |= MNT_NFS4ACLS; + } + + MNT_ILOCK(mp); + mp->mnt_flag |= mntorflags; + MNT_IUNLOCK(mp); + /* + * If updating, check whether changing from read-only to + * read/write; if there is no device name, that's all we do. + */ + if (mp->mnt_flag & MNT_UPDATE) { + ump = VFSTOUFS(mp); + fs = ump->um_fs; + devvp = ump->um_devvp; + if (fsckpid == -1 && ump->um_fsckpid > 0) { + if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || + (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) + return (error); + g_topology_lock(); + /* + * Return to normal read-only mode. + */ + error = g_access(ump->um_cp, 0, -1, 0); + g_topology_unlock(); + ump->um_fsckpid = 0; + } + if (fs->fs_ronly == 0 && + vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { + /* + * Flush any dirty data and suspend filesystem. 
+ */ + if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) + return (error); + error = vfs_write_suspend_umnt(mp); + if (error != 0) + return (error); + /* + * Check for and optionally get rid of files open + * for writing. + */ + flags = WRITECLOSE; + if (mp->mnt_flag & MNT_FORCE) + flags |= FORCECLOSE; + if (MOUNTEDSOFTDEP(mp)) { + error = softdep_flushfiles(mp, flags, td); + } else { + error = ffs_flushfiles(mp, flags, td); + } + if (error) { + vfs_write_resume(mp, 0); + return (error); + } + if (fs->fs_pendingblocks != 0 || + fs->fs_pendinginodes != 0) { + printf("WARNING: %s Update error: blocks %jd " + "files %d\n", fs->fs_fsmnt, + (intmax_t)fs->fs_pendingblocks, + fs->fs_pendinginodes); + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) + fs->fs_clean = 1; + if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { + fs->fs_ronly = 0; + fs->fs_clean = 0; + vfs_write_resume(mp, 0); + return (error); + } + if (MOUNTEDSOFTDEP(mp)) + softdep_unmount(mp); + g_topology_lock(); + /* + * Drop our write and exclusive access. + */ + g_access(ump->um_cp, 0, -1, -1); + g_topology_unlock(); + fs->fs_ronly = 1; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_RDONLY; + MNT_IUNLOCK(mp); + /* + * Allow the writers to note that filesystem + * is ro now. + */ + vfs_write_resume(mp, 0); + } + if ((mp->mnt_flag & MNT_RELOAD) && + (error = ffs_reload(mp, td, 0)) != 0) + return (error); + if (fs->fs_ronly && + !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { + /* + * If we are running a checker, do not allow upgrade. + */ + if (ump->um_fsckpid > 0) { + vfs_mount_error(mp, + "Active checker, cannot upgrade to write"); + return (EINVAL); + } + /* + * If upgrade to read-write by non-root, then verify + * that user has necessary permissions on the device. + */ + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_ACCESS(devvp, VREAD | VWRITE, + td->td_ucred, td); + if (error) + error = priv_check(td, PRIV_VFS_MOUNT_PERM); + if (error) { + VOP_UNLOCK(devvp, 0); + return (error); + } + VOP_UNLOCK(devvp, 0); + fs->fs_flags &= ~FS_UNCLEAN; + if (fs->fs_clean == 0) { + fs->fs_flags |= FS_UNCLEAN; + if ((mp->mnt_flag & MNT_FORCE) || + ((fs->fs_flags & + (FS_SUJ | FS_NEEDSFSCK)) == 0 && + (fs->fs_flags & FS_DOSOFTDEP))) { + printf("WARNING: %s was not properly " + "dismounted\n", fs->fs_fsmnt); + } else { + vfs_mount_error(mp, + "R/W mount of %s denied. %s.%s", + fs->fs_fsmnt, + "Filesystem is not clean - run fsck", + (fs->fs_flags & FS_SUJ) == 0 ? "" : + " Forced mount will invalidate" + " journal contents"); + return (EPERM); + } + } + g_topology_lock(); + /* + * Request exclusive write access. + */ + error = g_access(ump->um_cp, 0, 1, 1); + g_topology_unlock(); + if (error) + return (error); + if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) + return (error); + fs->fs_ronly = 0; + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_RDONLY; + MNT_IUNLOCK(mp); + fs->fs_mtime = time_second; + /* check to see if we need to start softdep */ + if ((fs->fs_flags & FS_DOSOFTDEP) && + (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ + vn_finished_write(mp); + return (error); + } + fs->fs_clean = 0; + if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { + vn_finished_write(mp); + return (error); + } + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + vn_finished_write(mp); + } + /* + * Soft updates is incompatible with "async", + * so if we are doing softupdates stop the user + * from setting the async flag in an update. 
+ * Softdep_mount() clears it in an initial mount + * or ro->rw remount. + */ + if (MOUNTEDSOFTDEP(mp)) { + /* XXX: Reset too late ? */ + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_ASYNC; + MNT_IUNLOCK(mp); + } + /* + * Keep MNT_ACLS flag if it is stored in superblock. + */ + if ((fs->fs_flags & FS_ACLS) != 0) { + /* XXX: Set too late ? */ + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_ACLS; + MNT_IUNLOCK(mp); + } + + if ((fs->fs_flags & FS_NFS4ACLS) != 0) { + /* XXX: Set too late ? */ + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_NFS4ACLS; + MNT_IUNLOCK(mp); + } + /* + * If this is a request from fsck to clean up the filesystem, + * then allow the specified pid to proceed. + */ + if (fsckpid > 0) { + if (ump->um_fsckpid != 0) { + vfs_mount_error(mp, + "Active checker already running on %s", + fs->fs_fsmnt); + return (EINVAL); + } + KASSERT(MOUNTEDSOFTDEP(mp) == 0, + ("soft updates enabled on read-only file system")); + g_topology_lock(); + /* + * Request write access. + */ + error = g_access(ump->um_cp, 0, 1, 0); + g_topology_unlock(); + if (error) { + vfs_mount_error(mp, + "Checker activation failed on %s", + fs->fs_fsmnt); + return (error); + } + ump->um_fsckpid = fsckpid; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_mtime = time_second; + fs->fs_fmod = 1; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } + + /* + * If this is a snapshot request, take the snapshot. + */ + if (mp->mnt_flag & MNT_SNAPSHOT) + return (ffs_snapshot(mp, fspec)); + + /* + * Must not call namei() while owning busy ref. + */ + vfs_unbusy(mp); + } + + /* + * Not an update, or updating the name: look up the name + * and verify that it refers to a sensible disk device. + */ + NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); + error = namei(&ndp); + if ((mp->mnt_flag & MNT_UPDATE) != 0) { + /* + * Unmount does not start if MNT_UPDATE is set. Mount + * update busies mp before setting MNT_UPDATE. We + * must be able to retain our busy ref succesfully, + * without sleep. + */ + error1 = vfs_busy(mp, MBF_NOWAIT); + MPASS(error1 == 0); + } + if (error != 0) + return (error); + NDFREE(&ndp, NDF_ONLY_PNBUF); + devvp = ndp.ni_vp; + if (!vn_isdisk(devvp, &error)) { + vput(devvp); + return (error); + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + */ + accmode = VREAD; + if ((mp->mnt_flag & MNT_RDONLY) == 0) + accmode |= VWRITE; + error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); + if (error) + error = priv_check(td, PRIV_VFS_MOUNT_PERM); + if (error) { + vput(devvp); + return (error); + } + + if (mp->mnt_flag & MNT_UPDATE) { + /* + * Update only + * + * If it's not the same vnode, or at least the same device + * then it's not correct. + */ + + if (devvp->v_rdev != ump->um_devvp->v_rdev) + error = EINVAL; /* needs translation */ + vput(devvp); + if (error) + return (error); + } else { + /* + * New mount + * + * We need the name for the mount point (also used for + * "last mounted on") copied in. If an error occurs, + * the mount point is discarded by the upper level code. + * Note that vfs_mount_alloc() populates f_mntonname for us. + */ + if ((error = ffs_mountfs(devvp, mp, td)) != 0) { + vrele(devvp); + return (error); + } + if (fsckpid > 0) { + KASSERT(MOUNTEDSOFTDEP(mp) == 0, + ("soft updates enabled on read-only file system")); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + g_topology_lock(); + /* + * Request write access. 
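+ * Only an additional write reference on the GEOM consumer is taken
+ * here; if it cannot be granted, the checker is skipped with a
+ * warning rather than failing the whole mount.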
+ */ + error = g_access(ump->um_cp, 0, 1, 0); + g_topology_unlock(); + if (error) { + printf("WARNING: %s: Checker activation " + "failed\n", fs->fs_fsmnt); + } else { + ump->um_fsckpid = fsckpid; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_mtime = time_second; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } + } + } + vfs_mountedfrom(mp, fspec); + return (0); +} + +/* + * Compatibility with old mount system call. + */ + +static int +ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) +{ + struct ufs_args args; + struct export_args exp; + int error; + + if (data == NULL) + return (EINVAL); + error = copyin(data, &args, sizeof args); + if (error) + return (error); + vfs_oexport_conv(&args.export, &exp); + + ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); + ma = mount_arg(ma, "export", &exp, sizeof(exp)); + error = kernel_mount(ma, flags); + + return (error); +} + +/* + * Reload all incore data for a filesystem (used after running fsck on + * the root filesystem and finding things to fix). If the 'force' flag + * is 0, the filesystem must be mounted read-only. + * + * Things to do to update the mount: + * 1) invalidate all cached meta-data. + * 2) re-read superblock from disk. + * 3) re-read summary information from disk. + * 4) invalidate all inactive vnodes. + * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary + * writers, if requested. + * 6) invalidate all cached file data. + * 7) re-read inode data for all active vnodes. + */ +int +ffs_reload(struct mount *mp, struct thread *td, int flags) +{ + struct vnode *vp, *mvp, *devvp; + struct inode *ip; + void *space; + struct buf *bp; + struct fs *fs, *newfs; + struct ufsmount *ump; + ufs2_daddr_t sblockloc; + int i, blks, error; + u_long size; + int32_t *lp; + + ump = VFSTOUFS(mp); + + MNT_ILOCK(mp); + if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { + MNT_IUNLOCK(mp); + return (EINVAL); + } + MNT_IUNLOCK(mp); + + /* + * Step 1: invalidate all cached meta-data. + */ + devvp = VFSTOUFS(mp)->um_devvp; + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + if (vinvalbuf(devvp, 0, 0, 0) != 0) + panic("ffs_reload: dirty1"); + VOP_UNLOCK(devvp, 0); + + /* + * Step 2: re-read superblock from disk. + */ + fs = VFSTOUFS(mp)->um_fs; + if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, + NOCRED, &bp)) != 0) + return (error); + newfs = (struct fs *)bp->b_data; + if ((newfs->fs_magic != FS_UFS1_MAGIC && + newfs->fs_magic != FS_UFS2_MAGIC) || + newfs->fs_bsize > MAXBSIZE || + newfs->fs_bsize < sizeof(struct fs)) { + brelse(bp); + return (EIO); /* XXX needs translation */ + } + /* + * Copy pointer fields back into superblock before copying in XXX + * new superblock. These should really be in the ufsmount. XXX + * Note that important parameters (eg fs_ncg) are unchanged. 
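+ * Otherwise the bcopy() below would clobber the in-core fs_csp,
+ * fs_maxcluster, fs_contigdirs and fs_active pointers with whatever
+ * stale values happen to be in the on-disk copy.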
+ */ + newfs->fs_csp = fs->fs_csp; + newfs->fs_maxcluster = fs->fs_maxcluster; + newfs->fs_contigdirs = fs->fs_contigdirs; + newfs->fs_active = fs->fs_active; + newfs->fs_ronly = fs->fs_ronly; + sblockloc = fs->fs_sblockloc; + bcopy(newfs, fs, (u_int)fs->fs_sbsize); + brelse(bp); + mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); + UFS_LOCK(ump); + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + printf("WARNING: %s: reload pending error: blocks %jd " + "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, + fs->fs_pendinginodes); + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + UFS_UNLOCK(ump); + + /* + * Step 3: re-read summary information from disk. + */ + size = fs->fs_cssize; + blks = howmany(size, fs->fs_fsize); + if (fs->fs_contigsumsize > 0) + size += fs->fs_ncg * sizeof(int32_t); + size += fs->fs_ncg * sizeof(u_int8_t); + free(fs->fs_csp, M_UFSMNT); + space = malloc(size, M_UFSMNT, M_WAITOK); + fs->fs_csp = space; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + NOCRED, &bp); + if (error) + return (error); + bcopy(bp->b_data, space, (u_int)size); + space = (char *)space + size; + brelse(bp); + } + /* + * We no longer know anything about clusters per cylinder group. + */ + if (fs->fs_contigsumsize > 0) { + fs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + space = lp; + } + size = fs->fs_ncg * sizeof(u_int8_t); + fs->fs_contigdirs = (u_int8_t *)space; + bzero(fs->fs_contigdirs, size); + if ((flags & FFSR_UNSUSPEND) != 0) { + MNT_ILOCK(mp); + mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); + wakeup(&mp->mnt_flag); + MNT_IUNLOCK(mp); + } + +loop: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + /* + * Skip syncer vnode. + */ + if (vp->v_type == VNON) { + VI_UNLOCK(vp); + continue; + } + /* + * Step 4: invalidate all cached file data. + */ + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + if (vinvalbuf(vp, 0, 0, 0)) + panic("ffs_reload: dirty2"); + /* + * Step 5: re-read inode data for all active vnodes. + */ + ip = VTOI(vp); + error = + bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), + (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + VOP_UNLOCK(vp, 0); + vrele(vp); + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + return (error); + } + ffs_load_inode(bp, ip, fs, ip->i_number); + ip->i_effnlink = ip->i_nlink; + brelse(bp); + VOP_UNLOCK(vp, 0); + vrele(vp); + } + return (0); +} + +/* + * Possible superblock locations ordered from most to least likely. + */ +static int sblock_try[] = SBLOCKSEARCH; + +/* + * Common code for mount and mountroot + */ +static int +ffs_mountfs(devvp, mp, td) + struct vnode *devvp; + struct mount *mp; + struct thread *td; +{ + struct ufsmount *ump; + struct buf *bp; + struct fs *fs; + struct cdev *dev; + void *space; + ufs2_daddr_t sblockloc; + int error, i, blks, len, ronly; + u_long size; + int32_t *lp; + struct ucred *cred; + struct g_consumer *cp; + struct mount *nmp; + + bp = NULL; + ump = NULL; + cred = td ? 
td->td_ucred : NOCRED; + ronly = (mp->mnt_flag & MNT_RDONLY) != 0; + + KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); + dev = devvp->v_rdev; + if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, + (uintptr_t)mp) == 0) { + VOP_UNLOCK(devvp, 0); + return (EBUSY); + } + g_topology_lock(); + error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); + g_topology_unlock(); + if (error != 0) { + atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); + VOP_UNLOCK(devvp, 0); + return (error); + } + dev_ref(dev); + devvp->v_bufobj.bo_ops = &ffs_ops; + VOP_UNLOCK(devvp, 0); + if (dev->si_iosize_max != 0) + mp->mnt_iosize_max = dev->si_iosize_max; + if (mp->mnt_iosize_max > MAXPHYS) + mp->mnt_iosize_max = MAXPHYS; + + fs = NULL; + sblockloc = 0; + /* + * Try reading the superblock in each of its possible locations. + */ + for (i = 0; sblock_try[i] != -1; i++) { + if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { + error = EINVAL; + vfs_mount_error(mp, + "Invalid sectorsize %d for superblock size %d", + cp->provider->sectorsize, SBLOCKSIZE); + goto out; + } + if ((error = bread(devvp, btodb(sblock_try[i]), SBLOCKSIZE, + cred, &bp)) != 0) + goto out; + fs = (struct fs *)bp->b_data; + sblockloc = sblock_try[i]; + if ((fs->fs_magic == FS_UFS1_MAGIC || + (fs->fs_magic == FS_UFS2_MAGIC && + (fs->fs_sblockloc == sblockloc || + (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0))) && + fs->fs_bsize <= MAXBSIZE && + fs->fs_bsize >= sizeof(struct fs)) + break; + brelse(bp); + bp = NULL; + } + if (sblock_try[i] == -1) { + error = EINVAL; /* XXX needs translation */ + goto out; + } + fs->fs_fmod = 0; + fs->fs_flags &= ~FS_INDEXDIRS; /* no support for directory indices */ + fs->fs_flags &= ~FS_UNCLEAN; + if (fs->fs_clean == 0) { + fs->fs_flags |= FS_UNCLEAN; + if (ronly || (mp->mnt_flag & MNT_FORCE) || + ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && + (fs->fs_flags & FS_DOSOFTDEP))) { + printf("WARNING: %s was not properly dismounted\n", + fs->fs_fsmnt); + } else { + vfs_mount_error(mp, "R/W mount of %s denied. %s%s", + fs->fs_fsmnt, "Filesystem is not clean - run fsck.", + (fs->fs_flags & FS_SUJ) == 0 ? "" : + " Forced mount will invalidate journal contents"); + error = EPERM; + goto out; + } + if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && + (mp->mnt_flag & MNT_FORCE)) { + printf("WARNING: %s: lost blocks %jd files %d\n", + fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, + fs->fs_pendinginodes); + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + } + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + printf("WARNING: %s: mount pending error: blocks %jd " + "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, + fs->fs_pendinginodes); + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + if ((fs->fs_flags & FS_GJOURNAL) != 0) { +#ifdef UFS_GJOURNAL + /* + * Get journal provider name. 
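+ * The name is obtained through the GEOM attribute interface; if the
+ * query fails, the mount simply proceeds without gjournal support.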
+ */ + len = 1024; + mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); + if (g_io_getattr("GJOURNAL::provider", cp, &len, + mp->mnt_gjprovider) == 0) { + mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, + M_UFSMNT, M_WAITOK); + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_GJOURNAL; + MNT_IUNLOCK(mp); + } else { + printf("WARNING: %s: GJOURNAL flag on fs " + "but no gjournal provider below\n", + mp->mnt_stat.f_mntonname); + free(mp->mnt_gjprovider, M_UFSMNT); + mp->mnt_gjprovider = NULL; + } +#else + printf("WARNING: %s: GJOURNAL flag on fs but no " + "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); +#endif + } else { + mp->mnt_gjprovider = NULL; + } + ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); + ump->um_cp = cp; + ump->um_bo = &devvp->v_bufobj; + ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); + if (fs->fs_magic == FS_UFS1_MAGIC) { + ump->um_fstype = UFS1; + ump->um_balloc = ffs_balloc_ufs1; + } else { + ump->um_fstype = UFS2; + ump->um_balloc = ffs_balloc_ufs2; + } + ump->um_blkatoff = ffs_blkatoff; + ump->um_truncate = ffs_truncate; + ump->um_update = ffs_update; + ump->um_valloc = ffs_valloc; + ump->um_vfree = ffs_vfree; + ump->um_ifree = ffs_ifree; + ump->um_rdonly = ffs_rdonly; + ump->um_snapgone = ffs_snapgone; + mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); + bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); + if (fs->fs_sbsize < SBLOCKSIZE) + bp->b_flags |= B_INVAL | B_NOCACHE; + brelse(bp); + bp = NULL; + fs = ump->um_fs; + ffs_oldfscompat_read(fs, ump, sblockloc); + fs->fs_ronly = ronly; + size = fs->fs_cssize; + blks = howmany(size, fs->fs_fsize); + if (fs->fs_contigsumsize > 0) + size += fs->fs_ncg * sizeof(int32_t); + size += fs->fs_ncg * sizeof(u_int8_t); + space = malloc(size, M_UFSMNT, M_WAITOK); + fs->fs_csp = space; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, + cred, &bp)) != 0) { + free(fs->fs_csp, M_UFSMNT); + goto out; + } + bcopy(bp->b_data, space, (u_int)size); + space = (char *)space + size; + brelse(bp); + bp = NULL; + } + if (fs->fs_contigsumsize > 0) { + fs->fs_maxcluster = lp = space; + for (i = 0; i < fs->fs_ncg; i++) + *lp++ = fs->fs_contigsumsize; + space = lp; + } + size = fs->fs_ncg * sizeof(u_int8_t); + fs->fs_contigdirs = (u_int8_t *)space; + bzero(fs->fs_contigdirs, size); + fs->fs_active = NULL; + mp->mnt_data = ump; + mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; + mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; + nmp = NULL; + if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || + (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { + if (nmp) + vfs_rel(nmp); + vfs_getnewfsid(mp); + } + mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_LOCAL; + MNT_IUNLOCK(mp); + if ((fs->fs_flags & FS_MULTILABEL) != 0) { +#ifdef MAC + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_MULTILABEL; + MNT_IUNLOCK(mp); +#else + printf("WARNING: %s: multilabel flag on fs but " + "no MAC support\n", mp->mnt_stat.f_mntonname); +#endif + } + if ((fs->fs_flags & FS_ACLS) != 0) { +#ifdef UFS_ACL + MNT_ILOCK(mp); + + if (mp->mnt_flag & MNT_NFS4ACLS) + printf("WARNING: %s: ACLs flag on fs conflicts with " + "\"nfsv4acls\" mount option; option ignored\n", + mp->mnt_stat.f_mntonname); + mp->mnt_flag &= ~MNT_NFS4ACLS; + mp->mnt_flag |= MNT_ACLS; + + MNT_IUNLOCK(mp); +#else + printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", + mp->mnt_stat.f_mntonname); +#endif + } + if 
((fs->fs_flags & FS_NFS4ACLS) != 0) { +#ifdef UFS_ACL + MNT_ILOCK(mp); + + if (mp->mnt_flag & MNT_ACLS) + printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " + "with \"acls\" mount option; option ignored\n", + mp->mnt_stat.f_mntonname); + mp->mnt_flag &= ~MNT_ACLS; + mp->mnt_flag |= MNT_NFS4ACLS; + + MNT_IUNLOCK(mp); +#else + printf("WARNING: %s: NFSv4 ACLs flag on fs but no " + "ACLs support\n", mp->mnt_stat.f_mntonname); +#endif + } + if ((fs->fs_flags & FS_TRIM) != 0) { + len = sizeof(int); + if (g_io_getattr("GEOM::candelete", cp, &len, + &ump->um_candelete) == 0) { + if (!ump->um_candelete) + printf("WARNING: %s: TRIM flag on fs but disk " + "does not support TRIM\n", + mp->mnt_stat.f_mntonname); + } else { + printf("WARNING: %s: TRIM flag on fs but disk does " + "not confirm that it supports TRIM\n", + mp->mnt_stat.f_mntonname); + ump->um_candelete = 0; + } + if (ump->um_candelete) { + ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, + taskqueue_thread_enqueue, &ump->um_trim_tq); + taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, + "%s trim", mp->mnt_stat.f_mntonname); + } + } + + ump->um_mountp = mp; + ump->um_dev = dev; + ump->um_devvp = devvp; + ump->um_nindir = fs->fs_nindir; + ump->um_bptrtodb = fs->fs_fsbtodb; + ump->um_seqinc = fs->fs_frag; + for (i = 0; i < MAXQUOTAS; i++) + ump->um_quotas[i] = NULLVP; +#ifdef UFS_EXTATTR + ufs_extattr_uepm_init(&ump->um_extattr); +#endif + /* + * Set FS local "last mounted on" information (NULL pad) + */ + bzero(fs->fs_fsmnt, MAXMNTLEN); + strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); + mp->mnt_stat.f_iosize = fs->fs_bsize; + + if (mp->mnt_flag & MNT_ROOTFS) { + /* + * Root mount; update timestamp in mount structure. + * this will be used by the common root mount code + * to update the system clock. + */ + mp->mnt_time = fs->fs_time; + } + + if (ronly == 0) { + fs->fs_mtime = time_second; + if ((fs->fs_flags & FS_DOSOFTDEP) && + (error = softdep_mount(devvp, mp, fs, cred)) != 0) { + free(fs->fs_csp, M_UFSMNT); + ffs_flushfiles(mp, FORCECLOSE, td); + goto out; + } + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_fmod = 1; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } + /* + * Initialize filesystem state information in mount struct. + */ + MNT_ILOCK(mp); + mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | + MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; + MNT_IUNLOCK(mp); +#ifdef UFS_EXTATTR +#ifdef UFS_EXTATTR_AUTOSTART + /* + * + * Auto-starting does the following: + * - check for /.attribute in the fs, and extattr_start if so + * - for each file in .attribute, enable that file with + * an attribute of the same name. + * Not clear how to report errors -- probably eat them. + * This would all happen while the filesystem was busy/not + * available, so would effectively be "atomic". 
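+ * Consistent with that, the return value of ufs_extattr_autostart()
+ * below is deliberately discarded.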
+ */ + (void) ufs_extattr_autostart(mp, td); +#endif /* !UFS_EXTATTR_AUTOSTART */ +#endif /* !UFS_EXTATTR */ + return (0); +out: + if (bp) + brelse(bp); + if (cp != NULL) { + g_topology_lock(); + g_vfs_close(cp); + g_topology_unlock(); + } + if (ump) { + mtx_destroy(UFS_MTX(ump)); + if (mp->mnt_gjprovider != NULL) { + free(mp->mnt_gjprovider, M_UFSMNT); + mp->mnt_gjprovider = NULL; + } + free(ump->um_fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + } + atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); + dev_rel(dev); + return (error); +} + +#include +static int bigcgs = 0; +SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); + +/* + * Sanity checks for loading old filesystem superblocks. + * See ffs_oldfscompat_write below for unwound actions. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +static void +ffs_oldfscompat_read(fs, ump, sblockloc) + struct fs *fs; + struct ufsmount *ump; + ufs2_daddr_t sblockloc; +{ + off_t maxfilesize; + + /* + * If not yet done, update fs_flags location and value of fs_sblockloc. + */ + if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { + fs->fs_flags = fs->fs_old_flags; + fs->fs_old_flags |= FS_FLAGS_UPDATED; + fs->fs_sblockloc = sblockloc; + } + /* + * If not yet done, update UFS1 superblock with new wider fields. + */ + if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { + fs->fs_maxbsize = fs->fs_bsize; + fs->fs_time = fs->fs_old_time; + fs->fs_size = fs->fs_old_size; + fs->fs_dsize = fs->fs_old_dsize; + fs->fs_csaddr = fs->fs_old_csaddr; + fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; + fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; + fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; + fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; + } + if (fs->fs_magic == FS_UFS1_MAGIC && + fs->fs_old_inodefmt < FS_44INODEFMT) { + fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; + fs->fs_qbmask = ~fs->fs_bmask; + fs->fs_qfmask = ~fs->fs_fmask; + } + if (fs->fs_magic == FS_UFS1_MAGIC) { + ump->um_savedmaxfilesize = fs->fs_maxfilesize; + maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; + if (fs->fs_maxfilesize > maxfilesize) + fs->fs_maxfilesize = maxfilesize; + } + /* Compatibility for old filesystems */ + if (fs->fs_avgfilesize <= 0) + fs->fs_avgfilesize = AVFILESIZ; + if (fs->fs_avgfpdir <= 0) + fs->fs_avgfpdir = AFPDIR; + if (bigcgs) { + fs->fs_save_cgsize = fs->fs_cgsize; + fs->fs_cgsize = fs->fs_bsize; + } +} + +/* + * Unwinding superblock updates for old filesystems. + * See ffs_oldfscompat_read above for details. + * + * XXX - Parts get retired eventually. + * Unfortunately new bits get added. + */ +void +ffs_oldfscompat_write(fs, ump) + struct fs *fs; + struct ufsmount *ump; +{ + + /* + * Copy back UFS2 updated fields that UFS1 inspects. 
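+ * This undoes the widening done in ffs_oldfscompat_read() above so
+ * that the superblock image written back to disk keeps valid values
+ * in the old narrow UFS1 fields and its saved fs_maxfilesize.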
+ */ + if (fs->fs_magic == FS_UFS1_MAGIC) { + fs->fs_old_time = fs->fs_time; + fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; + fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; + fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; + fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; + fs->fs_maxfilesize = ump->um_savedmaxfilesize; + } + if (bigcgs) { + fs->fs_cgsize = fs->fs_save_cgsize; + fs->fs_save_cgsize = 0; + } +} + +/* + * unmount system call + */ +static int +ffs_unmount(mp, mntflags) + struct mount *mp; + int mntflags; +{ + struct thread *td; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs; + int error, flags, susp; +#ifdef UFS_EXTATTR + int e_restart; +#endif + + flags = 0; + td = curthread; + fs = ump->um_fs; + susp = 0; + if (mntflags & MNT_FORCE) { + flags |= FORCECLOSE; + susp = fs->fs_ronly == 0; + } +#ifdef UFS_EXTATTR + if ((error = ufs_extattr_stop(mp, td))) { + if (error != EOPNOTSUPP) + printf("WARNING: unmount %s: ufs_extattr_stop " + "returned errno %d\n", mp->mnt_stat.f_mntonname, + error); + e_restart = 0; + } else { + ufs_extattr_uepm_destroy(&ump->um_extattr); + e_restart = 1; + } +#endif + if (susp) { + error = vfs_write_suspend_umnt(mp); + if (error != 0) + goto fail1; + } + if (MOUNTEDSOFTDEP(mp)) + error = softdep_flushfiles(mp, flags, td); + else + error = ffs_flushfiles(mp, flags, td); + if (error != 0 && error != ENXIO) + goto fail; + + UFS_LOCK(ump); + if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { + printf("WARNING: unmount %s: pending error: blocks %jd " + "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, + fs->fs_pendinginodes); + fs->fs_pendingblocks = 0; + fs->fs_pendinginodes = 0; + } + UFS_UNLOCK(ump); + if (MOUNTEDSOFTDEP(mp)) + softdep_unmount(mp); + if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { + fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; + error = ffs_sbupdate(ump, MNT_WAIT, 0); + if (error && error != ENXIO) { + fs->fs_clean = 0; + goto fail; + } + } + if (susp) + vfs_write_resume(mp, VR_START_WRITE); + if (ump->um_trim_tq != NULL) { + while (ump->um_trim_inflight != 0) + pause("ufsutr", hz); + taskqueue_drain_all(ump->um_trim_tq); + taskqueue_free(ump->um_trim_tq); + } + g_topology_lock(); + if (ump->um_fsckpid > 0) { + /* + * Return to normal read-only mode. + */ + error = g_access(ump->um_cp, 0, -1, 0); + ump->um_fsckpid = 0; + } + g_vfs_close(ump->um_cp); + g_topology_unlock(); + atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); + vrele(ump->um_devvp); + dev_rel(ump->um_dev); + mtx_destroy(UFS_MTX(ump)); + if (mp->mnt_gjprovider != NULL) { + free(mp->mnt_gjprovider, M_UFSMNT); + mp->mnt_gjprovider = NULL; + } + free(fs->fs_csp, M_UFSMNT); + free(fs, M_UFSMNT); + free(ump, M_UFSMNT); + mp->mnt_data = NULL; + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_LOCAL; + MNT_IUNLOCK(mp); + if (td->td_su == mp) { + td->td_su = NULL; + vfs_rel(mp); + } + return (error); + +fail: + if (susp) + vfs_write_resume(mp, VR_START_WRITE); +fail1: +#ifdef UFS_EXTATTR + if (e_restart) { + ufs_extattr_uepm_init(&ump->um_extattr); +#ifdef UFS_EXTATTR_AUTOSTART + (void) ufs_extattr_autostart(mp, td); +#endif + } +#endif + + return (error); +} + +/* + * Flush out all the files in a filesystem. 
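+ * Quota files and snapshot vnodes are dealt with before the final
+ * vflush(), and the device vnode is fsynced last to push out the
+ * remaining metadata.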
+ */ +int +ffs_flushfiles(mp, flags, td) + struct mount *mp; + int flags; + struct thread *td; +{ + struct ufsmount *ump; + int qerror, error; + + ump = VFSTOUFS(mp); + qerror = 0; +#ifdef QUOTA + if (mp->mnt_flag & MNT_QUOTA) { + int i; + error = vflush(mp, 0, SKIPSYSTEM|flags, td); + if (error) + return (error); + for (i = 0; i < MAXQUOTAS; i++) { + error = quotaoff(td, mp, i); + if (error != 0) { + if ((flags & EARLYFLUSH) == 0) + return (error); + else + qerror = error; + } + } + + /* + * Here we fall through to vflush again to ensure that + * we have gotten rid of all the system vnodes, unless + * quotas must not be closed. + */ + } +#endif + ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); + if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { + if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) + return (error); + ffs_snapshot_unmount(mp); + flags |= FORCECLOSE; + /* + * Here we fall through to vflush again to ensure + * that we have gotten rid of all the system vnodes. + */ + } + + /* + * Do not close system files if quotas were not closed, to be + * able to sync the remaining dquots. The freeblks softupdate + * workitems might hold a reference on a dquot, preventing + * quotaoff() from completing. Next round of + * softdep_flushworklist() iteration should process the + * blockers, allowing the next run of quotaoff() to finally + * flush held dquots. + * + * Otherwise, flush all the files. + */ + if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) + return (error); + + /* + * Flush filesystem metadata. + */ + vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); + VOP_UNLOCK(ump->um_devvp, 0); + return (error); +} + +/* + * Get filesystem statistics. + */ +static int +ffs_statfs(mp, sbp) + struct mount *mp; + struct statfs *sbp; +{ + struct ufsmount *ump; + struct fs *fs; + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) + panic("ffs_statfs"); + sbp->f_version = STATFS_VERSION; + sbp->f_bsize = fs->fs_fsize; + sbp->f_iosize = fs->fs_bsize; + sbp->f_blocks = fs->fs_dsize; + UFS_LOCK(ump); + sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); + sbp->f_bavail = freespace(fs, fs->fs_minfree) + + dbtofsb(fs, fs->fs_pendingblocks); + sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; + sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; + UFS_UNLOCK(ump); + sbp->f_namemax = NAME_MAX; + return (0); +} + +static bool +sync_doupdate(struct inode *ip) +{ + + return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | + IN_UPDATE)) != 0); +} + +/* + * For a lazy sync, we only care about access times, quotas and the + * superblock. Other filesystem changes are already converted to + * cylinder group blocks or inode blocks updates and are written to + * disk by syncer. + */ +static int +ffs_sync_lazy(mp) + struct mount *mp; +{ + struct vnode *mvp, *vp; + struct inode *ip; + struct thread *td; + int allerror, error; + + allerror = 0; + td = curthread; + if ((mp->mnt_flag & MNT_NOATIME) != 0) + goto qupdate; + MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + if (vp->v_type == VNON) { + VI_UNLOCK(vp); + continue; + } + ip = VTOI(vp); + + /* + * The IN_ACCESS flag is converted to IN_MODIFIED by + * ufs_close() and ufs_getattr() by the calls to + * ufs_itimes_locked(), without subsequent UFS_UPDATE(). + * Test also all the other timestamp flags too, to pick up + * any other cases that could be missed. 
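+ * Vnodes with no timestamp work pending and no deferred inactivation
+ * are skipped below without taking the vnode lock.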
+ */ + if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { + VI_UNLOCK(vp); + continue; + } + if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, + td)) != 0) + continue; + if (sync_doupdate(ip)) + error = ffs_update(vp, 0); + if (error != 0) + allerror = error; + vput(vp); + } + +qupdate: +#ifdef QUOTA + qsync(mp); +#endif + + if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && + (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) + allerror = error; + return (allerror); +} + +/* + * Go through the disk queues to initiate sandbagged IO; + * go through the inodes to write those that have been modified; + * initiate the writing of the super block if it has been modified. + * + * Note: we are always called with the filesystem marked busy using + * vfs_busy(). + */ +static int +ffs_sync(mp, waitfor) + struct mount *mp; + int waitfor; +{ + struct vnode *mvp, *vp, *devvp; + struct thread *td; + struct inode *ip; + struct ufsmount *ump = VFSTOUFS(mp); + struct fs *fs; + int error, count, lockreq, allerror = 0; + int suspend; + int suspended; + int secondary_writes; + int secondary_accwrites; + int softdep_deps; + int softdep_accdeps; + struct bufobj *bo; + + suspend = 0; + suspended = 0; + td = curthread; + fs = ump->um_fs; + if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) + panic("%s: ffs_sync: modification on read-only filesystem", + fs->fs_fsmnt); + if (waitfor == MNT_LAZY) { + if (!rebooting) + return (ffs_sync_lazy(mp)); + waitfor = MNT_NOWAIT; + } + + /* + * Write back each (modified) inode. + */ + lockreq = LK_EXCLUSIVE | LK_NOWAIT; + if (waitfor == MNT_SUSPEND) { + suspend = 1; + waitfor = MNT_WAIT; + } + if (waitfor == MNT_WAIT) + lockreq = LK_EXCLUSIVE; + lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; +loop: + /* Grab snapshot of secondary write counts */ + MNT_ILOCK(mp); + secondary_writes = mp->mnt_secondary_writes; + secondary_accwrites = mp->mnt_secondary_accwrites; + MNT_IUNLOCK(mp); + + /* Grab snapshot of softdep dependency counts */ + softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); + + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + /* + * Depend on the vnode interlock to keep things stable enough + * for a quick test. Since there might be hundreds of + * thousands of vnodes, we cannot afford even a subroutine + * call unless there's a good chance that we have work to do. + */ + if (vp->v_type == VNON) { + VI_UNLOCK(vp); + continue; + } + ip = VTOI(vp); + if ((ip->i_flag & + (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && + vp->v_bufobj.bo_dirty.bv_cnt == 0) { + VI_UNLOCK(vp); + continue; + } + if ((error = vget(vp, lockreq, td)) != 0) { + if (error == ENOENT || error == ENOLCK) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto loop; + } + continue; + } + if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) + allerror = error; + vput(vp); + } + /* + * Force stale filesystem control information to be flushed. 
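+ * For a waiting sync (or while rebooting) the soft updates worklist
+ * is flushed first; if that creates more work, the vnode pass above
+ * is restarted.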
+ */ + if (waitfor == MNT_WAIT || rebooting) { + if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) + allerror = error; + /* Flushed work items may create new vnodes to clean */ + if (allerror == 0 && count) + goto loop; + } +#ifdef QUOTA + qsync(mp); +#endif + + devvp = ump->um_devvp; + bo = &devvp->v_bufobj; + BO_LOCK(bo); + if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { + BO_UNLOCK(bo); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(devvp, waitfor, td); + VOP_UNLOCK(devvp, 0); + if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) + error = ffs_sbupdate(ump, waitfor, 0); + if (error != 0) + allerror = error; + if (allerror == 0 && waitfor == MNT_WAIT) + goto loop; + } else if (suspend != 0) { + if (softdep_check_suspend(mp, + devvp, + softdep_deps, + softdep_accdeps, + secondary_writes, + secondary_accwrites) != 0) { + MNT_IUNLOCK(mp); + goto loop; /* More work needed */ + } + mtx_assert(MNT_MTX(mp), MA_OWNED); + mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; + MNT_IUNLOCK(mp); + suspended = 1; + } else + BO_UNLOCK(bo); + /* + * Write back modified superblock. + */ + if (fs->fs_fmod != 0 && + (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) + allerror = error; + return (allerror); +} + +int +ffs_vget(mp, ino, flags, vpp) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; +{ + return (ffs_vgetf(mp, ino, flags, vpp, 0)); +} + +int +ffs_vgetf(mp, ino, flags, vpp, ffs_flags) + struct mount *mp; + ino_t ino; + int flags; + struct vnode **vpp; + int ffs_flags; +{ + struct fs *fs; + struct inode *ip; + struct ufsmount *ump; + struct buf *bp; + struct vnode *vp; + int error; + + error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); + if (error || *vpp != NULL) + return (error); + + /* + * We must promote to an exclusive lock for vnode creation. This + * can happen if lookup is passed LOCKSHARED. + */ + if ((flags & LK_TYPE_MASK) == LK_SHARED) { + flags &= ~LK_TYPE_MASK; + flags |= LK_EXCLUSIVE; + } + + /* + * We do not lock vnode creation as it is believed to be too + * expensive for such rare case as simultaneous creation of vnode + * for same ino by different processes. We just allow them to race + * and check later to decide who wins. Let the race begin! + */ + + ump = VFSTOUFS(mp); + fs = ump->um_fs; + ip = uma_zalloc(uma_inode, M_WAITOK | M_ZERO); + + /* Allocate a new vnode/inode. */ + error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? + &ffs_vnodeops1 : &ffs_vnodeops2, &vp); + if (error) { + *vpp = NULL; + uma_zfree(uma_inode, ip); + return (error); + } + /* + * FFS supports recursive locking. + */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + VN_LOCK_AREC(vp); + vp->v_data = ip; + vp->v_bufobj.bo_bsize = fs->fs_bsize; + ip->i_vnode = vp; + ip->i_ump = ump; + ip->i_number = ino; + ip->i_ea_refs = 0; + ip->i_nextclustercg = -1; + ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; +#ifdef QUOTA + { + int i; + for (i = 0; i < MAXQUOTAS; i++) + ip->i_dquot[i] = NODQUOT; + } +#endif + + if (ffs_flags & FFSV_FORCEINSMQ) + vp->v_vflag |= VV_FORCEINSMQ; + error = insmntque(vp, mp); + if (error != 0) { + uma_zfree(uma_inode, ip); + *vpp = NULL; + return (error); + } + vp->v_vflag &= ~VV_FORCEINSMQ; + error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); + if (error || *vpp != NULL) + return (error); + + /* Read in the disk contents for the inode, copy into the inode. 
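+ * The bread() below pulls in the entire inode block; ffs_load_inode()
+ * then copies just this inode's dinode into the freshly allocated
+ * in-core structure.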
*/ + error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), + (int)fs->fs_bsize, NOCRED, &bp); + if (error) { + /* + * The inode does not contain anything useful, so it would + * be misleading to leave it on its hash chain. With mode + * still zero, it will be unlinked and returned to the free + * list by vput(). + */ + brelse(bp); + vput(vp); + *vpp = NULL; + return (error); + } + if (I_IS_UFS1(ip)) + ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); + else + ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); + ffs_load_inode(bp, ip, fs, ino); + if (DOINGSOFTDEP(vp)) + softdep_load_inodeblock(ip); + else + ip->i_effnlink = ip->i_nlink; + bqrelse(bp); + + /* + * Initialize the vnode from the inode, check for aliases. + * Note that the underlying vnode may have changed. + */ + error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, + &vp); + if (error) { + vput(vp); + *vpp = NULL; + return (error); + } + + /* + * Finish inode initialization. + */ + if (vp->v_type != VFIFO) { + /* FFS supports shared locking for all files except fifos. */ + VN_LOCK_ASHARE(vp); + } + + /* + * Set up a generation number for this inode if it does not + * already have one. This should only happen on old filesystems. + */ + if (ip->i_gen == 0) { + while (ip->i_gen == 0) + ip->i_gen = arc4random(); + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + ip->i_flag |= IN_MODIFIED; + DIP_SET(ip, i_gen, ip->i_gen); + } + } +#ifdef MAC + if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { + /* + * If this vnode is already allocated, and we're running + * multi-label, attempt to perform a label association + * from the extended attributes on the inode. + */ + error = mac_vnode_associate_extattr(mp, vp); + if (error) { + /* ufs_inactive will release ip->i_devvp ref. */ + vput(vp); + *vpp = NULL; + return (error); + } + } +#endif + + *vpp = vp; + return (0); +} + +/* + * File handle to vnode + * + * Have to be really careful about stale file handles: + * - check that the inode number is valid + * - for UFS2 check that the inode number is initialized + * - call ffs_vget() to get the locked inode + * - check for an unallocated inode (i_mode == 0) + * - check that the given client host has export rights and return + * those rights via. exflagsp and credanonp + */ +static int +ffs_fhtovp(mp, fhp, flags, vpp) + struct mount *mp; + struct fid *fhp; + int flags; + struct vnode **vpp; +{ + struct ufid *ufhp; + struct ufsmount *ump; + struct fs *fs; + struct cg *cgp; + struct buf *bp; + ino_t ino; + u_int cg; + int error; + + ufhp = (struct ufid *)fhp; + ino = ufhp->ufid_ino; + ump = VFSTOUFS(mp); + fs = ump->um_fs; + if (ino < ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) + return (ESTALE); + /* + * Need to check if inode is initialized because UFS2 does lazy + * initialization and nfs_fhtovp can offer arbitrary inode numbers. + */ + if (fs->fs_magic != FS_UFS2_MAGIC) + return (ufs_fhtovp(mp, ufhp, flags, vpp)); + cg = ino_to_cg(fs, ino); + error = bread(ump->um_devvp, fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize, NOCRED, &bp); + if (error) + return (error); + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp) || ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { + brelse(bp); + return (ESTALE); + } + brelse(bp); + return (ufs_fhtovp(mp, ufhp, flags, vpp)); +} + +/* + * Initialize the filesystem. + */ +static int +ffs_init(vfsp) + struct vfsconf *vfsp; +{ + + ffs_susp_initialize(); + softdep_initialize(); + return (ufs_init(vfsp)); +} + +/* + * Undo the work of ffs_init(). 
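+ * The subsystems are torn down in the reverse of the order in which
+ * ffs_init() set them up.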
+ */ +static int +ffs_uninit(vfsp) + struct vfsconf *vfsp; +{ + int ret; + + ret = ufs_uninit(vfsp); + softdep_uninitialize(); + ffs_susp_uninitialize(); + return (ret); +} + +/* + * Write a superblock and associated information back to disk. + */ +int +ffs_sbupdate(ump, waitfor, suspended) + struct ufsmount *ump; + int waitfor; + int suspended; +{ + struct fs *fs = ump->um_fs; + struct buf *sbbp; + struct buf *bp; + int blks; + void *space; + int i, size, error, allerror = 0; + + if (fs->fs_ronly == 1 && + (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != + (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) + panic("ffs_sbupdate: write read-only filesystem"); + /* + * We use the superblock's buf to serialize calls to ffs_sbupdate(). + */ + sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); + /* + * First write back the summary information. + */ + blks = howmany(fs->fs_cssize, fs->fs_fsize); + space = fs->fs_csp; + for (i = 0; i < blks; i += fs->fs_frag) { + size = fs->fs_bsize; + if (i + fs->fs_frag > blks) + size = (blks - i) * fs->fs_fsize; + bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + size, 0, 0, 0); + bcopy(space, bp->b_data, (u_int)size); + space = (char *)space + size; + if (suspended) + bp->b_flags |= B_VALIDSUSPWRT; + if (waitfor != MNT_WAIT) + bawrite(bp); + else if ((error = bwrite(bp)) != 0) + allerror = error; + } + /* + * Now write back the superblock itself. If any errors occurred + * up to this point, then fail so that the superblock avoids + * being written out as clean. + */ + if (allerror) { + brelse(sbbp); + return (allerror); + } + bp = sbbp; + if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && + (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { + printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", + fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); + fs->fs_sblockloc = SBLOCK_UFS1; + } + if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && + (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { + printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", + fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); + fs->fs_sblockloc = SBLOCK_UFS2; + } + fs->fs_fmod = 0; + fs->fs_time = time_second; + if (MOUNTEDSOFTDEP(ump->um_mountp)) + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); + bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); + if (suspended) + bp->b_flags |= B_VALIDSUSPWRT; + if (waitfor != MNT_WAIT) + bawrite(bp); + else if ((error = bwrite(bp)) != 0) + allerror = error; + return (allerror); +} + +static int +ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, + int attrnamespace, const char *attrname) +{ + +#ifdef UFS_EXTATTR + return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, + attrname)); +#else + return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, + attrname)); +#endif +} + +static void +ffs_ifree(struct ufsmount *ump, struct inode *ip) +{ + + if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) + uma_zfree(uma_ufs1, ip->i_din1); + else if (ip->i_din2 != NULL) + uma_zfree(uma_ufs2, ip->i_din2); + uma_zfree(uma_inode, ip); +} + +static int dobkgrdwrite = 1; +SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, + "Do background writes (honoring the BV_BKGRDWRITE flag)?"); + +/* + * Complete a background write started from bwrite. 
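+ * This runs as the b_iodone handler of the copy created in
+ * ffs_bufwrite() below: unfinished dependencies are handed back to
+ * the original buffer and BV_BKGRDINPROG is cleared so that any
+ * waiter on the original can continue.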
+ */ +static void +ffs_backgroundwritedone(struct buf *bp) +{ + struct bufobj *bufobj; + struct buf *origbp; + + /* + * Find the original buffer that we are writing. + */ + bufobj = bp->b_bufobj; + BO_LOCK(bufobj); + if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) + panic("backgroundwritedone: lost buffer"); + + /* + * We should mark the cylinder group buffer origbp as + * dirty, to not loose the failed write. + */ + if ((bp->b_ioflags & BIO_ERROR) != 0) + origbp->b_vflags |= BV_BKGRDERR; + BO_UNLOCK(bufobj); + /* + * Process dependencies then return any unfinished ones. + */ + if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) + buf_complete(bp); +#ifdef SOFTUPDATES + if (!LIST_EMPTY(&bp->b_dep)) + softdep_move_dependencies(bp, origbp); +#endif + /* + * This buffer is marked B_NOCACHE so when it is released + * by biodone it will be tossed. + */ + bp->b_flags |= B_NOCACHE; + bp->b_flags &= ~B_CACHE; + pbrelvp(bp); + + /* + * Prevent brelse() from trying to keep and re-dirtying bp on + * errors. It causes b_bufobj dereference in + * bdirty()/reassignbuf(), and b_bufobj was cleared in + * pbrelvp() above. + */ + if ((bp->b_ioflags & BIO_ERROR) != 0) + bp->b_flags |= B_INVAL; + bufdone(bp); + BO_LOCK(bufobj); + /* + * Clear the BV_BKGRDINPROG flag in the original buffer + * and awaken it if it is waiting for the write to complete. + * If BV_BKGRDINPROG is not set in the original buffer it must + * have been released and re-instantiated - which is not legal. + */ + KASSERT((origbp->b_vflags & BV_BKGRDINPROG), + ("backgroundwritedone: lost buffer2")); + origbp->b_vflags &= ~BV_BKGRDINPROG; + if (origbp->b_vflags & BV_BKGRDWAIT) { + origbp->b_vflags &= ~BV_BKGRDWAIT; + wakeup(&origbp->b_xflags); + } + BO_UNLOCK(bufobj); +} + + +/* + * Write, release buffer on completion. (Done by iodone + * if async). Do not bother writing anything if the buffer + * is invalid. + * + * Note that we set B_CACHE here, indicating that buffer is + * fully valid and thus cacheable. This is true even of NFS + * now so we set it generally. This could be set either here + * or in biodone() since the I/O is synchronous. We put it + * here. + */ +static int +ffs_bufwrite(struct buf *bp) +{ + struct buf *newbp; + + CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + if (!BUF_ISLOCKED(bp)) + panic("bufwrite: buffer is not busy???"); + /* + * If a background write is already in progress, delay + * writing this block if it is asynchronous. Otherwise + * wait for the background write to complete. + */ + BO_LOCK(bp->b_bufobj); + if (bp->b_vflags & BV_BKGRDINPROG) { + if (bp->b_flags & B_ASYNC) { + BO_UNLOCK(bp->b_bufobj); + bdwrite(bp); + return (0); + } + bp->b_vflags |= BV_BKGRDWAIT; + msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, + "bwrbg", 0); + if (bp->b_vflags & BV_BKGRDINPROG) + panic("bufwrite: still writing"); + } + bp->b_vflags &= ~BV_BKGRDERR; + BO_UNLOCK(bp->b_bufobj); + + /* + * If this buffer is marked for background writing and we + * do not have to wait for it, make a copy and write the + * copy so as to leave this buffer ready for further use. + * + * This optimization eats a lot of memory. If we have a page + * or buffer shortfall we can't do it. 
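+ * It is likewise abandoned when geteblk() cannot supply a copy
+ * buffer without sleeping.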
+ */ + if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && + (bp->b_flags & B_ASYNC) && + !vm_page_count_severe() && + !buf_dirty_count_severe()) { + KASSERT(bp->b_iodone == NULL, + ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); + + /* get a new block */ + newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); + if (newbp == NULL) + goto normal_write; + + KASSERT(buf_mapped(bp), ("Unmapped cg")); + memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); + BO_LOCK(bp->b_bufobj); + bp->b_vflags |= BV_BKGRDINPROG; + BO_UNLOCK(bp->b_bufobj); + newbp->b_xflags |= BX_BKGRDMARKER; + newbp->b_lblkno = bp->b_lblkno; + newbp->b_blkno = bp->b_blkno; + newbp->b_offset = bp->b_offset; + newbp->b_iodone = ffs_backgroundwritedone; + newbp->b_flags |= B_ASYNC; + newbp->b_flags &= ~B_INVAL; + pbgetvp(bp->b_vp, newbp); + +#ifdef SOFTUPDATES + /* + * Move over the dependencies. If there are rollbacks, + * leave the parent buffer dirtied as it will need to + * be written again. + */ + if (LIST_EMPTY(&bp->b_dep) || + softdep_move_dependencies(bp, newbp) == 0) + bundirty(bp); +#else + bundirty(bp); +#endif + + /* + * Initiate write on the copy, release the original. The + * BKGRDINPROG flag prevents it from going away until + * the background write completes. + */ + bqrelse(bp); + bp = newbp; + } else + /* Mark the buffer clean */ + bundirty(bp); + + + /* Let the normal bufwrite do the rest for us */ +normal_write: + return (bufwrite(bp)); +} + + +static void +ffs_geom_strategy(struct bufobj *bo, struct buf *bp) +{ + struct vnode *vp; + int error; + struct buf *tbp; + int nocopy; + + vp = bo->__bo_vnode; + if (bp->b_iocmd == BIO_WRITE) { + if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && + bp->b_vp != NULL && bp->b_vp->v_mount != NULL && + (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) + panic("ffs_geom_strategy: bad I/O"); + nocopy = bp->b_flags & B_NOCOPY; + bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); + if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && + vp->v_rdev->si_snapdata != NULL) { + if ((bp->b_flags & B_CLUSTER) != 0) { + runningbufwakeup(bp); + TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, + b_cluster.cluster_entry) { + error = ffs_copyonwrite(vp, tbp); + if (error != 0 && + error != EOPNOTSUPP) { + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); + return; + } + } + bp->b_runningbufspace = bp->b_bufsize; + atomic_add_long(&runningbufspace, + bp->b_runningbufspace); + } else { + error = ffs_copyonwrite(vp, bp); + if (error != 0 && error != EOPNOTSUPP) { + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); + return; + } + } + } +#ifdef SOFTUPDATES + if ((bp->b_flags & B_CLUSTER) != 0) { + TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, + b_cluster.cluster_entry) { + if (!LIST_EMPTY(&tbp->b_dep)) + buf_start(tbp); + } + } else { + if (!LIST_EMPTY(&bp->b_dep)) + buf_start(bp); + } + +#endif + } + g_vfs_strategy(bo, bp); +} + +int +ffs_own_mount(const struct mount *mp) +{ + + if (mp->mnt_op == &ufs_vfsops) + return (1); + return (0); +} + +#ifdef DDB +#ifdef SOFTUPDATES + +/* defined in ffs_softdep.c */ +extern void db_print_ffs(struct ufsmount *ump); + +DB_SHOW_COMMAND(ffs, db_show_ffs) +{ + struct mount *mp; + struct ufsmount *ump; + + if (have_addr) { + ump = VFSTOUFS((struct mount *)addr); + db_print_ffs(ump); + return; + } + + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) + db_print_ffs(VFSTOUFS(mp)); + } +} + +#endif /* SOFTUPDATES */ +#endif /* DDB */ diff --git a/Dump/ufs/ffs/ffs_vnops.c 
b/Dump/ufs/ffs/ffs_vnops.c new file mode 100644 index 0000000..50ceebe --- /dev/null +++ b/Dump/ufs/ffs/ffs_vnops.c @@ -0,0 +1,1745 @@ +/*- + * Copyright (c) 2002, 2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95 + * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ... + * @(#)ffs_vnops.c 8.15 (Berkeley) 5/14/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ffs/ffs_vnops.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include "opt_directio.h" +#include "opt_ffs.h" + +#ifdef DIRECTIO +extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone); +#endif +static vop_fdatasync_t ffs_fdatasync; +static vop_fsync_t ffs_fsync; +static vop_getpages_t ffs_getpages; +static vop_lock1_t ffs_lock; +static vop_read_t ffs_read; +static vop_write_t ffs_write; +static int ffs_extread(struct vnode *vp, struct uio *uio, int ioflag); +static int ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, + struct ucred *cred); +static vop_strategy_t ffsext_strategy; +static vop_closeextattr_t ffs_closeextattr; +static vop_deleteextattr_t ffs_deleteextattr; +static vop_getextattr_t ffs_getextattr; +static vop_listextattr_t ffs_listextattr; +static vop_openextattr_t ffs_openextattr; +static vop_setextattr_t ffs_setextattr; +static vop_vptofh_t ffs_vptofh; + +/* Global vfs data structures for ufs. */ +struct vop_vector ffs_vnodeops1 = { + .vop_default = &ufs_vnodeops, + .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, + .vop_getpages = ffs_getpages, + .vop_getpages_async = vnode_pager_local_getpages_async, + .vop_lock1 = ffs_lock, + .vop_read = ffs_read, + .vop_reallocblks = ffs_reallocblks, + .vop_write = ffs_write, + .vop_vptofh = ffs_vptofh, +}; + +struct vop_vector ffs_fifoops1 = { + .vop_default = &ufs_fifoops, + .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, + .vop_reallocblks = ffs_reallocblks, /* XXX: really ??? */ + .vop_vptofh = ffs_vptofh, +}; + +/* Global vfs data structures for ufs. */ +struct vop_vector ffs_vnodeops2 = { + .vop_default = &ufs_vnodeops, + .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, + .vop_getpages = ffs_getpages, + .vop_getpages_async = vnode_pager_local_getpages_async, + .vop_lock1 = ffs_lock, + .vop_read = ffs_read, + .vop_reallocblks = ffs_reallocblks, + .vop_write = ffs_write, + .vop_closeextattr = ffs_closeextattr, + .vop_deleteextattr = ffs_deleteextattr, + .vop_getextattr = ffs_getextattr, + .vop_listextattr = ffs_listextattr, + .vop_openextattr = ffs_openextattr, + .vop_setextattr = ffs_setextattr, + .vop_vptofh = ffs_vptofh, +}; + +struct vop_vector ffs_fifoops2 = { + .vop_default = &ufs_fifoops, + .vop_fsync = ffs_fsync, + .vop_fdatasync = ffs_fdatasync, + .vop_lock1 = ffs_lock, + .vop_reallocblks = ffs_reallocblks, + .vop_strategy = ffsext_strategy, + .vop_closeextattr = ffs_closeextattr, + .vop_deleteextattr = ffs_deleteextattr, + .vop_getextattr = ffs_getextattr, + .vop_listextattr = ffs_listextattr, + .vop_openextattr = ffs_openextattr, + .vop_setextattr = ffs_setextattr, + .vop_vptofh = ffs_vptofh, +}; + +/* + * Synch an open file. 
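+ * The real work is done by ffs_syncvnode(); with soft updates and
+ * MNT_WAIT we loop, because softdep_fsync() may drop the vnode lock
+ * and allow new dirty buffers to appear.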
+ */ +/* ARGSUSED */ +static int +ffs_fsync(struct vop_fsync_args *ap) +{ + struct vnode *vp; + struct bufobj *bo; + int error; + + vp = ap->a_vp; + bo = &vp->v_bufobj; +retry: + error = ffs_syncvnode(vp, ap->a_waitfor, 0); + if (error) + return (error); + if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) { + error = softdep_fsync(vp); + if (error) + return (error); + + /* + * The softdep_fsync() function may drop vp lock, + * allowing for dirty buffers to reappear on the + * bo_dirty list. Recheck and resync as needed. + */ + BO_LOCK(bo); + if ((vp->v_type == VREG || vp->v_type == VDIR) && + (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) { + BO_UNLOCK(bo); + goto retry; + } + BO_UNLOCK(bo); + } + return (0); +} + +int +ffs_syncvnode(struct vnode *vp, int waitfor, int flags) +{ + struct inode *ip; + struct bufobj *bo; + struct buf *bp, *nbp; + ufs_lbn_t lbn; + int error, passes; + bool still_dirty, wait; + + ip = VTOI(vp); + ip->i_flag &= ~IN_NEEDSYNC; + bo = &vp->v_bufobj; + + /* + * When doing MNT_WAIT we must first flush all dependencies + * on the inode. + */ + if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT && + (error = softdep_sync_metadata(vp)) != 0) + return (error); + + /* + * Flush all dirty buffers associated with a vnode. + */ + error = 0; + passes = 0; + wait = false; /* Always do an async pass first. */ + lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1)); + BO_LOCK(bo); +loop: + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) + bp->b_vflags &= ~BV_SCANNED; + TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { + /* + * Reasons to skip this buffer: it has already been considered + * on this pass, the buffer has dependencies that will cause + * it to be redirtied and it has not already been deferred, + * or it is already being written. + */ + if ((bp->b_vflags & BV_SCANNED) != 0) + continue; + bp->b_vflags |= BV_SCANNED; + /* + * Flush indirects in order, if requested. + * + * Note that if only datasync is requested, we can + * skip indirect blocks when softupdates are not + * active. Otherwise we must flush them with data, + * since dependencies prevent data block writes. + */ + if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR && + (lbn_level(bp->b_lblkno) >= passes || + ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp)))) + continue; + if (bp->b_lblkno > lbn) + panic("ffs_syncvnode: syncing truncated data."); + if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) { + BO_UNLOCK(bo); + } else if (wait) { + if (BUF_LOCK(bp, + LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, + BO_LOCKPTR(bo)) != 0) { + bp->b_vflags &= ~BV_SCANNED; + goto next; + } + } else + continue; + if ((bp->b_flags & B_DELWRI) == 0) + panic("ffs_fsync: not dirty"); + /* + * Check for dependencies and potentially complete them. + */ + if (!LIST_EMPTY(&bp->b_dep) && + (error = softdep_sync_buf(vp, bp, + wait ? MNT_WAIT : MNT_NOWAIT)) != 0) { + /* I/O error. */ + if (error != EBUSY) { + BUF_UNLOCK(bp); + return (error); + } + /* If we deferred once, don't defer again. */ + if ((bp->b_flags & B_DEFERRED) == 0) { + bp->b_flags |= B_DEFERRED; + BUF_UNLOCK(bp); + goto next; + } + } + if (wait) { + bremfree(bp); + if ((error = bwrite(bp)) != 0) + return (error); + } else if ((bp->b_flags & B_CLUSTEROK)) { + (void) vfs_bio_awrite(bp); + } else { + bremfree(bp); + (void) bawrite(bp); + } +next: + /* + * Since we may have slept during the I/O, we need + * to start from a known point. 
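+ * Restarting from the head of the dirty list is safe because buffers
+ * already handled on this pass carry BV_SCANNED and are skipped.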
+ */ + BO_LOCK(bo); + nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd); + } + if (waitfor != MNT_WAIT) { + BO_UNLOCK(bo); + if ((flags & NO_INO_UPDT) != 0) + return (0); + else + return (ffs_update(vp, 0)); + } + /* Drain IO to see if we're done. */ + bufobj_wwait(bo, 0, 0); + /* + * Block devices associated with filesystems may have new I/O + * requests posted for them even if the vnode is locked, so no + * amount of trying will get them clean. We make several passes + * as a best effort. + * + * Regular files may need multiple passes to flush all dependency + * work as it is possible that we must write once per indirect + * level, once for the leaf, and once for the inode and each of + * these will be done with one sync and one async pass. + */ + if (bo->bo_dirty.bv_cnt > 0) { + if ((flags & DATA_ONLY) == 0) { + still_dirty = true; + } else { + /* + * For data-only sync, dirty indirect buffers + * are ignored. + */ + still_dirty = false; + TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { + if (bp->b_lblkno > -NDADDR) { + still_dirty = true; + break; + } + } + } + + if (still_dirty) { + /* Write the inode after sync passes to flush deps. */ + if (wait && DOINGSOFTDEP(vp) && + (flags & NO_INO_UPDT) == 0) { + BO_UNLOCK(bo); + ffs_update(vp, 1); + BO_LOCK(bo); + } + /* switch between sync/async. */ + wait = !wait; + if (wait || ++passes < NIADDR + 2) + goto loop; +#ifdef INVARIANTS + if (!vn_isdisk(vp, NULL)) + vn_printf(vp, "ffs_fsync: dirty "); +#endif + } + } + BO_UNLOCK(bo); + error = 0; + if ((flags & DATA_ONLY) == 0) { + if ((flags & NO_INO_UPDT) == 0) + error = ffs_update(vp, 1); + if (DOINGSUJ(vp)) + softdep_journal_fsync(VTOI(vp)); + } + return (error); +} + +static int +ffs_fdatasync(struct vop_fdatasync_args *ap) +{ + + return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY)); +} + +static int +ffs_lock(ap) + struct vop_lock1_args /* { + struct vnode *a_vp; + int a_flags; + struct thread *a_td; + char *file; + int line; + } */ *ap; +{ +#ifndef NO_FFS_SNAPSHOT + struct vnode *vp; + int flags; + struct lock *lkp; + int result; + + switch (ap->a_flags & LK_TYPE_MASK) { + case LK_SHARED: + case LK_UPGRADE: + case LK_EXCLUSIVE: + vp = ap->a_vp; + flags = ap->a_flags; + for (;;) { +#ifdef DEBUG_VFS_LOCKS + KASSERT(vp->v_holdcnt != 0, + ("ffs_lock %p: zero hold count", vp)); +#endif + lkp = vp->v_vnlock; + result = _lockmgr_args(lkp, flags, VI_MTX(vp), + LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, + ap->a_file, ap->a_line); + if (lkp == vp->v_vnlock || result != 0) + break; + /* + * Apparent success, except that the vnode + * mutated between snapshot file vnode and + * regular file vnode while this process + * slept. The lock currently held is not the + * right lock. Release it, and try to get the + * new lock. + */ + (void) _lockmgr_args(lkp, LK_RELEASE, NULL, + LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT, + ap->a_file, ap->a_line); + if ((flags & (LK_INTERLOCK | LK_NOWAIT)) == + (LK_INTERLOCK | LK_NOWAIT)) + return (EBUSY); + if ((flags & LK_TYPE_MASK) == LK_UPGRADE) + flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE; + flags &= ~LK_INTERLOCK; + } + break; + default: + result = VOP_LOCK1_APV(&ufs_vnodeops, ap); + } + return (result); +#else + return (VOP_LOCK1_APV(&ufs_vnodeops, ap)); +#endif +} + +/* + * Vnode op for reading. 
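+ *
+ * Each pass of the loop below transfers at most one filesystem block.
+ * As an illustration (assuming a common 32768-byte fs_bsize), a read
+ * starting at offset 40000 maps to lbn = lblkno(fs, 40000) = 1 and
+ * blkoffset = blkoff(fs, 40000) = 7232, so at most
+ * xfersize = 32768 - 7232 = 25536 bytes are copied out of that block
+ * before the next iteration continues at a block boundary.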
+ */ +static int +ffs_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct inode *ip; + struct uio *uio; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + ssize_t orig_resid; + int error; + int seqcount; + int ioflag; + + vp = ap->a_vp; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + if (ap->a_ioflag & IO_EXT) +#ifdef notyet + return (ffs_extread(vp, uio, ioflag)); +#else + panic("ffs_read+IO_EXT"); +#endif +#ifdef DIRECTIO + if ((ioflag & IO_DIRECT) != 0) { + int workdone; + + error = ffs_rawread(vp, uio, &workdone); + if (error != 0 || workdone != 0) + return error; + } +#endif + + seqcount = ap->a_ioflag >> IO_SEQSHIFT; + ip = VTOI(vp); + +#ifdef INVARIANTS + if (uio->uio_rw != UIO_READ) + panic("ffs_read: mode"); + + if (vp->v_type == VLNK) { + if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) + panic("ffs_read: short symlink"); + } else if (vp->v_type != VREG && vp->v_type != VDIR) + panic("ffs_read: type %d", vp->v_type); +#endif + orig_resid = uio->uio_resid; + KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0")); + if (orig_resid == 0) + return (0); + KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0")); + fs = ITOFS(ip); + if (uio->uio_offset < ip->i_size && + uio->uio_offset >= fs->fs_maxfilesize) + return (EOVERFLOW); + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + + /* + * size of buffer. The buffer representing the + * end of the file is rounded up to the size of + * the block type ( fragment or full block, + * depending ). + */ + size = blksize(fs, ip, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + + /* + * The amount we want to transfer in this iteration is + * one FS block less the amount of the data before + * our startpoint (duh!) + */ + xfersize = fs->fs_bsize - blkoffset; + + /* + * But if we actually want less than the block, + * or the file doesn't have a whole block more of data, + * then use the lesser number. + */ + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= ip->i_size) { + /* + * Don't do readahead if this is the end of the file. + */ + error = bread_gb(vp, lbn, size, NOCRED, + GB_UNMAPPED, &bp); + } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { + /* + * Otherwise if we are allowed to cluster, + * grab as much as we can. + * + * XXX This may not be a win if we are not + * doing sequential access. + */ + error = cluster_read(vp, ip->i_size, lbn, + size, NOCRED, blkoffset + uio->uio_resid, + seqcount, GB_UNMAPPED, &bp); + } else if (seqcount > 1) { + /* + * If we are NOT allowed to cluster, then + * if we appear to be acting sequentially, + * fire off a request for a readahead + * as well as a read. Note that the 4th and 5th + * arguments point to arrays of the size specified in + * the 6th argument. + */ + u_int nextsize = blksize(fs, ip, nextlbn); + error = breadn_flags(vp, lbn, size, &nextlbn, + &nextsize, 1, NOCRED, GB_UNMAPPED, &bp); + } else { + /* + * Failing all of the above, just read what the + * user asked for. Interestingly, the same as + * the first option above. 
+ */ + error = bread_gb(vp, lbn, size, NOCRED, + GB_UNMAPPED, &bp); + } + if (error) { + brelse(bp); + bp = NULL; + break; + } + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + + if (buf_mapped(bp)) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } + if (error) + break; + + vfs_bio_brelse(bp, ioflag); + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. + * so it must have come from a 'break' statement + */ + if (bp != NULL) + vfs_bio_brelse(bp, ioflag); + + if ((error == 0 || uio->uio_resid != orig_resid) && + (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && + (ip->i_flag & IN_ACCESS) == 0) { + VI_LOCK(vp); + ip->i_flag |= IN_ACCESS; + VI_UNLOCK(vp); + } + return (error); +} + +/* + * Vnode op for writing. + */ +static int +ffs_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp; + struct uio *uio; + struct inode *ip; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn; + off_t osize; + ssize_t resid; + int seqcount; + int blkoffset, error, flags, ioflag, size, xfersize; + + vp = ap->a_vp; + uio = ap->a_uio; + ioflag = ap->a_ioflag; + if (ap->a_ioflag & IO_EXT) +#ifdef notyet + return (ffs_extwrite(vp, uio, ioflag, ap->a_cred)); +#else + panic("ffs_write+IO_EXT"); +#endif + + seqcount = ap->a_ioflag >> IO_SEQSHIFT; + ip = VTOI(vp); + +#ifdef INVARIANTS + if (uio->uio_rw != UIO_WRITE) + panic("ffs_write: mode"); +#endif + + switch (vp->v_type) { + case VREG: + if (ioflag & IO_APPEND) + uio->uio_offset = ip->i_size; + if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) + return (EPERM); + /* FALLTHROUGH */ + case VLNK: + break; + case VDIR: + panic("ffs_write: dir write"); + break; + default: + panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type, + (int)uio->uio_offset, + (int)uio->uio_resid + ); + } + + KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0")); + KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0")); + fs = ITOFS(ip); + if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) + return (EFBIG); + /* + * Maybe this should be above the vnode op call, but so long as + * file servers have no limits, I don't think it matters. + */ + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) + return (EFBIG); + + resid = uio->uio_resid; + osize = ip->i_size; + if (seqcount > BA_SEQMAX) + flags = BA_SEQMAX << BA_SEQSHIFT; + else + flags = seqcount << BA_SEQSHIFT; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags |= IO_SYNC; + flags |= BA_UNMAPPED; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (uio->uio_offset + xfersize > ip->i_size) + vnode_pager_setsize(vp, uio->uio_offset + xfersize); + + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer. 
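+	 * For example, a 100-byte write into a 32768-byte block only
+	 * dirties part of the buffer, so BA_CLRBUF is passed to
+	 * UFS_BALLOC() below and the existing block contents are read
+	 * in (a newly allocated block is simply zeroed) before
+	 * uiomove() overwrites the affected range.  A write that covers
+	 * the whole block skips that extra work.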
+ */ + if (fs->fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; +/* XXX is uio->uio_offset the right thing here? */ + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ap->a_cred, flags, &bp); + if (error != 0) { + vnode_pager_setsize(vp, ip->i_size); + break; + } + if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL)) + bp->b_flags |= B_NOCACHE; + + if (uio->uio_offset + xfersize > ip->i_size) { + ip->i_size = uio->uio_offset + xfersize; + DIP_SET(ip, i_size, ip->i_size); + } + + size = blksize(fs, ip, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + if (buf_mapped(bp)) { + error = vn_io_fault_uiomove((char *)bp->b_data + + blkoffset, (int)xfersize, uio); + } else { + error = vn_io_fault_pgmove(bp->b_pages, blkoffset, + (int)xfersize, uio); + } + /* + * If the buffer is not already filled and we encounter an + * error while trying to fill it, we have to clear out any + * garbage data from the pages instantiated for the buffer. + * If we do not, a failed uiomove() during a write can leave + * the prior contents of the pages exposed to a userland mmap. + * + * Note that we need only clear buffers with a transfer size + * equal to the block size because buffers with a shorter + * transfer size were cleared above by the call to UFS_BALLOC() + * with the BA_CLRBUF flag set. + * + * If the source region for uiomove identically mmaps the + * buffer, uiomove() performed the NOP copy, and the buffer + * content remains valid because the page fault handler + * validated the pages. + */ + if (error != 0 && (bp->b_flags & B_CACHE) == 0 && + fs->fs_bsize == xfersize) + vfs_bio_clrbuf(bp); + + vfs_bio_set_flags(bp, ioflag); + + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ + if (ioflag & IO_SYNC) { + (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + (ioflag & IO_ASYNC)) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); + } else if (xfersize + blkoffset == fs->fs_bsize) { + if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { + bp->b_flags |= B_CLUSTEROK; + cluster_write(vp, bp, ip->i_size, seqcount, + GB_UNMAPPED); + } else { + bawrite(bp); + } + } else if (ioflag & IO_DIRECT) { + bp->b_flags |= B_CLUSTEROK; + bawrite(bp); + } else { + bp->b_flags |= B_CLUSTEROK; + bdwrite(bp); + } + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && + ap->a_cred) { + if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_SET(ip, i_mode, ip->i_mode); + } + } + if (error) { + if (ioflag & IO_UNIT) { + (void)ffs_truncate(vp, osize, + IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = ffs_update(vp, 1); + return (error); +} + +/* + * Extended attribute area reading. 
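+ *
+ * The extended attribute area occupies its own small set of blocks
+ * addressed with negative logical block numbers: block N of the EA
+ * area is read as logical block (-1 - N), which is how the code below
+ * keeps it distinct from the regular data blocks of the vnode.  Its
+ * length is recorded in di_extsize in the UFS2 dinode.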
+ */ +static int +ffs_extread(struct vnode *vp, struct uio *uio, int ioflag) +{ + struct inode *ip; + struct ufs2_dinode *dp; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn, nextlbn; + off_t bytesinfile; + long size, xfersize, blkoffset; + ssize_t orig_resid; + int error; + + ip = VTOI(vp); + fs = ITOFS(ip); + dp = ip->i_din2; + +#ifdef INVARIANTS + if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC) + panic("ffs_extread: mode"); + +#endif + orig_resid = uio->uio_resid; + KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0")); + if (orig_resid == 0) + return (0); + KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0")); + + for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { + if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0) + break; + lbn = lblkno(fs, uio->uio_offset); + nextlbn = lbn + 1; + + /* + * size of buffer. The buffer representing the + * end of the file is rounded up to the size of + * the block type ( fragment or full block, + * depending ). + */ + size = sblksize(fs, dp->di_extsize, lbn); + blkoffset = blkoff(fs, uio->uio_offset); + + /* + * The amount we want to transfer in this iteration is + * one FS block less the amount of the data before + * our startpoint (duh!) + */ + xfersize = fs->fs_bsize - blkoffset; + + /* + * But if we actually want less than the block, + * or the file doesn't have a whole block more of data, + * then use the lesser number. + */ + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + if (bytesinfile < xfersize) + xfersize = bytesinfile; + + if (lblktosize(fs, nextlbn) >= dp->di_extsize) { + /* + * Don't do readahead if this is the end of the info. + */ + error = bread(vp, -1 - lbn, size, NOCRED, &bp); + } else { + /* + * If we have a second block, then + * fire off a request for a readahead + * as well as a read. Note that the 4th and 5th + * arguments point to arrays of the size specified in + * the 6th argument. + */ + u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn); + + nextlbn = -1 - nextlbn; + error = breadn(vp, -1 - lbn, + size, &nextlbn, &nextsize, 1, NOCRED, &bp); + } + if (error) { + brelse(bp); + bp = NULL; + break; + } + + /* + * We should only get non-zero b_resid when an I/O error + * has occurred, which should cause us to break above. + * However, if the short read did not cause an error, + * then we want to ensure that we do not uiomove bad + * or uninitialized data. + */ + size -= bp->b_resid; + if (size < xfersize) { + if (size == 0) + break; + xfersize = size; + } + + error = uiomove((char *)bp->b_data + blkoffset, + (int)xfersize, uio); + if (error) + break; + vfs_bio_brelse(bp, ioflag); + } + + /* + * This can only happen in the case of an error + * because the loop above resets bp to NULL on each iteration + * and on normal completion has not set a new value into it. + * so it must have come from a 'break' statement + */ + if (bp != NULL) + vfs_bio_brelse(bp, ioflag); + return (error); +} + +/* + * Extended attribute area writing. 
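+ *
+ * The EA area can occupy at most NXADDR blocks, so writes that would
+ * extend it past NXADDR * fs_bsize fail with EFBIG below.  As a rough
+ * illustration, with NXADDR = 2 and a 32768-byte block size the
+ * per-inode extended attribute area is capped at 64 kB.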
+ */ +static int +ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred) +{ + struct inode *ip; + struct ufs2_dinode *dp; + struct fs *fs; + struct buf *bp; + ufs_lbn_t lbn; + off_t osize; + ssize_t resid; + int blkoffset, error, flags, size, xfersize; + + ip = VTOI(vp); + fs = ITOFS(ip); + dp = ip->i_din2; + +#ifdef INVARIANTS + if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC) + panic("ffs_extwrite: mode"); +#endif + + if (ioflag & IO_APPEND) + uio->uio_offset = dp->di_extsize; + KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0")); + KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0")); + if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize) + return (EFBIG); + + resid = uio->uio_resid; + osize = dp->di_extsize; + flags = IO_EXT; + if ((ioflag & IO_SYNC) && !DOINGASYNC(vp)) + flags |= IO_SYNC; + + for (error = 0; uio->uio_resid > 0;) { + lbn = lblkno(fs, uio->uio_offset); + blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; + if (uio->uio_resid < xfersize) + xfersize = uio->uio_resid; + + /* + * We must perform a read-before-write if the transfer size + * does not cover the entire buffer. + */ + if (fs->fs_bsize > xfersize) + flags |= BA_CLRBUF; + else + flags &= ~BA_CLRBUF; + error = UFS_BALLOC(vp, uio->uio_offset, xfersize, + ucred, flags, &bp); + if (error != 0) + break; + /* + * If the buffer is not valid we have to clear out any + * garbage data from the pages instantiated for the buffer. + * If we do not, a failed uiomove() during a write can leave + * the prior contents of the pages exposed to a userland + * mmap(). XXX deal with uiomove() errors a better way. + */ + if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize) + vfs_bio_clrbuf(bp); + + if (uio->uio_offset + xfersize > dp->di_extsize) + dp->di_extsize = uio->uio_offset + xfersize; + + size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid; + if (size < xfersize) + xfersize = size; + + error = + uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); + + vfs_bio_set_flags(bp, ioflag); + + /* + * If IO_SYNC each buffer is written synchronously. Otherwise + * if we have a severe page deficiency write the buffer + * asynchronously. Otherwise try to cluster, and if that + * doesn't do it then either do an async write (if O_DIRECT), + * or a delayed write (if not). + */ + if (ioflag & IO_SYNC) { + (void)bwrite(bp); + } else if (vm_page_count_severe() || + buf_dirty_count_severe() || + xfersize + blkoffset == fs->fs_bsize || + (ioflag & (IO_ASYNC | IO_DIRECT))) + bawrite(bp); + else + bdwrite(bp); + if (error || xfersize == 0) + break; + ip->i_flag |= IN_CHANGE; + } + /* + * If we successfully wrote any data, and we are not the superuser + * we clear the setuid and setgid bits as a precaution against + * tampering. + */ + if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) { + if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) { + ip->i_mode &= ~(ISUID | ISGID); + dp->di_mode = ip->i_mode; + } + } + if (error) { + if (ioflag & IO_UNIT) { + (void)ffs_truncate(vp, osize, + IO_EXT | (ioflag&IO_SYNC), ucred); + uio->uio_offset -= resid - uio->uio_resid; + uio->uio_resid = resid; + } + } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) + error = ffs_update(vp, 1); + return (error); +} + + +/* + * Vnode operating to retrieve a named extended attribute. 
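+ *
+ * Each attribute in the EA area is stored as one variable-length,
+ * 8-byte-aligned record.  Reconstructed from the parsing and building
+ * code below (ffs_findextattr() and ffs_setextattr()), the layout is
+ * roughly:
+ *
+ *	uint32_t length;	total record length, including padding
+ *	uint8_t  namespace;	EXTATTR_NAMESPACE_* value
+ *	uint8_t  contentpadlen;	bytes of padding after the content
+ *	uint8_t  namelen;	length of the name that follows
+ *	char     name[namelen];	attribute name, padded so that the
+ *				header plus name is a multiple of 8
+ *	char     content[];	attribute data, followed by
+ *				contentpadlen bytes of padding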
+ * + * Locate a particular EA (nspace:name) in the area (ptr:length), and return + * the length of the EA, and possibly the pointer to the entry and to the data. + */ +static int +ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name, u_char **eap, u_char **eac) +{ + u_char *p, *pe, *pn, *p0; + int eapad1, eapad2, ealength, ealen, nlen; + uint32_t ul; + + pe = ptr + length; + nlen = strlen(name); + + for (p = ptr; p < pe; p = pn) { + p0 = p; + bcopy(p, &ul, sizeof(ul)); + pn = p + ul; + /* make sure this entry is complete */ + if (pn > pe) + break; + p += sizeof(uint32_t); + if (*p != nspace) + continue; + p++; + eapad2 = *p++; + if (*p != nlen) + continue; + p++; + if (bcmp(p, name, nlen)) + continue; + ealength = sizeof(uint32_t) + 3 + nlen; + eapad1 = 8 - (ealength % 8); + if (eapad1 == 8) + eapad1 = 0; + ealength += eapad1; + ealen = ul - ealength - eapad2; + p += nlen + eapad1; + if (eap != NULL) + *eap = p0; + if (eac != NULL) + *eac = p; + return (ealen); + } + return(-1); +} + +static int +ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra) +{ + struct inode *ip; + struct ufs2_dinode *dp; + struct fs *fs; + struct uio luio; + struct iovec liovec; + u_int easize; + int error; + u_char *eae; + + ip = VTOI(vp); + fs = ITOFS(ip); + dp = ip->i_din2; + easize = dp->di_extsize; + if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize) + return (EFBIG); + + eae = malloc(easize + extra, M_TEMP, M_WAITOK); + + liovec.iov_base = eae; + liovec.iov_len = easize; + luio.uio_iov = &liovec; + luio.uio_iovcnt = 1; + luio.uio_offset = 0; + luio.uio_resid = easize; + luio.uio_segflg = UIO_SYSSPACE; + luio.uio_rw = UIO_READ; + luio.uio_td = td; + + error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC); + if (error) { + free(eae, M_TEMP); + return(error); + } + *p = eae; + return (0); +} + +static void +ffs_lock_ea(struct vnode *vp) +{ + struct inode *ip; + + ip = VTOI(vp); + VI_LOCK(vp); + while (ip->i_flag & IN_EA_LOCKED) { + ip->i_flag |= IN_EA_LOCKWAIT; + msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea", + 0); + } + ip->i_flag |= IN_EA_LOCKED; + VI_UNLOCK(vp); +} + +static void +ffs_unlock_ea(struct vnode *vp) +{ + struct inode *ip; + + ip = VTOI(vp); + VI_LOCK(vp); + if (ip->i_flag & IN_EA_LOCKWAIT) + wakeup(&ip->i_ea_refs); + ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT); + VI_UNLOCK(vp); +} + +static int +ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td) +{ + struct inode *ip; + struct ufs2_dinode *dp; + int error; + + ip = VTOI(vp); + + ffs_lock_ea(vp); + if (ip->i_ea_area != NULL) { + ip->i_ea_refs++; + ffs_unlock_ea(vp); + return (0); + } + dp = ip->i_din2; + error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0); + if (error) { + ffs_unlock_ea(vp); + return (error); + } + ip->i_ea_len = dp->di_extsize; + ip->i_ea_error = 0; + ip->i_ea_refs++; + ffs_unlock_ea(vp); + return (0); +} + +/* + * Vnode extattr transaction commit/abort + */ +static int +ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td) +{ + struct inode *ip; + struct uio luio; + struct iovec liovec; + int error; + struct ufs2_dinode *dp; + + ip = VTOI(vp); + + ffs_lock_ea(vp); + if (ip->i_ea_area == NULL) { + ffs_unlock_ea(vp); + return (EINVAL); + } + dp = ip->i_din2; + error = ip->i_ea_error; + if (commit && error == 0) { + ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit"); + if (cred == NOCRED) + cred = vp->v_mount->mnt_cred; + liovec.iov_base = ip->i_ea_area; + liovec.iov_len = ip->i_ea_len; + luio.uio_iov = &liovec; + luio.uio_iovcnt = 1; + 
luio.uio_offset = 0; + luio.uio_resid = ip->i_ea_len; + luio.uio_segflg = UIO_SYSSPACE; + luio.uio_rw = UIO_WRITE; + luio.uio_td = td; + /* XXX: I'm not happy about truncating to zero size */ + if (ip->i_ea_len < dp->di_extsize) + error = ffs_truncate(vp, 0, IO_EXT, cred); + error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred); + } + if (--ip->i_ea_refs == 0) { + free(ip->i_ea_area, M_TEMP); + ip->i_ea_area = NULL; + ip->i_ea_len = 0; + ip->i_ea_error = 0; + } + ffs_unlock_ea(vp); + return (error); +} + +/* + * Vnode extattr strategy routine for fifos. + * + * We need to check for a read or write of the external attributes. + * Otherwise we just fall through and do the usual thing. + */ +static int +ffsext_strategy(struct vop_strategy_args *ap) +/* +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + struct buf *a_bp; +}; +*/ +{ + struct vnode *vp; + daddr_t lbn; + + vp = ap->a_vp; + lbn = ap->a_bp->b_lblkno; + if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -NXADDR) + return (VOP_STRATEGY_APV(&ufs_vnodeops, ap)); + if (vp->v_type == VFIFO) + return (VOP_STRATEGY_APV(&ufs_fifoops, ap)); + panic("spec nodes went here"); +} + +/* + * Vnode extattr transaction commit/abort + */ +static int +ffs_openextattr(struct vop_openextattr_args *ap) +/* +struct vop_openextattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td)); +} + + +/* + * Vnode extattr transaction commit/abort + */ +static int +ffs_closeextattr(struct vop_closeextattr_args *ap) +/* +struct vop_closeextattr_args { + struct vnodeop_desc *a_desc; + struct vnode *a_vp; + int a_commit; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)) + return (EROFS); + + return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td)); +} + +/* + * Vnode operation to remove a named attribute. + */ +static int +ffs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct inode *ip; + struct fs *fs; + uint32_t ealength, ul; + int ealen, olen, eapad1, eapad2, error, i, easize; + u_char *eae, *p; + + ip = VTOI(ap->a_vp); + fs = ITOFS(ip); + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + if (strlen(ap->a_name) == 0) + return (EINVAL); + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error) { + + /* + * ffs_lock_ea is not needed there, because the vnode + * must be exclusively locked. 
+ */ + if (ip->i_ea_area != NULL && ip->i_ea_error == 0) + ip->i_ea_error = error; + return (error); + } + + error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); + if (error) + return (error); + + ealength = eapad1 = ealen = eapad2 = 0; + + eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK); + bcopy(ip->i_ea_area, eae, ip->i_ea_len); + easize = ip->i_ea_len; + + olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, + &p, NULL); + if (olen == -1) { + /* delete but nonexistent */ + free(eae, M_TEMP); + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + return(ENOATTR); + } + bcopy(p, &ul, sizeof ul); + i = p - eae + ul; + if (ul != ealength) { + bcopy(p + ul, p + ealength, easize - i); + easize += (ealength - ul); + } + if (easize > NXADDR * fs->fs_bsize) { + free(eae, M_TEMP); + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + if (ip->i_ea_area != NULL && ip->i_ea_error == 0) + ip->i_ea_error = ENOSPC; + return(ENOSPC); + } + p = ip->i_ea_area; + ip->i_ea_area = eae; + ip->i_ea_len = easize; + free(p, M_TEMP); + error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); + return(error); +} + +/* + * Vnode operation to retrieve a named extended attribute. + */ +static int +ffs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct inode *ip; + u_char *eae, *p; + unsigned easize; + int error, ealen; + + ip = VTOI(ap->a_vp); + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error) + return (error); + + error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); + if (error) + return (error); + + eae = ip->i_ea_area; + easize = ip->i_ea_len; + + ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name, + NULL, &p); + if (ealen >= 0) { + error = 0; + if (ap->a_size != NULL) + *ap->a_size = ealen; + else if (ap->a_uio != NULL) + error = uiomove(p, ealen, ap->a_uio); + } else + error = ENOATTR; + + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + return(error); +} + +/* + * Vnode operation to retrieve extended attributes on a vnode. 
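+ *
+ * The list is returned in the form used by extattr_list_file(2): for
+ * each attribute in the requested namespace a single length byte is
+ * copied out, followed by that many bytes of name (no NUL terminator
+ * and no attribute data).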
+ */ +static int +ffs_listextattr(struct vop_listextattr_args *ap) +/* +vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct inode *ip; + u_char *eae, *p, *pe, *pn; + unsigned easize; + uint32_t ul; + int error, ealen; + + ip = VTOI(ap->a_vp); + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error) + return (error); + + error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); + if (error) + return (error); + eae = ip->i_ea_area; + easize = ip->i_ea_len; + + error = 0; + if (ap->a_size != NULL) + *ap->a_size = 0; + pe = eae + easize; + for(p = eae; error == 0 && p < pe; p = pn) { + bcopy(p, &ul, sizeof(ul)); + pn = p + ul; + if (pn > pe) + break; + p += sizeof(ul); + if (*p++ != ap->a_attrnamespace) + continue; + p++; /* pad2 */ + ealen = *p; + if (ap->a_size != NULL) { + *ap->a_size += ealen + 1; + } else if (ap->a_uio != NULL) { + error = uiomove(p, ealen + 1, ap->a_uio); + } + } + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + return(error); +} + +/* + * Vnode operation to set a named attribute. + */ +static int +ffs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct inode *ip; + struct fs *fs; + uint32_t ealength, ul; + ssize_t ealen; + int olen, eapad1, eapad2, error, i, easize; + u_char *eae, *p; + + ip = VTOI(ap->a_vp); + fs = ITOFS(ip); + + if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK) + return (EOPNOTSUPP); + + if (strlen(ap->a_name) == 0) + return (EINVAL); + + /* XXX Now unsupported API to delete EAs using NULL uio. */ + if (ap->a_uio == NULL) + return (EOPNOTSUPP); + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + ealen = ap->a_uio->uio_resid; + if (ealen < 0 || ealen > lblktosize(fs, NXADDR)) + return (EINVAL); + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error) { + + /* + * ffs_lock_ea is not needed there, because the vnode + * must be exclusively locked. 
+ */ + if (ip->i_ea_area != NULL && ip->i_ea_error == 0) + ip->i_ea_error = error; + return (error); + } + + error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td); + if (error) + return (error); + + ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name); + eapad1 = 8 - (ealength % 8); + if (eapad1 == 8) + eapad1 = 0; + eapad2 = 8 - (ealen % 8); + if (eapad2 == 8) + eapad2 = 0; + ealength += eapad1 + ealen + eapad2; + + eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK); + bcopy(ip->i_ea_area, eae, ip->i_ea_len); + easize = ip->i_ea_len; + + olen = ffs_findextattr(eae, easize, + ap->a_attrnamespace, ap->a_name, &p, NULL); + if (olen == -1) { + /* new, append at end */ + p = eae + easize; + easize += ealength; + } else { + bcopy(p, &ul, sizeof ul); + i = p - eae + ul; + if (ul != ealength) { + bcopy(p + ul, p + ealength, easize - i); + easize += (ealength - ul); + } + } + if (easize > lblktosize(fs, NXADDR)) { + free(eae, M_TEMP); + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + if (ip->i_ea_area != NULL && ip->i_ea_error == 0) + ip->i_ea_error = ENOSPC; + return(ENOSPC); + } + bcopy(&ealength, p, sizeof(ealength)); + p += sizeof(ealength); + *p++ = ap->a_attrnamespace; + *p++ = eapad2; + *p++ = strlen(ap->a_name); + strcpy(p, ap->a_name); + p += strlen(ap->a_name); + bzero(p, eapad1); + p += eapad1; + error = uiomove(p, ealen, ap->a_uio); + if (error) { + free(eae, M_TEMP); + ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td); + if (ip->i_ea_area != NULL && ip->i_ea_error == 0) + ip->i_ea_error = error; + return(error); + } + p += ealen; + bzero(p, eapad2); + + p = ip->i_ea_area; + ip->i_ea_area = eae; + ip->i_ea_len = easize; + free(p, M_TEMP); + error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td); + return(error); +} + +/* + * Vnode pointer to File handle + */ +static int +ffs_vptofh(struct vop_vptofh_args *ap) +/* +vop_vptofh { + IN struct vnode *a_vp; + IN struct fid *a_fhp; +}; +*/ +{ + struct inode *ip; + struct ufid *ufhp; + + ip = VTOI(ap->a_vp); + ufhp = (struct ufid *)ap->a_fhp; + ufhp->ufid_len = sizeof(struct ufid); + ufhp->ufid_ino = ip->i_number; + ufhp->ufid_gen = ip->i_gen; + return (0); +} + +SYSCTL_DECL(_vfs_ffs); +static int use_buf_pager = 0; +SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0, + "Always use buffer pager instead of bmap"); + +static daddr_t +ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off) +{ + + return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off)); +} + +static int +ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn) +{ + + return (blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn)); +} + +static int +ffs_getpages(struct vop_getpages_args *ap) +{ + struct vnode *vp; + struct ufsmount *um; + + vp = ap->a_vp; + um = VFSTOUFS(vp->v_mount); + + if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) + return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count, + ap->a_rbehind, ap->a_rahead, NULL, NULL)); + return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz)); +} diff --git a/Dump/ufs/ffs/fs.h b/Dump/ufs/ffs/fs.h new file mode 100644 index 0000000..233b347 --- /dev/null +++ b/Dump/ufs/ffs/fs.h @@ -0,0 +1,792 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)fs.h 8.13 (Berkeley) 3/21/95 + * $FreeBSD: releng/11.2/sys/ufs/ffs/fs.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_FFS_FS_H_ +#define _UFS_FFS_FS_H_ + +#include +#include + +/* + * Each disk drive contains some number of filesystems. + * A filesystem consists of a number of cylinder groups. + * Each cylinder group has inodes and data. + * + * A filesystem is described by its super-block, which in turn + * describes the cylinder groups. The super-block is critical + * data and is replicated in each cylinder group to protect against + * catastrophic loss. This is done at `newfs' time and the critical + * super-block data does not change, so the copies need not be + * referenced further unless disaster strikes. + * + * For filesystem fs, the offsets of the various blocks of interest + * are given in the super block as: + * [fs->fs_sblkno] Super-block + * [fs->fs_cblkno] Cylinder group block + * [fs->fs_iblkno] Inode blocks + * [fs->fs_dblkno] Data blocks + * The beginning of cylinder group cg in fs, is given by + * the ``cgbase(fs, cg)'' macro. + * + * Depending on the architecture and the media, the superblock may + * reside in any one of four places. For tiny media where every block + * counts, it is placed at the very front of the partition. Historically, + * UFS1 placed it 8K from the front to leave room for the disk label and + * a small bootstrap. For UFS2 it got moved to 64K from the front to leave + * room for the disk label and a bigger bootstrap, and for really piggy + * systems we check at 256K from the front if the first three fail. In + * all cases the size of the superblock will be SBLOCKSIZE. All values are + * given in byte-offset form, so they do not imply a sector size. The + * SBLOCKSEARCH specifies the order in which the locations should be searched. + */ +#define SBLOCK_FLOPPY 0 +#define SBLOCK_UFS1 8192 +#define SBLOCK_UFS2 65536 +#define SBLOCK_PIGGY 262144 +#define SBLOCKSIZE 8192 +#define SBLOCKSEARCH \ + { SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } + +/* + * Max number of fragments per block. This value is NOT tweakable. 
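+ *
+ * The block size may therefore be at most 8 times the fragment size;
+ * e.g., the common 32768-byte block / 4096-byte fragment layout uses
+ * the full ratio of 8.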
+ */ +#define MAXFRAG 8 + +/* + * Addresses stored in inodes are capable of addressing fragments + * of `blocks'. File system blocks of at most size MAXBSIZE can + * be optionally broken into 2, 4, or 8 pieces, each of which is + * addressable; these pieces may be DEV_BSIZE, or some multiple of + * a DEV_BSIZE unit. + * + * Large files consist of exclusively large data blocks. To avoid + * undue wasted disk space, the last data block of a small file may be + * allocated as only as many fragments of a large block as are + * necessary. The filesystem format retains only a single pointer + * to such a fragment, which is a piece of a single large block that + * has been divided. The size of such a fragment is determinable from + * information in the inode, using the ``blksize(fs, ip, lbn)'' macro. + * + * The filesystem records space availability at the fragment level; + * to determine block availability, aligned fragments are examined. + */ + +/* + * MINBSIZE is the smallest allowable block size. + * In order to insure that it is possible to create files of size + * 2^32 with only two levels of indirection, MINBSIZE is set to 4096. + * MINBSIZE must be big enough to hold a cylinder group block, + * thus changes to (struct cg) must keep its size within MINBSIZE. + * Note that super blocks are always of size SBLOCKSIZE, + * and that both SBLOCKSIZE and MAXBSIZE must be >= MINBSIZE. + */ +#define MINBSIZE 4096 + +/* + * The path name on which the filesystem is mounted is maintained + * in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in + * the super block for this name. + */ +#define MAXMNTLEN 468 + +/* + * The volume name for this filesystem is maintained in fs_volname. + * MAXVOLLEN defines the length of the buffer allocated. + */ +#define MAXVOLLEN 32 + +/* + * There is a 128-byte region in the superblock reserved for in-core + * pointers to summary information. Originally this included an array + * of pointers to blocks of struct csum; now there are just a few + * pointers and the remaining space is padded with fs_ocsp[]. + * + * NOCSPTRS determines the size of this padding. One pointer (fs_csp) + * is taken away to point to a contiguous array of struct csum for + * all cylinder groups; a second (fs_maxcluster) points to an array + * of cluster sizes that is computed as cylinder groups are inspected, + * and the third points to an array that tracks the creation of new + * directories. A fourth pointer, fs_active, is used when creating + * snapshots; it points to a bitmap of cylinder groups for which the + * free-block bitmap has changed since the snapshot operation began. + */ +#define NOCSPTRS ((128 / sizeof(void *)) - 4) + +/* + * A summary of contiguous blocks of various sizes is maintained + * in each cylinder group. Normally this is set by the initial + * value of fs_maxcontig. To conserve space, a maximum summary size + * is set by FS_MAXCONTIG. + */ +#define FS_MAXCONTIG 16 + +/* + * MINFREE gives the minimum acceptable percentage of filesystem + * blocks which may be free. If the freelist drops below this level + * only the superuser may continue to allocate blocks. This may + * be set to 0 if no reserve of free blocks is deemed necessary, + * however throughput drops by fifty percent if the filesystem + * is run at between 95% and 100% full; thus the minimum default + * value of fs_minfree is 5%. However, to get good clustering + * performance, 10% is a better choice. hence we use 10% as our + * default value. 
With 10% free space, fragmentation is not a + * problem, so we choose to optimize for time. + */ +#define MINFREE 8 +#define DEFAULTOPT FS_OPTTIME + +/* + * Grigoriy Orlov has done some extensive work to fine + * tune the layout preferences for directories within a filesystem. + * His algorithm can be tuned by adjusting the following parameters + * which tell the system the average file size and the average number + * of files per directory. These defaults are well selected for typical + * filesystems, but may need to be tuned for odd cases like filesystems + * being used for squid caches or news spools. + */ +#define AVFILESIZ 16384 /* expected average file size */ +#define AFPDIR 64 /* expected number of files per directory */ + +/* + * The maximum number of snapshot nodes that can be associated + * with each filesystem. This limit affects only the number of + * snapshot files that can be recorded within the superblock so + * that they can be found when the filesystem is mounted. However, + * maintaining too many will slow the filesystem performance, so + * having this limit is a good idea. + */ +#define FSMAXSNAP 20 + +/* + * Used to identify special blocks in snapshots: + * + * BLK_NOCOPY - A block that was unallocated at the time the snapshot + * was taken, hence does not need to be copied when written. + * BLK_SNAP - A block held by another snapshot that is not needed by this + * snapshot. When the other snapshot is freed, the BLK_SNAP entries + * are converted to BLK_NOCOPY. These are needed to allow fsck to + * identify blocks that are in use by other snapshots (which are + * expunged from this snapshot). + */ +#define BLK_NOCOPY ((ufs2_daddr_t)(1)) +#define BLK_SNAP ((ufs2_daddr_t)(2)) + +/* + * Sysctl values for the fast filesystem. + */ +#define FFS_ADJ_REFCNT 1 /* adjust inode reference count */ +#define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */ +#define FFS_BLK_FREE 3 /* free range of blocks in map */ +#define FFS_DIR_FREE 4 /* free specified dir inodes in map */ +#define FFS_FILE_FREE 5 /* free specified file inodes in map */ +#define FFS_SET_FLAGS 6 /* set filesystem flags */ +#define FFS_ADJ_NDIR 7 /* adjust number of directories */ +#define FFS_ADJ_NBFREE 8 /* adjust number of free blocks */ +#define FFS_ADJ_NIFREE 9 /* adjust number of free inodes */ +#define FFS_ADJ_NFFREE 10 /* adjust number of free frags */ +#define FFS_ADJ_NUMCLUSTERS 11 /* adjust number of free clusters */ +#define FFS_SET_CWD 12 /* set current directory */ +#define FFS_SET_DOTDOT 13 /* set inode number for ".." */ +#define FFS_UNLINK 14 /* remove a name in the filesystem */ +#define FFS_SET_INODE 15 /* update an on-disk inode */ +#define FFS_SET_BUFOUTPUT 16 /* set buffered writing on descriptor */ +#define FFS_MAXID 16 /* number of valid ffs ids */ + +/* + * Command structure passed in to the filesystem to adjust filesystem values. + */ +#define FFS_CMD_VERSION 0x19790518 /* version ID */ +struct fsck_cmd { + int32_t version; /* version of command structure */ + int32_t handle; /* reference to filesystem to be changed */ + int64_t value; /* inode or block number to be affected */ + int64_t size; /* amount or range to be adjusted */ + int64_t spare; /* reserved for future use */ +}; + +/* + * A recovery structure placed at the end of the boot block area by newfs + * that can be used by fsck to search for alternate superblocks. 
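+ *
+ * The padding pushes the five 32-bit recovery fields into the last
+ * 20 bytes of a 4096-byte sector (RESID below is 4096 - 20), giving
+ * fsck a fixed place to read fsr_fpg, fsr_ncg and friends from when
+ * the primary superblock cannot be used.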
+ */ +#define RESID (4096 - 20) /* disk sector size minus recovery area size */ +struct fsrecovery { + char block[RESID]; /* unused part of sector */ + int32_t fsr_magic; /* magic number */ + int32_t fsr_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + int32_t fsr_sblkno; /* offset of super-block in filesys */ + int32_t fsr_fpg; /* blocks per group * fs_frag */ + u_int32_t fsr_ncg; /* number of cylinder groups */ +}; + +/* + * Per cylinder group information; summarized in blocks allocated + * from first cylinder group data blocks. These blocks have to be + * read in from fs_csaddr (size fs_cssize) in addition to the + * super block. + */ +struct csum { + int32_t cs_ndir; /* number of directories */ + int32_t cs_nbfree; /* number of free blocks */ + int32_t cs_nifree; /* number of free inodes */ + int32_t cs_nffree; /* number of free frags */ +}; +struct csum_total { + int64_t cs_ndir; /* number of directories */ + int64_t cs_nbfree; /* number of free blocks */ + int64_t cs_nifree; /* number of free inodes */ + int64_t cs_nffree; /* number of free frags */ + int64_t cs_numclusters; /* number of free clusters */ + int64_t cs_spare[3]; /* future expansion */ +}; + +/* + * Super block for an FFS filesystem. + */ +struct fs { + int32_t fs_firstfield; /* historic filesystem linked list, */ + int32_t fs_unused_1; /* used for incore super blocks */ + int32_t fs_sblkno; /* offset of super-block in filesys */ + int32_t fs_cblkno; /* offset of cyl-block in filesys */ + int32_t fs_iblkno; /* offset of inode-blocks in filesys */ + int32_t fs_dblkno; /* offset of first data after cg */ + int32_t fs_old_cgoffset; /* cylinder group offset in cylinder */ + int32_t fs_old_cgmask; /* used to calc mod fs_ntrak */ + int32_t fs_old_time; /* last time written */ + int32_t fs_old_size; /* number of blocks in fs */ + int32_t fs_old_dsize; /* number of data blocks in fs */ + u_int32_t fs_ncg; /* number of cylinder groups */ + int32_t fs_bsize; /* size of basic blocks in fs */ + int32_t fs_fsize; /* size of frag blocks in fs */ + int32_t fs_frag; /* number of frags in a block in fs */ +/* these are configuration parameters */ + int32_t fs_minfree; /* minimum percentage of free blocks */ + int32_t fs_old_rotdelay; /* num of ms for optimal next block */ + int32_t fs_old_rps; /* disk revolutions per second */ +/* these fields can be computed from the others */ + int32_t fs_bmask; /* ``blkoff'' calc of blk offsets */ + int32_t fs_fmask; /* ``fragoff'' calc of frag offsets */ + int32_t fs_bshift; /* ``lblkno'' calc of logical blkno */ + int32_t fs_fshift; /* ``numfrags'' calc number of frags */ +/* these are configuration parameters */ + int32_t fs_maxcontig; /* max number of contiguous blks */ + int32_t fs_maxbpg; /* max number of blks per cyl group */ +/* these fields can be computed from the others */ + int32_t fs_fragshift; /* block to frag shift */ + int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */ + int32_t fs_sbsize; /* actual size of super block */ + int32_t fs_spare1[2]; /* old fs_csmask */ + /* old fs_csshift */ + int32_t fs_nindir; /* value of NINDIR */ + u_int32_t fs_inopb; /* value of INOPB */ + int32_t fs_old_nspf; /* value of NSPF */ +/* yet another configuration parameter */ + int32_t fs_optim; /* optimization preference, see below */ + int32_t fs_old_npsect; /* # sectors/track including spares */ + int32_t fs_old_interleave; /* hardware sector interleave */ + int32_t fs_old_trackskew; /* sector 0 skew, per track */ + int32_t fs_id[2]; /* unique filesystem id */ +/* sizes determined by 
number of cylinder groups and their sizes */ + int32_t fs_old_csaddr; /* blk addr of cyl grp summary area */ + int32_t fs_cssize; /* size of cyl grp summary area */ + int32_t fs_cgsize; /* cylinder group size */ + int32_t fs_spare2; /* old fs_ntrak */ + int32_t fs_old_nsect; /* sectors per track */ + int32_t fs_old_spc; /* sectors per cylinder */ + int32_t fs_old_ncyl; /* cylinders in filesystem */ + int32_t fs_old_cpg; /* cylinders per group */ + u_int32_t fs_ipg; /* inodes per group */ + int32_t fs_fpg; /* blocks per group * fs_frag */ +/* this data must be re-computed after crashes */ + struct csum fs_old_cstotal; /* cylinder summary information */ +/* these fields are cleared at mount time */ + int8_t fs_fmod; /* super block modified flag */ + int8_t fs_clean; /* filesystem is clean flag */ + int8_t fs_ronly; /* mounted read-only flag */ + int8_t fs_old_flags; /* old FS_ flags */ + u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */ + u_char fs_volname[MAXVOLLEN]; /* volume name */ + u_int64_t fs_swuid; /* system-wide uid */ + int32_t fs_pad; /* due to alignment of fs_swuid */ +/* these fields retain the current block allocation info */ + int32_t fs_cgrotor; /* last cg searched */ + void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */ + u_int8_t *fs_contigdirs; /* (u) # of contig. allocated dirs */ + struct csum *fs_csp; /* (u) cg summary info buffer */ + int32_t *fs_maxcluster; /* (u) max cluster in each cyl group */ + u_int *fs_active; /* (u) used by snapshots to track fs */ + int32_t fs_old_cpc; /* cyl per cycle in postbl */ + int32_t fs_maxbsize; /* maximum blocking factor permitted */ + int64_t fs_unrefs; /* number of unreferenced inodes */ + int64_t fs_providersize; /* size of underlying GEOM provider */ + int64_t fs_metaspace; /* size of area reserved for metadata */ + int64_t fs_sparecon64[14]; /* old rotation block list head */ + int64_t fs_sblockloc; /* byte offset of standard superblock */ + struct csum_total fs_cstotal; /* (u) cylinder summary information */ + ufs_time_t fs_time; /* last time written */ + int64_t fs_size; /* number of blocks in fs */ + int64_t fs_dsize; /* number of data blocks in fs */ + ufs2_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */ + int64_t fs_pendingblocks; /* (u) blocks being freed */ + u_int32_t fs_pendinginodes; /* (u) inodes being freed */ + uint32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */ + u_int32_t fs_avgfilesize; /* expected average file size */ + u_int32_t fs_avgfpdir; /* expected # of files per directory */ + int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */ + ufs_time_t fs_mtime; /* Last mount or fsck time. */ + int32_t fs_sujfree; /* SUJ free list */ + int32_t fs_sparecon32[23]; /* reserved for future constants */ + int32_t fs_flags; /* see FS_ flags below */ + int32_t fs_contigsumsize; /* size of cluster summary array */ + int32_t fs_maxsymlinklen; /* max length of an internal symlink */ + int32_t fs_old_inodefmt; /* format of on-disk inodes */ + u_int64_t fs_maxfilesize; /* maximum representable file size */ + int64_t fs_qbmask; /* ~fs_bmask for use with 64-bit size */ + int64_t fs_qfmask; /* ~fs_fmask for use with 64-bit size */ + int32_t fs_state; /* validate fs_clean field */ + int32_t fs_old_postblformat; /* format of positional layout tables */ + int32_t fs_old_nrpos; /* number of rotational positions */ + int32_t fs_spare5[2]; /* old fs_postbloff */ + /* old fs_rotbloff */ + int32_t fs_magic; /* magic number */ +}; + +/* Sanity checking. 
*/ +#ifdef CTASSERT +CTASSERT(sizeof(struct fs) == 1376); +#endif + +/* + * Filesystem identification + */ +#define FS_UFS1_MAGIC 0x011954 /* UFS1 fast filesystem magic number */ +#define FS_UFS2_MAGIC 0x19540119 /* UFS2 fast filesystem magic number */ +#define FS_BAD_MAGIC 0x19960408 /* UFS incomplete newfs magic number */ +#define FS_OKAY 0x7c269d38 /* superblock checksum */ +#define FS_42INODEFMT -1 /* 4.2BSD inode format */ +#define FS_44INODEFMT 2 /* 4.4BSD inode format */ + +/* + * Preference for optimization. + */ +#define FS_OPTTIME 0 /* minimize allocation time */ +#define FS_OPTSPACE 1 /* minimize disk fragmentation */ + +/* + * Filesystem flags. + * + * The FS_UNCLEAN flag is set by the kernel when the filesystem was + * mounted with fs_clean set to zero. The FS_DOSOFTDEP flag indicates + * that the filesystem should be managed by the soft updates code. + * Note that the FS_NEEDSFSCK flag is set and cleared only by the + * fsck utility. It is set when background fsck finds an unexpected + * inconsistency which requires a traditional foreground fsck to be + * run. Such inconsistencies should only be found after an uncorrectable + * disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when + * it has successfully cleaned up the filesystem. The kernel uses this + * flag to enforce that inconsistent filesystems be mounted read-only. + * The FS_INDEXDIRS flag when set indicates that the kernel maintains + * on-disk auxiliary indexes (such as B-trees) for speeding directory + * accesses. Kernels that do not support auxiliary indices clear the + * flag to indicate that the indices need to be rebuilt (by fsck) before + * they can be used. + * + * FS_ACLS indicates that POSIX.1e ACLs are administratively enabled + * for the file system, so they should be loaded from extended attributes, + * observed for access control purposes, and be administered by object + * owners. FS_NFS4ACLS indicates that NFSv4 ACLs are administratively + * enabled. This flag is mutually exclusive with FS_ACLS. FS_MULTILABEL + * indicates that the TrustedBSD MAC Framework should attempt to back MAC + * labels into extended attributes on the file system rather than maintain + * a single mount label for all objects. + */ +#define FS_UNCLEAN 0x0001 /* filesystem not clean at mount */ +#define FS_DOSOFTDEP 0x0002 /* filesystem using soft dependencies */ +#define FS_NEEDSFSCK 0x0004 /* filesystem needs sync fsck before mount */ +#define FS_SUJ 0x0008 /* Filesystem using softupdate journal */ +#define FS_ACLS 0x0010 /* file system has POSIX.1e ACLs enabled */ +#define FS_MULTILABEL 0x0020 /* file system is MAC multi-label */ +#define FS_GJOURNAL 0x0040 /* gjournaled file system */ +#define FS_FLAGS_UPDATED 0x0080 /* flags have been moved to new location */ +#define FS_NFS4ACLS 0x0100 /* file system has NFSv4 ACLs enabled */ +#define FS_INDEXDIRS 0x0200 /* kernel supports indexed directories */ +#define FS_TRIM 0x0400 /* issue BIO_DELETE for deleted blocks */ + +/* + * Macros to access bits in the fs_active array. + */ +#define ACTIVECGNUM(fs, cg) ((fs)->fs_active[(cg) / (NBBY * sizeof(int))]) +#define ACTIVECGOFF(cg) (1 << ((cg) % (NBBY * sizeof(int)))) +#define ACTIVESET(fs, cg) do { \ + if ((fs)->fs_active) \ + ACTIVECGNUM((fs), (cg)) |= ACTIVECGOFF((cg)); \ +} while (0) +#define ACTIVECLEAR(fs, cg) do { \ + if ((fs)->fs_active) \ + ACTIVECGNUM((fs), (cg)) &= ~ACTIVECGOFF((cg)); \ +} while (0) + +/* + * The size of a cylinder group is calculated by CGSIZE. 
The maximum size + * is limited by the fact that cylinder groups are at most one block. + * Its size is derived from the size of the maps maintained in the + * cylinder group and the (struct cg) size. + */ +#define CGSIZE(fs) \ + /* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \ + /* old btotoff */ (fs)->fs_old_cpg * sizeof(int32_t) + \ + /* old boff */ (fs)->fs_old_cpg * sizeof(u_int16_t) + \ + /* inode map */ howmany((fs)->fs_ipg, NBBY) + \ + /* block map */ howmany((fs)->fs_fpg, NBBY) +\ + /* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \ + /* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \ + /* cluster map */ howmany(fragstoblks(fs, (fs)->fs_fpg), NBBY))) + +/* + * The minimal number of cylinder groups that should be created. + */ +#define MINCYLGRPS 4 + +/* + * Convert cylinder group to base address of its global summary info. + */ +#define fs_cs(fs, indx) fs_csp[indx] + +/* + * Cylinder group block for a filesystem. + */ +#define CG_MAGIC 0x090255 +struct cg { + int32_t cg_firstfield; /* historic cyl groups linked list */ + int32_t cg_magic; /* magic number */ + int32_t cg_old_time; /* time last written */ + u_int32_t cg_cgx; /* we are the cgx'th cylinder group */ + int16_t cg_old_ncyl; /* number of cyl's this cg */ + int16_t cg_old_niblk; /* number of inode blocks this cg */ + u_int32_t cg_ndblk; /* number of data blocks this cg */ + struct csum cg_cs; /* cylinder summary information */ + u_int32_t cg_rotor; /* position of last used block */ + u_int32_t cg_frotor; /* position of last used frag */ + u_int32_t cg_irotor; /* position of last used inode */ + u_int32_t cg_frsum[MAXFRAG]; /* counts of available frags */ + int32_t cg_old_btotoff; /* (int32) block totals per cylinder */ + int32_t cg_old_boff; /* (u_int16) free block positions */ + u_int32_t cg_iusedoff; /* (u_int8) used inode map */ + u_int32_t cg_freeoff; /* (u_int8) free block map */ + u_int32_t cg_nextfreeoff; /* (u_int8) next available space */ + u_int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters */ + u_int32_t cg_clusteroff; /* (u_int8) free cluster map */ + u_int32_t cg_nclusterblks; /* number of clusters this cg */ + u_int32_t cg_niblk; /* number of inode blocks this cg */ + u_int32_t cg_initediblk; /* last initialized inode */ + u_int32_t cg_unrefs; /* number of unreferenced inodes */ + int32_t cg_sparecon32[2]; /* reserved for future use */ + ufs_time_t cg_time; /* time last written */ + int64_t cg_sparecon64[3]; /* reserved for future use */ + u_int8_t cg_space[1]; /* space for cylinder group maps */ +/* actually longer */ +}; + +/* + * Macros for access to cylinder group array structures + */ +#define cg_chkmagic(cgp) ((cgp)->cg_magic == CG_MAGIC) +#define cg_inosused(cgp) \ + ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff)) +#define cg_blksfree(cgp) \ + ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff)) +#define cg_clustersfree(cgp) \ + ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff)) +#define cg_clustersum(cgp) \ + ((int32_t *)((uintptr_t)(cgp) + (cgp)->cg_clustersumoff)) + +/* + * Turn filesystem block numbers into disk block addresses. + * This maps filesystem blocks to device size blocks. + */ +#define fsbtodb(fs, b) ((daddr_t)(b) << (fs)->fs_fsbtodb) +#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb) + +/* + * Cylinder group macros to locate things in cylinder groups. + * They calc filesystem addresses of cylinder group data structures. 
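+ *
+ * For a UFS2 filesystem cgstart(fs, c) is simply cgbase(fs, c), i.e.
+ * c * fs_fpg fragments from the start of the filesystem; the cylinder
+ * group block, inode blocks and first data block then sit at the
+ * fixed offsets fs_cblkno, fs_iblkno and fs_dblkno within the group,
+ * which is what cgtod(), cgimin() and cgdmin() below compute.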
+ */ +#define cgbase(fs, c) (((ufs2_daddr_t)(fs)->fs_fpg) * (c)) +#define cgdata(fs, c) (cgdmin(fs, c) + (fs)->fs_metaspace) /* data zone */ +#define cgmeta(fs, c) (cgdmin(fs, c)) /* meta data */ +#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */ +#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */ +#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */ +#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */ +#define cgstart(fs, c) \ + ((fs)->fs_magic == FS_UFS2_MAGIC ? cgbase(fs, c) : \ + (cgbase(fs, c) + (fs)->fs_old_cgoffset * ((c) & ~((fs)->fs_old_cgmask)))) + +/* + * Macros for handling inode numbers: + * inode number to filesystem block offset. + * inode number to cylinder group number. + * inode number to filesystem block address. + */ +#define ino_to_cg(fs, x) (((ino_t)(x)) / (fs)->fs_ipg) +#define ino_to_fsba(fs, x) \ + ((ufs2_daddr_t)(cgimin(fs, ino_to_cg(fs, (ino_t)(x))) + \ + (blkstofrags((fs), ((((ino_t)(x)) % (fs)->fs_ipg) / INOPB(fs)))))) +#define ino_to_fsbo(fs, x) (((ino_t)(x)) % INOPB(fs)) + +/* + * Give cylinder group number for a filesystem block. + * Give cylinder group block number for a filesystem block. + */ +#define dtog(fs, d) ((d) / (fs)->fs_fpg) +#define dtogd(fs, d) ((d) % (fs)->fs_fpg) + +/* + * Extract the bits for a block from a map. + * Compute the cylinder and rotational position of a cyl block addr. + */ +#define blkmap(fs, map, loc) \ + (((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag))) + +/* + * The following macros optimize certain frequently calculated + * quantities by using shifts and masks in place of divisions + * modulos and multiplications. + */ +#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \ + ((loc) & (fs)->fs_qbmask) +#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \ + ((loc) & (fs)->fs_qfmask) +#define lfragtosize(fs, frag) /* calculates ((off_t)frag * fs->fs_fsize) */ \ + (((off_t)(frag)) << (fs)->fs_fshift) +#define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \ + (((off_t)(blk)) << (fs)->fs_bshift) +/* Use this only when `blk' is known to be small, e.g., < NDADDR. */ +#define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \ + ((blk) << (fs)->fs_bshift) +#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \ + ((loc) >> (fs)->fs_bshift) +#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \ + ((loc) >> (fs)->fs_fshift) +#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \ + (((size) + (fs)->fs_qbmask) & (fs)->fs_bmask) +#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \ + (((size) + (fs)->fs_qfmask) & (fs)->fs_fmask) +#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \ + ((frags) >> (fs)->fs_fragshift) +#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \ + ((blks) << (fs)->fs_fragshift) +#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \ + ((fsb) & ((fs)->fs_frag - 1)) +#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \ + ((fsb) &~ ((fs)->fs_frag - 1)) + +/* + * Determine the number of available frags given a + * percentage to hold in reserve. + */ +#define freespace(fs, percentreserved) \ + (blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \ + (fs)->fs_cstotal.cs_nffree - \ + (((off_t)((fs)->fs_dsize)) * (percentreserved) / 100)) + +/* + * Determining the size of a file block in the filesystem. 
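+ *
+ * For example (hypothetical sizes): with fs_bsize 32768 and fs_fsize 4096,
+ * a 10000 byte file has blksize(fs, ip, 0) == fragroundup(fs, 10000) ==
+ * 12288; a block wholly contained within the file, or any block at or
+ * beyond NDADDR, is a full fs_bsize.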
+ */ +#define blksize(fs, ip, lbn) \ + (((lbn) >= NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (ip)->i_size)))) +#define sblksize(fs, size, lbn) \ + (((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \ + ? (fs)->fs_bsize \ + : (fragroundup(fs, blkoff(fs, (size))))) + +/* + * Number of indirects in a filesystem block. + */ +#define NINDIR(fs) ((fs)->fs_nindir) + +/* + * Indirect lbns are aligned on NDADDR addresses where single indirects + * are the negated address of the lowest lbn reachable, double indirects + * are this lbn - 1 and triple indirects are this lbn - 2. This yields + * an unusual bit order to determine level. + */ +static inline int +lbn_level(ufs_lbn_t lbn) +{ + if (lbn >= 0) + return 0; + switch (lbn & 0x3) { + case 0: + return (0); + case 1: + break; + case 2: + return (2); + case 3: + return (1); + default: + break; + } + return (-1); +} + +static inline ufs_lbn_t +lbn_offset(struct fs *fs, int level) +{ + ufs_lbn_t res; + + for (res = 1; level > 0; level--) + res *= NINDIR(fs); + return (res); +} + +/* + * Number of inodes in a secondary storage block/fragment. + */ +#define INOPB(fs) ((fs)->fs_inopb) +#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift) + +/* + * Softdep journal record format. + */ + +#define JOP_ADDREF 1 /* Add a reference to an inode. */ +#define JOP_REMREF 2 /* Remove a reference from an inode. */ +#define JOP_NEWBLK 3 /* Allocate a block. */ +#define JOP_FREEBLK 4 /* Free a block or a tree of blocks. */ +#define JOP_MVREF 5 /* Move a reference from one off to another. */ +#define JOP_TRUNC 6 /* Partial truncation record. */ +#define JOP_SYNC 7 /* fsync() complete record. */ + +#define JREC_SIZE 32 /* Record and segment header size. */ + +#define SUJ_MIN (4 * 1024 * 1024) /* Minimum journal size */ +#define SUJ_MAX (32 * 1024 * 1024) /* Maximum journal size */ +#define SUJ_FILE ".sujournal" /* Journal file name */ + +/* + * Size of the segment record header. There is at most one for each disk + * block in the journal. The segment header is followed by an array of + * records. fsck depends on the first element in each record being 'op' + * and the second being 'ino'. Segments may span multiple disk blocks but + * the header is present on each. + */ +struct jsegrec { + uint64_t jsr_seq; /* Our sequence number */ + uint64_t jsr_oldest; /* Oldest valid sequence number */ + uint16_t jsr_cnt; /* Count of valid records */ + uint16_t jsr_blocks; /* Count of device bsize blocks. */ + uint32_t jsr_crc; /* 32bit crc of the valid space */ + ufs_time_t jsr_time; /* timestamp for mount instance */ +}; + +/* + * Reference record. Records a single link count modification. + */ +struct jrefrec { + uint32_t jr_op; + uint32_t jr_ino; + uint32_t jr_parent; + uint16_t jr_nlink; + uint16_t jr_mode; + int64_t jr_diroff; + uint64_t jr_unused; +}; + +/* + * Move record. Records a reference moving within a directory block. The + * nlink is unchanged but we must search both locations. + */ +struct jmvrec { + uint32_t jm_op; + uint32_t jm_ino; + uint32_t jm_parent; + uint16_t jm_unused; + int64_t jm_oldoff; + int64_t jm_newoff; +}; + +/* + * Block record. A set of frags or tree of blocks starting at an indirect are + * freed or a set of frags are allocated. + */ +struct jblkrec { + uint32_t jb_op; + uint32_t jb_ino; + ufs2_daddr_t jb_blkno; + ufs_lbn_t jb_lbn; + uint16_t jb_frags; + uint16_t jb_oldfrags; + uint32_t jb_unused; +}; + +/* + * Truncation record. 
Records a partial truncation so that it may be + * completed at check time. Also used for sync records. + */ +struct jtrncrec { + uint32_t jt_op; + uint32_t jt_ino; + int64_t jt_size; + uint32_t jt_extsize; + uint32_t jt_pad[3]; +}; + +union jrec { + struct jsegrec rec_jsegrec; + struct jrefrec rec_jrefrec; + struct jmvrec rec_jmvrec; + struct jblkrec rec_jblkrec; + struct jtrncrec rec_jtrncrec; +}; + +#ifdef CTASSERT +CTASSERT(sizeof(struct jsegrec) == JREC_SIZE); +CTASSERT(sizeof(struct jrefrec) == JREC_SIZE); +CTASSERT(sizeof(struct jmvrec) == JREC_SIZE); +CTASSERT(sizeof(struct jblkrec) == JREC_SIZE); +CTASSERT(sizeof(struct jtrncrec) == JREC_SIZE); +CTASSERT(sizeof(union jrec) == JREC_SIZE); +#endif + +extern int inside[], around[]; +extern u_char *fragtbl[]; + +/* + * IOCTLs used for filesystem write suspension. + */ +#define UFSSUSPEND _IOW('U', 1, fsid_t) +#define UFSRESUME _IO('U', 2) + +#endif diff --git a/Dump/ufs/ffs/softdep.h b/Dump/ufs/ffs/softdep.h new file mode 100644 index 0000000..8f5222e --- /dev/null +++ b/Dump/ufs/ffs/softdep.h @@ -0,0 +1,1100 @@ +/*- + * Copyright 1998, 2000 Marshall Kirk McKusick. All Rights Reserved. + * + * The soft updates code is derived from the appendix of a University + * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, + * "Soft Updates: A Solution to the Metadata Update Problem in File + * Systems", CSE-TR-254-95, August 1995). + * + * Further information about soft updates can be obtained from: + * + * Marshall Kirk McKusick http://www.mckusick.com/softdep/ + * 1614 Oxford Street mckusick@mckusick.com + * Berkeley, CA 94709-1608 +1-510-843-9542 + * USA + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)softdep.h 9.7 (McKusick) 6/21/00 + * $FreeBSD: releng/11.2/sys/ufs/ffs/softdep.h 320057 2017-06-17 17:10:50Z kib $ + */ + +#include + +/* + * Allocation dependencies are handled with undo/redo on the in-memory + * copy of the data. A particular data dependency is eliminated when + * it is ALLCOMPLETE: that is ATTACHED, DEPCOMPLETE, and COMPLETE. + * + * The ATTACHED flag means that the data is not currently being written + * to disk. + * + * The UNDONE flag means that the data has been rolled back to a safe + * state for writing to the disk. 
When the I/O completes, the data is + * restored to its current form and the state reverts to ATTACHED. + * The data must be locked throughout the rollback, I/O, and roll + * forward so that the rolled back information is never visible to + * user processes. + * + * The COMPLETE flag indicates that the item has been written. For example, + * a dependency that requires that an inode be written will be marked + * COMPLETE after the inode has been written to disk. + * + * The DEPCOMPLETE flag indicates the completion of any other + * dependencies such as the writing of a cylinder group map has been + * completed. A dependency structure may be freed only when both it + * and its dependencies have completed and any rollbacks that are in + * progress have finished as indicated by the set of ALLCOMPLETE flags + * all being set. + * + * The two MKDIR flags indicate additional dependencies that must be done + * when creating a new directory. MKDIR_BODY is cleared when the directory + * data block containing the "." and ".." entries has been written. + * MKDIR_PARENT is cleared when the parent inode with the increased link + * count for ".." has been written. When both MKDIR flags have been + * cleared, the DEPCOMPLETE flag is set to indicate that the directory + * dependencies have been completed. The writing of the directory inode + * itself sets the COMPLETE flag which then allows the directory entry for + * the new directory to be written to disk. The RMDIR flag marks a dirrem + * structure as representing the removal of a directory rather than a + * file. When the removal dependencies are completed, additional work needs + * to be done* (an additional decrement of the associated inode, and a + * decrement of the parent inode). + * + * The DIRCHG flag marks a diradd structure as representing the changing + * of an existing entry rather than the addition of a new one. When + * the update is complete the dirrem associated with the inode for + * the old name must be added to the worklist to do the necessary + * reference count decrement. + * + * The GOINGAWAY flag indicates that the data structure is frozen from + * further change until its dependencies have been completed and its + * resources freed after which it will be discarded. + * + * The IOSTARTED flag prevents multiple calls to the I/O start routine from + * doing multiple rollbacks. + * + * The NEWBLOCK flag marks pagedep structures that have just been allocated, + * so must be claimed by the inode before all dependencies are complete. + * + * The INPROGRESS flag marks worklist structures that are still on the + * worklist, but are being considered for action by some process. + * + * The UFS1FMT flag indicates that the inode being processed is a ufs1 format. + * + * The EXTDATA flag indicates that the allocdirect describes an + * extended-attributes dependency. + * + * The ONWORKLIST flag shows whether the structure is currently linked + * onto a worklist. + * + * The UNLINK* flags track the progress of updating the on-disk linked + * list of active but unlinked inodes. When an inode is first unlinked + * it is marked as UNLINKED. When its on-disk di_freelink has been + * written its UNLINKNEXT flags is set. When its predecessor in the + * list has its di_freelink pointing at us its UNLINKPREV is set. + * When the on-disk list can reach it from the superblock, its + * UNLINKONLIST flag is set. Once all of these flags are set, it + * is safe to let its last name be removed. 
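+ *
+ * As a rough sketch of how these flags are consumed (not a definitive
+ * code path), a dependency structure is typically retired only once the
+ * aggregate test succeeds:
+ *
+ *	if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
+ *		... the inodedep may be freed ...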
+ */ +#define ATTACHED 0x000001 +#define UNDONE 0x000002 +#define COMPLETE 0x000004 +#define DEPCOMPLETE 0x000008 +#define MKDIR_PARENT 0x000010 /* diradd, mkdir, jaddref, jsegdep only */ +#define MKDIR_BODY 0x000020 /* diradd, mkdir, jaddref only */ +#define RMDIR 0x000040 /* dirrem only */ +#define DIRCHG 0x000080 /* diradd, dirrem only */ +#define GOINGAWAY 0x000100 /* indirdep, jremref only */ +#define IOSTARTED 0x000200 /* inodedep, pagedep, bmsafemap only */ +#define DELAYEDFREE 0x000400 /* allocindirect free delayed. */ +#define NEWBLOCK 0x000800 /* pagedep, jaddref only */ +#define INPROGRESS 0x001000 /* dirrem, freeblks, freefrag, freefile only */ +#define UFS1FMT 0x002000 /* indirdep only */ +#define EXTDATA 0x004000 /* allocdirect only */ +#define ONWORKLIST 0x008000 +#define IOWAITING 0x010000 /* Thread is waiting for IO to complete. */ +#define ONDEPLIST 0x020000 /* Structure is on a dependency list. */ +#define UNLINKED 0x040000 /* inodedep has been unlinked. */ +#define UNLINKNEXT 0x080000 /* inodedep has valid di_freelink */ +#define UNLINKPREV 0x100000 /* inodedep is pointed at in the unlink list */ +#define UNLINKONLIST 0x200000 /* inodedep is in the unlinked list on disk */ +#define UNLINKLINKS (UNLINKNEXT | UNLINKPREV) +#define WRITESUCCEEDED 0x400000 /* the disk write completed successfully */ + +#define ALLCOMPLETE (ATTACHED | COMPLETE | DEPCOMPLETE) + +/* + * Values for each of the soft dependency types. + */ +#define D_PAGEDEP 0 +#define D_INODEDEP 1 +#define D_BMSAFEMAP 2 +#define D_NEWBLK 3 +#define D_ALLOCDIRECT 4 +#define D_INDIRDEP 5 +#define D_ALLOCINDIR 6 +#define D_FREEFRAG 7 +#define D_FREEBLKS 8 +#define D_FREEFILE 9 +#define D_DIRADD 10 +#define D_MKDIR 11 +#define D_DIRREM 12 +#define D_NEWDIRBLK 13 +#define D_FREEWORK 14 +#define D_FREEDEP 15 +#define D_JADDREF 16 +#define D_JREMREF 17 +#define D_JMVREF 18 +#define D_JNEWBLK 19 +#define D_JFREEBLK 20 +#define D_JFREEFRAG 21 +#define D_JSEG 22 +#define D_JSEGDEP 23 +#define D_SBDEP 24 +#define D_JTRUNC 25 +#define D_JFSYNC 26 +#define D_SENTINEL 27 +#define D_LAST D_SENTINEL + +/* + * The workitem queue. + * + * It is sometimes useful and/or necessary to clean up certain dependencies + * in the background rather than during execution of an application process + * or interrupt service routine. To realize this, we append dependency + * structures corresponding to such tasks to a "workitem" queue. In a soft + * updates implementation, most pending workitems should not wait for more + * than a couple of seconds, so the filesystem syncer process awakens once + * per second to process the items on the queue. + */ + +/* LIST_HEAD(workhead, worklist); -- declared in buf.h */ + +/* + * Each request can be linked onto a work queue through its worklist structure. + * To avoid the need for a pointer to the structure itself, this structure + * MUST be declared FIRST in each type in which it appears! If more than one + * worklist is needed in the structure, then a wk_data field must be added + * and the macros below changed to use it. 
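+ *
+ * A rough consumer sketch (illustrative only): items are pulled off a
+ * workhead and dispatched on their type, e.g.
+ *
+ *	switch (wk->wk_type) {
+ *	case D_PAGEDEP:
+ *		pagedep = WK_PAGEDEP(wk);
+ *		...
+ *	}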
+ */ +struct worklist { + LIST_ENTRY(worklist) wk_list; /* list of work requests */ + struct mount *wk_mp; /* Mount we live in */ + unsigned int wk_type:8, /* type of request */ + wk_state:24; /* state flags */ +}; +#define WK_DATA(wk) ((void *)(wk)) +#define WK_PAGEDEP(wk) ((struct pagedep *)(wk)) +#define WK_INODEDEP(wk) ((struct inodedep *)(wk)) +#define WK_BMSAFEMAP(wk) ((struct bmsafemap *)(wk)) +#define WK_NEWBLK(wk) ((struct newblk *)(wk)) +#define WK_ALLOCDIRECT(wk) ((struct allocdirect *)(wk)) +#define WK_INDIRDEP(wk) ((struct indirdep *)(wk)) +#define WK_ALLOCINDIR(wk) ((struct allocindir *)(wk)) +#define WK_FREEFRAG(wk) ((struct freefrag *)(wk)) +#define WK_FREEBLKS(wk) ((struct freeblks *)(wk)) +#define WK_FREEWORK(wk) ((struct freework *)(wk)) +#define WK_FREEFILE(wk) ((struct freefile *)(wk)) +#define WK_DIRADD(wk) ((struct diradd *)(wk)) +#define WK_MKDIR(wk) ((struct mkdir *)(wk)) +#define WK_DIRREM(wk) ((struct dirrem *)(wk)) +#define WK_NEWDIRBLK(wk) ((struct newdirblk *)(wk)) +#define WK_JADDREF(wk) ((struct jaddref *)(wk)) +#define WK_JREMREF(wk) ((struct jremref *)(wk)) +#define WK_JMVREF(wk) ((struct jmvref *)(wk)) +#define WK_JSEGDEP(wk) ((struct jsegdep *)(wk)) +#define WK_JSEG(wk) ((struct jseg *)(wk)) +#define WK_JNEWBLK(wk) ((struct jnewblk *)(wk)) +#define WK_JFREEBLK(wk) ((struct jfreeblk *)(wk)) +#define WK_FREEDEP(wk) ((struct freedep *)(wk)) +#define WK_JFREEFRAG(wk) ((struct jfreefrag *)(wk)) +#define WK_SBDEP(wk) ((struct sbdep *)(wk)) +#define WK_JTRUNC(wk) ((struct jtrunc *)(wk)) +#define WK_JFSYNC(wk) ((struct jfsync *)(wk)) + +/* + * Various types of lists + */ +LIST_HEAD(dirremhd, dirrem); +LIST_HEAD(diraddhd, diradd); +LIST_HEAD(newblkhd, newblk); +LIST_HEAD(inodedephd, inodedep); +LIST_HEAD(allocindirhd, allocindir); +LIST_HEAD(allocdirecthd, allocdirect); +TAILQ_HEAD(allocdirectlst, allocdirect); +LIST_HEAD(indirdephd, indirdep); +LIST_HEAD(jaddrefhd, jaddref); +LIST_HEAD(jremrefhd, jremref); +LIST_HEAD(jmvrefhd, jmvref); +LIST_HEAD(jnewblkhd, jnewblk); +LIST_HEAD(jblkdephd, jblkdep); +LIST_HEAD(freeworkhd, freework); +TAILQ_HEAD(freeworklst, freework); +TAILQ_HEAD(jseglst, jseg); +TAILQ_HEAD(inoreflst, inoref); +TAILQ_HEAD(freeblklst, freeblks); + +/* + * The "pagedep" structure tracks the various dependencies related to + * a particular directory page. If a directory page has any dependencies, + * it will have a pagedep linked to its associated buffer. The + * pd_dirremhd list holds the list of dirrem requests which decrement + * inode reference counts. These requests are processed after the + * directory page with the corresponding zero'ed entries has been + * written. The pd_diraddhd list maintains the list of diradd requests + * which cannot be committed until their corresponding inode has been + * written to disk. Because a directory may have many new entries + * being created, several lists are maintained hashed on bits of the + * offset of the entry into the directory page to keep the lists from + * getting too long. Once a new directory entry has been cleared to + * be written, it is moved to the pd_pendinghd list. After the new + * entry has been written to disk it is removed from the pd_pendinghd + * list, any removed operations are done, and the dependency structure + * is freed. 
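+ *
+ * For example, with the DAHASHSZ of 5 defined below, a new entry at
+ * directory offset 512 is linked onto pd_diraddhd[DIRADDHASH(512)],
+ * i.e. bucket (512 >> 2) % 5 == 3.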
+ */ +#define DAHASHSZ 5 +#define DIRADDHASH(offset) (((offset) >> 2) % DAHASHSZ) +struct pagedep { + struct worklist pd_list; /* page buffer */ +# define pd_state pd_list.wk_state /* check for multiple I/O starts */ + LIST_ENTRY(pagedep) pd_hash; /* hashed lookup */ + ino_t pd_ino; /* associated file */ + ufs_lbn_t pd_lbn; /* block within file */ + struct newdirblk *pd_newdirblk; /* associated newdirblk if NEWBLOCK */ + struct dirremhd pd_dirremhd; /* dirrem's waiting for page */ + struct diraddhd pd_diraddhd[DAHASHSZ]; /* diradd dir entry updates */ + struct diraddhd pd_pendinghd; /* directory entries awaiting write */ + struct jmvrefhd pd_jmvrefhd; /* Dependent journal writes. */ +}; + +/* + * The "inodedep" structure tracks the set of dependencies associated + * with an inode. One task that it must manage is delayed operations + * (i.e., work requests that must be held until the inodedep's associated + * inode has been written to disk). Getting an inode from its incore + * state to the disk requires two steps to be taken by the filesystem + * in this order: first the inode must be copied to its disk buffer by + * the VOP_UPDATE operation; second the inode's buffer must be written + * to disk. To ensure that both operations have happened in the required + * order, the inodedep maintains two lists. Delayed operations are + * placed on the id_inowait list. When the VOP_UPDATE is done, all + * operations on the id_inowait list are moved to the id_bufwait list. + * When the buffer is written, the items on the id_bufwait list can be + * safely moved to the work queue to be processed. A second task of the + * inodedep structure is to track the status of block allocation within + * the inode. Each block that is allocated is represented by an + * "allocdirect" structure (see below). It is linked onto the id_newinoupdt + * list until both its contents and its allocation in the cylinder + * group map have been written to disk. Once these dependencies have been + * satisfied, it is removed from the id_newinoupdt list and any followup + * actions such as releasing the previous block or fragment are placed + * on the id_inowait list. When an inode is updated (a VOP_UPDATE is + * done), the "inodedep" structure is linked onto the buffer through + * its worklist. Thus, it will be notified when the buffer is about + * to be written and when it is done. At the update time, all the + * elements on the id_newinoupdt list are moved to the id_inoupdt list + * since those changes are now relevant to the copy of the inode in the + * buffer. Also at update time, the tasks on the id_inowait list are + * moved to the id_bufwait list so that they will be executed when + * the updated inode has been written to disk. When the buffer containing + * the inode is written to disk, any updates listed on the id_inoupdt + * list are rolled back as they are not yet safe. Following the write, + * the changes are once again rolled forward and any actions on the + * id_bufwait list are processed (since those actions are now safe). + * The entries on the id_inoupdt and id_newinoupdt lists must be kept + * sorted by logical block number to speed the calculation of the size + * of the rolled back inode (see explanation in initiate_write_inodeblock). + * When a directory entry is created, it is represented by a diradd. + * The diradd is added to the id_inowait list as it cannot be safely + * written to disk until the inode that it represents is on disk. 
After + * the inode is written, the id_bufwait list is processed and the diradd + * entries are moved to the id_pendinghd list where they remain until + * the directory block containing the name has been written to disk. + * The purpose of keeping the entries on the id_pendinghd list is so that + * the softdep_fsync function can find and push the inode's directory + * name(s) as part of the fsync operation for that file. + */ +struct inodedep { + struct worklist id_list; /* buffer holding inode block */ +# define id_state id_list.wk_state /* inode dependency state */ + LIST_ENTRY(inodedep) id_hash; /* hashed lookup */ + TAILQ_ENTRY(inodedep) id_unlinked; /* Unlinked but ref'd inodes */ + struct fs *id_fs; /* associated filesystem */ + ino_t id_ino; /* dependent inode */ + nlink_t id_nlinkdelta; /* saved effective link count */ + nlink_t id_savednlink; /* Link saved during rollback */ + LIST_ENTRY(inodedep) id_deps; /* bmsafemap's list of inodedep's */ + struct bmsafemap *id_bmsafemap; /* related bmsafemap (if pending) */ + struct diradd *id_mkdiradd; /* diradd for a mkdir. */ + struct inoreflst id_inoreflst; /* Inode reference adjustments. */ + long id_savedextsize; /* ext size saved during rollback */ + off_t id_savedsize; /* file size saved during rollback */ + struct dirremhd id_dirremhd; /* Removals pending. */ + struct workhead id_pendinghd; /* entries awaiting directory write */ + struct workhead id_bufwait; /* operations after inode written */ + struct workhead id_inowait; /* operations waiting inode update */ + struct allocdirectlst id_inoupdt; /* updates before inode written */ + struct allocdirectlst id_newinoupdt; /* updates when inode written */ + struct allocdirectlst id_extupdt; /* extdata updates pre-inode write */ + struct allocdirectlst id_newextupdt; /* extdata updates at ino write */ + struct freeblklst id_freeblklst; /* List of partial truncates. */ + union { + struct ufs1_dinode *idu_savedino1; /* saved ufs1_dinode contents */ + struct ufs2_dinode *idu_savedino2; /* saved ufs2_dinode contents */ + } id_un; +}; +#define id_savedino1 id_un.idu_savedino1 +#define id_savedino2 id_un.idu_savedino2 + +/* + * A "bmsafemap" structure maintains a list of dependency structures + * that depend on the update of a particular cylinder group map. + * It has lists for newblks, allocdirects, allocindirs, and inodedeps. + * It is attached to the buffer of a cylinder group block when any of + * these things are allocated from the cylinder group. It is freed + * after the cylinder group map is written and the state of its + * dependencies are updated with DEPCOMPLETE to indicate that it has + * been processed. + */ +struct bmsafemap { + struct worklist sm_list; /* cylgrp buffer */ +# define sm_state sm_list.wk_state + LIST_ENTRY(bmsafemap) sm_hash; /* Hash links. */ + LIST_ENTRY(bmsafemap) sm_next; /* Mount list. */ + int sm_cg; + struct buf *sm_buf; /* associated buffer */ + struct allocdirecthd sm_allocdirecthd; /* allocdirect deps */ + struct allocdirecthd sm_allocdirectwr; /* writing allocdirect deps */ + struct allocindirhd sm_allocindirhd; /* allocindir deps */ + struct allocindirhd sm_allocindirwr; /* writing allocindir deps */ + struct inodedephd sm_inodedephd; /* inodedep deps */ + struct inodedephd sm_inodedepwr; /* writing inodedep deps */ + struct newblkhd sm_newblkhd; /* newblk deps */ + struct newblkhd sm_newblkwr; /* writing newblk deps */ + struct jaddrefhd sm_jaddrefhd; /* Pending inode allocations. */ + struct jnewblkhd sm_jnewblkhd; /* Pending block allocations. 
*/ + struct workhead sm_freehd; /* Freedep deps. */ + struct workhead sm_freewr; /* Written freedeps. */ +}; + +/* + * A "newblk" structure is attached to a bmsafemap structure when a block + * or fragment is allocated from a cylinder group. Its state is set to + * DEPCOMPLETE when its cylinder group map is written. It is converted to + * an allocdirect or allocindir allocation once the allocator calls the + * appropriate setup function. It will initially be linked onto a bmsafemap + * list. Once converted it can be linked onto the lists described for + * allocdirect or allocindir as described below. + */ +struct newblk { + struct worklist nb_list; /* See comment above. */ +# define nb_state nb_list.wk_state + LIST_ENTRY(newblk) nb_hash; /* Hashed lookup. */ + LIST_ENTRY(newblk) nb_deps; /* Bmsafemap's list of newblks. */ + struct jnewblk *nb_jnewblk; /* New block journal entry. */ + struct bmsafemap *nb_bmsafemap;/* Cylgrp dep (if pending). */ + struct freefrag *nb_freefrag; /* Fragment to be freed (if any). */ + struct indirdephd nb_indirdeps; /* Children indirect blocks. */ + struct workhead nb_newdirblk; /* Dir block to notify when written. */ + struct workhead nb_jwork; /* Journal work pending. */ + ufs2_daddr_t nb_newblkno; /* New value of block pointer. */ +}; + +/* + * An "allocdirect" structure is attached to an "inodedep" when a new block + * or fragment is allocated and pointed to by the inode described by + * "inodedep". The worklist is linked to the buffer that holds the block. + * When the block is first allocated, it is linked to the bmsafemap + * structure associated with the buffer holding the cylinder group map + * from which it was allocated. When the cylinder group map is written + * to disk, ad_state has the DEPCOMPLETE flag set. When the block itself + * is written, the COMPLETE flag is set. Once both the cylinder group map + * and the data itself have been written, it is safe to write the inode + * that claims the block. If there was a previous fragment that had been + * allocated before the file was increased in size, the old fragment may + * be freed once the inode claiming the new block is written to disk. + * This ad_fragfree request is attached to the id_inowait list of the + * associated inodedep (pointed to by ad_inodedep) for processing after + * the inode is written. When a block is allocated to a directory, an + * fsync of a file whose name is within that block must ensure not only + * that the block containing the file name has been written, but also + * that the on-disk inode references that block. When a new directory + * block is created, we allocate a newdirblk structure which is linked + * to the associated allocdirect (on its ad_newdirblk list). When the + * allocdirect has been satisfied, the newdirblk structure is moved to + * the inodedep id_bufwait list of its directory to await the inode + * being written. When the inode is written, the directory entries are + * fully committed and can be deleted from their pagedep->id_pendinghd + * and inodedep->id_pendinghd lists. + */ +struct allocdirect { + struct newblk ad_block; /* Common block logic */ +# define ad_list ad_block.nb_list /* block pointer worklist */ +# define ad_state ad_list.wk_state /* block pointer state */ + TAILQ_ENTRY(allocdirect) ad_next; /* inodedep's list of allocdirect's */ + struct inodedep *ad_inodedep; /* associated inodedep */ + ufs2_daddr_t ad_oldblkno; /* old value of block pointer */ + int ad_offset; /* Pointer offset in parent. 
*/ + long ad_newsize; /* size of new block */ + long ad_oldsize; /* size of old block */ +}; +#define ad_newblkno ad_block.nb_newblkno +#define ad_freefrag ad_block.nb_freefrag +#define ad_newdirblk ad_block.nb_newdirblk + +/* + * A single "indirdep" structure manages all allocation dependencies for + * pointers in an indirect block. The up-to-date state of the indirect + * block is stored in ir_savedata. The set of pointers that may be safely + * written to the disk is stored in ir_safecopy. The state field is used + * only to track whether the buffer is currently being written (in which + * case it is not safe to update ir_safecopy). Ir_deplisthd contains the + * list of allocindir structures, one for each block that needs to be + * written to disk. Once the block and its bitmap allocation have been + * written the safecopy can be updated to reflect the allocation and the + * allocindir structure freed. If ir_state indicates that an I/O on the + * indirect block is in progress when ir_safecopy is to be updated, the + * update is deferred by placing the allocindir on the ir_donehd list. + * When the I/O on the indirect block completes, the entries on the + * ir_donehd list are processed by updating their corresponding ir_safecopy + * pointers and then freeing the allocindir structure. + */ +struct indirdep { + struct worklist ir_list; /* buffer holding indirect block */ +# define ir_state ir_list.wk_state /* indirect block pointer state */ + LIST_ENTRY(indirdep) ir_next; /* alloc{direct,indir} list */ + TAILQ_HEAD(, freework) ir_trunc; /* List of truncations. */ + caddr_t ir_saveddata; /* buffer cache contents */ + struct buf *ir_savebp; /* buffer holding safe copy */ + struct buf *ir_bp; /* buffer holding live copy */ + struct allocindirhd ir_completehd; /* waiting for indirdep complete */ + struct allocindirhd ir_writehd; /* Waiting for the pointer write. */ + struct allocindirhd ir_donehd; /* done waiting to update safecopy */ + struct allocindirhd ir_deplisthd; /* allocindir deps for this block */ + struct freeblks *ir_freeblks; /* Freeblks that frees this indir. */ +}; + +/* + * An "allocindir" structure is attached to an "indirdep" when a new block + * is allocated and pointed to by the indirect block described by the + * "indirdep". The worklist is linked to the buffer that holds the new block. + * When the block is first allocated, it is linked to the bmsafemap + * structure associated with the buffer holding the cylinder group map + * from which it was allocated. When the cylinder group map is written + * to disk, ai_state has the DEPCOMPLETE flag set. When the block itself + * is written, the COMPLETE flag is set. Once both the cylinder group map + * and the data itself have been written, it is safe to write the entry in + * the indirect block that claims the block; the "allocindir" dependency + * can then be freed as it is no longer applicable. + */ +struct allocindir { + struct newblk ai_block; /* Common block area */ +# define ai_state ai_block.nb_list.wk_state /* indirect pointer state */ + LIST_ENTRY(allocindir) ai_next; /* indirdep's list of allocindir's */ + struct indirdep *ai_indirdep; /* address of associated indirdep */ + ufs2_daddr_t ai_oldblkno; /* old value of block pointer */ + ufs_lbn_t ai_lbn; /* Logical block number. */ + int ai_offset; /* Pointer offset in parent. 
*/ +}; +#define ai_newblkno ai_block.nb_newblkno +#define ai_freefrag ai_block.nb_freefrag +#define ai_newdirblk ai_block.nb_newdirblk + +/* + * The allblk union is used to size the newblk structure on allocation so + * that it may be any one of three types. + */ +union allblk { + struct allocindir ab_allocindir; + struct allocdirect ab_allocdirect; + struct newblk ab_newblk; +}; + +/* + * A "freefrag" structure is attached to an "inodedep" when a previously + * allocated fragment is replaced with a larger fragment, rather than extended. + * The "freefrag" structure is constructed and attached when the replacement + * block is first allocated. It is processed after the inode claiming the + * bigger block that replaces it has been written to disk. + */ +struct freefrag { + struct worklist ff_list; /* id_inowait or delayed worklist */ +# define ff_state ff_list.wk_state + struct worklist *ff_jdep; /* Associated journal entry. */ + struct workhead ff_jwork; /* Journal work pending. */ + ufs2_daddr_t ff_blkno; /* fragment physical block number */ + long ff_fragsize; /* size of fragment being deleted */ + ino_t ff_inum; /* owning inode number */ + enum vtype ff_vtype; /* owning inode's file type */ +}; + +/* + * A "freeblks" structure is attached to an "inodedep" when the + * corresponding file's length is reduced to zero. It records all + * the information needed to free the blocks of a file after its + * zero'ed inode has been written to disk. The actual work is done + * by child freework structures which are responsible for individual + * inode pointers while freeblks is responsible for retiring the + * entire operation when it is complete and holding common members. + */ +struct freeblks { + struct worklist fb_list; /* id_inowait or delayed worklist */ +# define fb_state fb_list.wk_state /* inode and dirty block state */ + TAILQ_ENTRY(freeblks) fb_next; /* List of inode truncates. */ + struct jblkdephd fb_jblkdephd; /* Journal entries pending */ + struct workhead fb_freeworkhd; /* Work items pending */ + struct workhead fb_jwork; /* Journal work pending */ + struct vnode *fb_devvp; /* filesystem device vnode */ +#ifdef QUOTA + struct dquot *fb_quota[MAXQUOTAS]; /* quotas to be adjusted */ +#endif + uint64_t fb_modrev; /* Inode revision at start of trunc. */ + off_t fb_len; /* Length we're truncating to. */ + ufs2_daddr_t fb_chkcnt; /* Blocks released. */ + ino_t fb_inum; /* inode owner of blocks */ + enum vtype fb_vtype; /* inode owner's file type */ + uid_t fb_uid; /* uid of previous owner of blocks */ + int fb_ref; /* Children outstanding. */ + int fb_cgwait; /* cg writes outstanding. */ +}; + +/* + * A "freework" structure handles the release of a tree of blocks or a single + * block. Each indirect block in a tree is allocated its own freework + * structure so that the indirect block may be freed only when all of its + * children are freed. In this way we enforce the rule that an allocated + * block must have a valid path to a root that is journaled. Each child + * block acquires a reference and when the ref hits zero the parent ref + * is decremented. If there is no parent the freeblks ref is decremented. + */ +struct freework { + struct worklist fw_list; /* Delayed worklist. */ +# define fw_state fw_list.wk_state + LIST_ENTRY(freework) fw_segs; /* Seg list. */ + TAILQ_ENTRY(freework) fw_next; /* Hash/Trunc list. */ + struct jnewblk *fw_jnewblk; /* Journal entry to cancel. */ + struct freeblks *fw_freeblks; /* Root of operation. */ + struct freework *fw_parent; /* Parent indirect. 
*/ + struct indirdep *fw_indir; /* indirect block. */ + ufs2_daddr_t fw_blkno; /* Our block #. */ + ufs_lbn_t fw_lbn; /* Original lbn before free. */ + uint16_t fw_frags; /* Number of frags. */ + uint16_t fw_ref; /* Number of children out. */ + uint16_t fw_off; /* Current working position. */ + uint16_t fw_start; /* Start of partial truncate. */ +}; + +/* + * A "freedep" structure is allocated to track the completion of a bitmap + * write for a freework. One freedep may cover many freed blocks so long + * as they reside in the same cylinder group. When the cg is written + * the freedep decrements the ref on the freework which may permit it + * to be freed as well. + */ +struct freedep { + struct worklist fd_list; /* Delayed worklist. */ + struct freework *fd_freework; /* Parent freework. */ +}; + +/* + * A "freefile" structure is attached to an inode when its + * link count is reduced to zero. It marks the inode as free in + * the cylinder group map after the zero'ed inode has been written + * to disk and any associated blocks and fragments have been freed. + */ +struct freefile { + struct worklist fx_list; /* id_inowait or delayed worklist */ + mode_t fx_mode; /* mode of inode */ + ino_t fx_oldinum; /* inum of the unlinked file */ + struct vnode *fx_devvp; /* filesystem device vnode */ + struct workhead fx_jwork; /* journal work pending. */ +}; + +/* + * A "diradd" structure is linked to an "inodedep" id_inowait list when a + * new directory entry is allocated that references the inode described + * by "inodedep". When the inode itself is written (either the initial + * allocation for new inodes or with the increased link count for + * existing inodes), the COMPLETE flag is set in da_state. If the entry + * is for a newly allocated inode, the "inodedep" structure is associated + * with a bmsafemap which prevents the inode from being written to disk + * until the cylinder group has been updated. Thus the da_state COMPLETE + * flag cannot be set until the inode bitmap dependency has been removed. + * When creating a new file, it is safe to write the directory entry that + * claims the inode once the referenced inode has been written. Since + * writing the inode clears the bitmap dependencies, the DEPCOMPLETE flag + * in the diradd can be set unconditionally when creating a file. When + * creating a directory, there are two additional dependencies described by + * mkdir structures (see their description below). When these dependencies + * are resolved the DEPCOMPLETE flag is set in the diradd structure. + * If there are multiple links created to the same inode, there will be + * a separate diradd structure created for each link. The diradd is + * linked onto the pg_diraddhd list of the pagedep for the directory + * page that contains the entry. When a directory page is written, + * the pg_diraddhd list is traversed to rollback any entries that are + * not yet ready to be written to disk. If a directory entry is being + * changed (by rename) rather than added, the DIRCHG flag is set and + * the da_previous entry points to the entry that will be "removed" + * once the new entry has been committed. During rollback, entries + * with da_previous are replaced with the previous inode number rather + * than zero. + * + * The overlaying of da_pagedep and da_previous is done to keep the + * structure down. If a da_previous entry is present, the pointer to its + * pagedep is available in the associated dirrem entry. 
If the DIRCHG flag + * is set, the da_previous entry is valid; if not set the da_pagedep entry + * is valid. The DIRCHG flag never changes; it is set when the structure + * is created if appropriate and is never cleared. + */ +struct diradd { + struct worklist da_list; /* id_inowait or id_pendinghd list */ +# define da_state da_list.wk_state /* state of the new directory entry */ + LIST_ENTRY(diradd) da_pdlist; /* pagedep holding directory block */ + doff_t da_offset; /* offset of new dir entry in dir blk */ + ino_t da_newinum; /* inode number for the new dir entry */ + union { + struct dirrem *dau_previous; /* entry being replaced in dir change */ + struct pagedep *dau_pagedep; /* pagedep dependency for addition */ + } da_un; + struct workhead da_jwork; /* Journal work awaiting completion. */ +}; +#define da_previous da_un.dau_previous +#define da_pagedep da_un.dau_pagedep + +/* + * Two "mkdir" structures are needed to track the additional dependencies + * associated with creating a new directory entry. Normally a directory + * addition can be committed as soon as the newly referenced inode has been + * written to disk with its increased link count. When a directory is + * created there are two additional dependencies: writing the directory + * data block containing the "." and ".." entries (MKDIR_BODY) and writing + * the parent inode with the increased link count for ".." (MKDIR_PARENT). + * These additional dependencies are tracked by two mkdir structures that + * reference the associated "diradd" structure. When they have completed, + * they set the DEPCOMPLETE flag on the diradd so that it knows that its + * extra dependencies have been completed. The md_state field is used only + * to identify which type of dependency the mkdir structure is tracking. + * It is not used in the mainline code for any purpose other than consistency + * checking. All the mkdir structures in the system are linked together on + * a list. This list is needed so that a diradd can find its associated + * mkdir structures and deallocate them if it is prematurely freed (as for + * example if a mkdir is immediately followed by a rmdir of the same directory). + * Here, the free of the diradd must traverse the list to find the associated + * mkdir structures that reference it. The deletion would be faster if the + * diradd structure were simply augmented to have two pointers that referenced + * the associated mkdir's. However, this would increase the size of the diradd + * structure to speed a very infrequent operation. + */ +struct mkdir { + struct worklist md_list; /* id_inowait or buffer holding dir */ +# define md_state md_list.wk_state /* type: MKDIR_PARENT or MKDIR_BODY */ + struct diradd *md_diradd; /* associated diradd */ + struct jaddref *md_jaddref; /* dependent jaddref. */ + struct buf *md_buf; /* MKDIR_BODY: buffer holding dir */ + LIST_ENTRY(mkdir) md_mkdirs; /* list of all mkdirs */ +}; + +/* + * A "dirrem" structure describes an operation to decrement the link + * count on an inode. The dirrem structure is attached to the pg_dirremhd + * list of the pagedep for the directory page that contains the entry. + * It is processed after the directory page with the deleted entry has + * been written to disk. 
+ */ +struct dirrem { + struct worklist dm_list; /* delayed worklist */ +# define dm_state dm_list.wk_state /* state of the old directory entry */ + LIST_ENTRY(dirrem) dm_next; /* pagedep's list of dirrem's */ + LIST_ENTRY(dirrem) dm_inonext; /* inodedep's list of dirrem's */ + struct jremrefhd dm_jremrefhd; /* Pending remove reference deps. */ + ino_t dm_oldinum; /* inum of the removed dir entry */ + doff_t dm_offset; /* offset of removed dir entry in blk */ + union { + struct pagedep *dmu_pagedep; /* pagedep dependency for remove */ + ino_t dmu_dirinum; /* parent inode number (for rmdir) */ + } dm_un; + struct workhead dm_jwork; /* Journal work awaiting completion. */ +}; +#define dm_pagedep dm_un.dmu_pagedep +#define dm_dirinum dm_un.dmu_dirinum + +/* + * A "newdirblk" structure tracks the progress of a newly allocated + * directory block from its creation until it is claimed by its on-disk + * inode. When a block is allocated to a directory, an fsync of a file + * whose name is within that block must ensure not only that the block + * containing the file name has been written, but also that the on-disk + * inode references that block. When a new directory block is created, + * we allocate a newdirblk structure which is linked to the associated + * allocdirect (on its ad_newdirblk list). When the allocdirect has been + * satisfied, the newdirblk structure is moved to the inodedep id_bufwait + * list of its directory to await the inode being written. When the inode + * is written, the directory entries are fully committed and can be + * deleted from their pagedep->id_pendinghd and inodedep->id_pendinghd + * lists. Note that we could track directory blocks allocated to indirect + * blocks using a similar scheme with the allocindir structures. Rather + * than adding this level of complexity, we simply write those newly + * allocated indirect blocks synchronously as such allocations are rare. + * In the case of a new directory the . and .. links are tracked with + * a mkdir rather than a pagedep. In this case we track the mkdir + * so it can be released when it is written. A workhead is used + * to simplify canceling a mkdir that is removed by a subsequent dirrem. + */ +struct newdirblk { + struct worklist db_list; /* id_inowait or pg_newdirblk */ +# define db_state db_list.wk_state + struct pagedep *db_pagedep; /* associated pagedep */ + struct workhead db_mkdir; +}; + +/* + * The inoref structure holds the elements common to jaddref and jremref + * so they may easily be queued in-order on the inodedep. + */ +struct inoref { + struct worklist if_list; /* Journal pending or jseg entries. */ +# define if_state if_list.wk_state + TAILQ_ENTRY(inoref) if_deps; /* Links for inodedep. */ + struct jsegdep *if_jsegdep; /* Will track our journal record. */ + off_t if_diroff; /* Directory offset. */ + ino_t if_ino; /* Inode number. */ + ino_t if_parent; /* Parent inode number. */ + nlink_t if_nlink; /* nlink before addition. */ + uint16_t if_mode; /* File mode, needed for IFMT. */ +}; + +/* + * A "jaddref" structure tracks a new reference (link count) on an inode + * and prevents the link count increase and bitmap allocation until a + * journal entry can be written. Once the journal entry is written, + * the inode is put on the pendinghd of the bmsafemap and a diradd or + * mkdir entry is placed on the bufwait list of the inode. The DEPCOMPLETE + * flag is used to indicate that all of the required information for writing + * the journal entry is present. 
MKDIR_BODY and MKDIR_PARENT are used to + * differentiate . and .. links from regular file names. NEWBLOCK indicates + * a bitmap is still pending. If a new reference is canceled by a delete + * prior to writing the journal the jaddref write is canceled and the + * structure persists to prevent any disk-visible changes until it is + * ultimately released when the file is freed or the link is dropped again. + */ +struct jaddref { + struct inoref ja_ref; /* see inoref above. */ +# define ja_list ja_ref.if_list /* Jrnl pending, id_inowait, dm_jwork.*/ +# define ja_state ja_ref.if_list.wk_state + LIST_ENTRY(jaddref) ja_bmdeps; /* Links for bmsafemap. */ + union { + struct diradd *jau_diradd; /* Pending diradd. */ + struct mkdir *jau_mkdir; /* MKDIR_{PARENT,BODY} */ + } ja_un; +}; +#define ja_diradd ja_un.jau_diradd +#define ja_mkdir ja_un.jau_mkdir +#define ja_diroff ja_ref.if_diroff +#define ja_ino ja_ref.if_ino +#define ja_parent ja_ref.if_parent +#define ja_mode ja_ref.if_mode + +/* + * A "jremref" structure tracks a removed reference (unlink) on an + * inode and prevents the directory remove from proceeding until the + * journal entry is written. Once the journal has been written the remove + * may proceed as normal. + */ +struct jremref { + struct inoref jr_ref; /* see inoref above. */ +# define jr_list jr_ref.if_list /* Linked to softdep_journal_pending. */ +# define jr_state jr_ref.if_list.wk_state + LIST_ENTRY(jremref) jr_deps; /* Links for dirrem. */ + struct dirrem *jr_dirrem; /* Back pointer to dirrem. */ +}; + +/* + * A "jmvref" structure tracks a name relocations within the same + * directory block that occur as a result of directory compaction. + * It prevents the updated directory entry from being written to disk + * until the journal entry is written. Once the journal has been + * written the compacted directory may be written to disk. + */ +struct jmvref { + struct worklist jm_list; /* Linked to softdep_journal_pending. */ + LIST_ENTRY(jmvref) jm_deps; /* Jmvref on pagedep. */ + struct pagedep *jm_pagedep; /* Back pointer to pagedep. */ + ino_t jm_parent; /* Containing directory inode number. */ + ino_t jm_ino; /* Inode number of our entry. */ + off_t jm_oldoff; /* Our old offset in directory. */ + off_t jm_newoff; /* Our new offset in directory. */ +}; + +/* + * A "jnewblk" structure tracks a newly allocated block or fragment and + * prevents the direct or indirect block pointer as well as the cg bitmap + * from being written until it is logged. After it is logged the jsegdep + * is attached to the allocdirect or allocindir until the operation is + * completed or reverted. If the operation is reverted prior to the journal + * write the jnewblk structure is maintained to prevent the bitmaps from + * reaching the disk. Ultimately the jnewblk structure will be passed + * to the free routine as the in memory cg is modified back to the free + * state at which time it can be released. It may be held on any of the + * fx_jwork, fw_jwork, fb_jwork, ff_jwork, nb_jwork, or ir_jwork lists. + */ +struct jnewblk { + struct worklist jn_list; /* See lists above. */ +# define jn_state jn_list.wk_state + struct jsegdep *jn_jsegdep; /* Will track our journal record. */ + LIST_ENTRY(jnewblk) jn_deps; /* Jnewblks on sm_jnewblkhd. */ + struct worklist *jn_dep; /* Dependency to ref completed seg. */ + ufs_lbn_t jn_lbn; /* Lbn to which allocated. */ + ufs2_daddr_t jn_blkno; /* Blkno allocated */ + ino_t jn_ino; /* Ino to which allocated. */ + int jn_oldfrags; /* Previous fragments when extended. 
*/ + int jn_frags; /* Number of fragments. */ +}; + +/* + * A "jblkdep" structure tracks jfreeblk and jtrunc records attached to a + * freeblks structure. + */ +struct jblkdep { + struct worklist jb_list; /* For softdep journal pending. */ + struct jsegdep *jb_jsegdep; /* Reference to the jseg. */ + struct freeblks *jb_freeblks; /* Back pointer to freeblks. */ + LIST_ENTRY(jblkdep) jb_deps; /* Dep list on freeblks. */ + +}; + +/* + * A "jfreeblk" structure tracks the journal write for freeing a block + * or tree of blocks. The block pointer must not be cleared in the inode + * or indirect prior to the jfreeblk being written to the journal. + */ +struct jfreeblk { + struct jblkdep jf_dep; /* freeblks linkage. */ + ufs_lbn_t jf_lbn; /* Lbn from which blocks freed. */ + ufs2_daddr_t jf_blkno; /* Blkno being freed. */ + ino_t jf_ino; /* Ino from which blocks freed. */ + int jf_frags; /* Number of frags being freed. */ +}; + +/* + * A "jfreefrag" tracks the freeing of a single block when a fragment is + * extended or an indirect page is replaced. It is not part of a larger + * freeblks operation. + */ +struct jfreefrag { + struct worklist fr_list; /* Linked to softdep_journal_pending. */ +# define fr_state fr_list.wk_state + struct jsegdep *fr_jsegdep; /* Will track our journal record. */ + struct freefrag *fr_freefrag; /* Back pointer to freefrag. */ + ufs_lbn_t fr_lbn; /* Lbn from which frag freed. */ + ufs2_daddr_t fr_blkno; /* Blkno being freed. */ + ino_t fr_ino; /* Ino from which frag freed. */ + int fr_frags; /* Size of frag being freed. */ +}; + +/* + * A "jtrunc" journals the intent to truncate an inode's data or extent area. + */ +struct jtrunc { + struct jblkdep jt_dep; /* freeblks linkage. */ + off_t jt_size; /* Final file size. */ + int jt_extsize; /* Final extent size. */ + ino_t jt_ino; /* Ino being truncated. */ +}; + +/* + * A "jfsync" journals the completion of an fsync which invalidates earlier + * jtrunc records in the journal. + */ +struct jfsync { + struct worklist jfs_list; /* For softdep journal pending. */ + off_t jfs_size; /* Sync file size. */ + int jfs_extsize; /* Sync extent size. */ + ino_t jfs_ino; /* ino being synced. */ +}; + +/* + * A "jsegdep" structure tracks a single reference to a written journal + * segment so the journal space can be reclaimed when all dependencies + * have been written. It can hang off of id_inowait, dm_jwork, da_jwork, + * nb_jwork, ff_jwork, or fb_jwork lists. + */ +struct jsegdep { + struct worklist jd_list; /* See above for lists. */ +# define jd_state jd_list.wk_state + struct jseg *jd_seg; /* Our journal record. */ +}; + +/* + * A "jseg" structure contains all of the journal records written in a + * single disk write. The jaddref and jremref structures are linked into + * js_entries so thay may be completed when the write completes. The + * js_entries also include the write dependency structures: jmvref, + * jnewblk, jfreeblk, jfreefrag, and jtrunc. The js_refs field counts + * the number of entries on the js_entries list. Thus there is a single + * jseg entry to describe each journal write. + */ +struct jseg { + struct worklist js_list; /* b_deps link for journal */ +# define js_state js_list.wk_state + struct workhead js_entries; /* Entries awaiting write */ + LIST_HEAD(, freework) js_indirs;/* List of indirects in this seg. */ + TAILQ_ENTRY(jseg) js_next; /* List of all unfinished segments. 
*/ + struct jblocks *js_jblocks; /* Back pointer to block/seg list */ + struct buf *js_buf; /* Buffer while unwritten */ + uint64_t js_seq; /* Journal record sequence number. */ + uint64_t js_oldseq; /* Oldest valid sequence number. */ + int js_size; /* Size of journal record in bytes. */ + int js_cnt; /* Total items allocated. */ + int js_refs; /* Count of js_entries items. */ +}; + +/* + * A 'sbdep' structure tracks the head of the free inode list and + * superblock writes. This makes sure the superblock is always pointing at + * the first possible unlinked inode for the suj recovery process. If a + * block write completes and we discover a new head is available the buf + * is dirtied and the dep is kept. See the description of the UNLINK* + * flags above for more details. + */ +struct sbdep { + struct worklist sb_list; /* b_dep linkage */ + struct fs *sb_fs; /* Filesystem pointer within buf. */ + struct ufsmount *sb_ump; /* Our mount structure */ +}; + +/* + * Private journaling structures. + */ +struct jblocks { + struct jseglst jb_segs; /* TAILQ of current segments. */ + struct jseg *jb_writeseg; /* Next write to complete. */ + struct jseg *jb_oldestseg; /* Oldest segment with valid entries. */ + struct jextent *jb_extent; /* Extent array. */ + uint64_t jb_nextseq; /* Next sequence number. */ + uint64_t jb_oldestwrseq; /* Oldest written sequence number. */ + uint8_t jb_needseg; /* Need a forced segment. */ + uint8_t jb_suspended; /* Did journal suspend writes? */ + int jb_avail; /* Available extents. */ + int jb_used; /* Last used extent. */ + int jb_head; /* Allocator head. */ + int jb_off; /* Allocator extent offset. */ + int jb_blocks; /* Total disk blocks covered. */ + int jb_free; /* Total disk blocks free. */ + int jb_min; /* Minimum free space. */ + int jb_low; /* Low on space. */ + int jb_age; /* Insertion time of oldest rec. */ +}; + +struct jextent { + ufs2_daddr_t je_daddr; /* Disk block address. */ + int je_blocks; /* Disk block count. */ +}; + +/* + * Hash table declarations. + */ +LIST_HEAD(mkdirlist, mkdir); +LIST_HEAD(pagedep_hashhead, pagedep); +LIST_HEAD(inodedep_hashhead, inodedep); +LIST_HEAD(newblk_hashhead, newblk); +LIST_HEAD(bmsafemap_hashhead, bmsafemap); +TAILQ_HEAD(indir_hashhead, freework); + +/* + * Per-filesystem soft dependency data. + * Allocated at mount and freed at unmount. 
+ */ +struct mount_softdeps { + struct rwlock sd_fslock; /* softdep lock */ + struct workhead sd_workitem_pending; /* softdep work queue */ + struct worklist *sd_worklist_tail; /* Tail pointer for above */ + struct workhead sd_journal_pending; /* journal work queue */ + struct worklist *sd_journal_tail; /* Tail pointer for above */ + struct jblocks *sd_jblocks; /* Journal block information */ + struct inodedeplst sd_unlinked; /* Unlinked inodes */ + struct bmsafemaphd sd_dirtycg; /* Dirty CGs */ + struct mkdirlist sd_mkdirlisthd; /* Track mkdirs */ + struct pagedep_hashhead *sd_pdhash; /* pagedep hash table */ + u_long sd_pdhashsize; /* pagedep hash table size-1 */ + long sd_pdnextclean; /* next hash bucket to clean */ + struct inodedep_hashhead *sd_idhash; /* inodedep hash table */ + u_long sd_idhashsize; /* inodedep hash table size-1 */ + long sd_idnextclean; /* next hash bucket to clean */ + struct newblk_hashhead *sd_newblkhash; /* newblk hash table */ + u_long sd_newblkhashsize; /* newblk hash table size-1 */ + struct bmsafemap_hashhead *sd_bmhash; /* bmsafemap hash table */ + u_long sd_bmhashsize; /* bmsafemap hash table size-1*/ + struct indir_hashhead *sd_indirhash; /* indir hash table */ + u_long sd_indirhashsize; /* indir hash table size-1 */ + int sd_on_journal; /* Items on the journal list */ + int sd_on_worklist; /* Items on the worklist */ + int sd_deps; /* Total dependency count */ + int sd_accdeps; /* accumulated dep count */ + int sd_req; /* Wakeup when deps hits 0. */ + int sd_flags; /* comm with flushing thread */ + int sd_cleanups; /* Calls to cleanup */ + struct thread *sd_flushtd; /* thread handling flushing */ + TAILQ_ENTRY(mount_softdeps) sd_next; /* List of softdep filesystem */ + struct ufsmount *sd_ump; /* our ufsmount structure */ + u_long sd_curdeps[D_LAST + 1]; /* count of current deps */ +}; +/* + * Flags for communicating with the syncer thread. + */ +#define FLUSH_EXIT 0x0001 /* time to exit */ +#define FLUSH_CLEANUP 0x0002 /* need to clear out softdep structures */ +#define FLUSH_STARTING 0x0004 /* flush thread not yet started */ +#define FLUSH_RC_ACTIVE 0x0008 /* a thread is flushing the mount point */ + +/* + * Keep the old names from when these were in the ufsmount structure. 
+ */ +#define softdep_workitem_pending um_softdep->sd_workitem_pending +#define softdep_worklist_tail um_softdep->sd_worklist_tail +#define softdep_journal_pending um_softdep->sd_journal_pending +#define softdep_journal_tail um_softdep->sd_journal_tail +#define softdep_jblocks um_softdep->sd_jblocks +#define softdep_unlinked um_softdep->sd_unlinked +#define softdep_dirtycg um_softdep->sd_dirtycg +#define softdep_mkdirlisthd um_softdep->sd_mkdirlisthd +#define pagedep_hashtbl um_softdep->sd_pdhash +#define pagedep_hash_size um_softdep->sd_pdhashsize +#define pagedep_nextclean um_softdep->sd_pdnextclean +#define inodedep_hashtbl um_softdep->sd_idhash +#define inodedep_hash_size um_softdep->sd_idhashsize +#define inodedep_nextclean um_softdep->sd_idnextclean +#define newblk_hashtbl um_softdep->sd_newblkhash +#define newblk_hash_size um_softdep->sd_newblkhashsize +#define bmsafemap_hashtbl um_softdep->sd_bmhash +#define bmsafemap_hash_size um_softdep->sd_bmhashsize +#define indir_hashtbl um_softdep->sd_indirhash +#define indir_hash_size um_softdep->sd_indirhashsize +#define softdep_on_journal um_softdep->sd_on_journal +#define softdep_on_worklist um_softdep->sd_on_worklist +#define softdep_deps um_softdep->sd_deps +#define softdep_accdeps um_softdep->sd_accdeps +#define softdep_req um_softdep->sd_req +#define softdep_flags um_softdep->sd_flags +#define softdep_flushtd um_softdep->sd_flushtd +#define softdep_curdeps um_softdep->sd_curdeps diff --git a/Dump/ufs/ufs/README.acls b/Dump/ufs/ufs/README.acls new file mode 100644 index 0000000..0e8a9d5 --- /dev/null +++ b/Dump/ufs/ufs/README.acls @@ -0,0 +1,79 @@ +$FreeBSD: releng/11.2/sys/ufs/ufs/README.acls 105456 2002-10-19 16:09:16Z rwatson $ + + UFS Access Control Lists Copyright + +The UFS Access Control Lists implementation is copyright Robert Watson, +and is made available under a Berkeley-style license. + + About UFS Access Control Lists (ACLs) + +Access control lists allow the association of fine-grained discretionary +access control information with files and directories, extending the +base UNIX permission model in a (mostly) compatible way. This +implementation largely follows the POSIX.1e model, and relies on the +availability of extended attributes to store extended components of +the ACL, while maintaining the base permission information in the inode. + + Using UFS Access Control Lists (ACLs) + +Support for UFS access control lists may be enabled by adding: + + options UFS_ACL + +to your kernel configuration. As ACLs rely on the availability of extended +attributes, your file systems must have support for extended attributes. +For UFS2, this is supported natively, so no further configuration is +necessary. For UFS1, you must also enable the optional extended attributes +support documented in README.extattr. A summary of the instructions +and ACL-specific information follows. + +To enable support for ACLs on a file system, the 'acls' mount flag +must be set for the file system. This may be set using the tunefs +'-a' flag: + + tunefs -a enable /dev/md0a + +Or by using the mount-time flag: + + mount -o acls /dev/md0a /mnt + +The flag may also be set in /etc/fstab. Note that mounting a file +system previously configured for ACLs without ACL-support will result +in incorrect application of discretionary protections. Likewise, +mounting an ACL-enabled file system without kernel support for ACLs +will result in incorrect application of discretionary protections. 
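A quick run-time check can catch this kind of misconfiguration. The following userland sketch is illustrative only (it assumes the standard FreeBSD acl(3) routines and the _PC_ACL_EXTENDED pathconf(2) name; the path argument is a placeholder): it verifies that POSIX.1e ACLs are actually in effect on a path before relying on them, and prints the access ACL if they are.

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Illustrative sketch only: check whether POSIX.1e ACLs are in effect
 * on a path and, if so, print its access ACL in text form.  The path
 * argument is a placeholder.
 */
int
main(int argc, char *argv[])
{
	const char *path = (argc > 1) ? argv[1] : "/mnt";
	acl_t acl;
	char *text;

	/* _PC_ACL_EXTENDED reports whether POSIX.1e ACLs are honoured here. */
	if (pathconf(path, _PC_ACL_EXTENDED) <= 0) {
		printf("%s: POSIX.1e ACLs not in effect\n", path);
		return (1);
	}
	if ((acl = acl_get_file(path, ACL_TYPE_ACCESS)) == NULL) {
		perror("acl_get_file");
		return (1);
	}
	if ((text = acl_to_text(acl, NULL)) != NULL) {
		printf("%s", text);
		acl_free(text);
	}
	acl_free(acl);
	return (0);
}

A non-positive pathconf() result here suggests that neither the superblock flag nor the mount-time flag described above is in effect.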
If +the kernel is not configured for ACL support, a warning will be +printed by the kernel at mount-time. For reliability purposes, it +is recommended that the superblock flag be used instead of the +mount-time flag, as this will avoid re-mount issues with the root file +system. For reliability and performance reasons, the use of ACLs on +UFS1 is discouraged; UFS2 extended attributes provide a more reliable +storage mechanism for ACLs. + +Currently, support for ACLs on UFS1 requires the use of UFS1 EAs, which may +be enabled by adding: + + options UFS_EXTATTR + +to your kernel configuration file and rebuilding. Because of filesystem +mount atomicity requirements, it is also recommended that: + + options UFS_EXTATTR_AUTOSTART + +be added to the kernel so as to support the atomic enabling of the +required extended attributes with the filesystem mount operation. To +enable ACLs, two extended attributes must be available in the +EXTATTR_NAMESPACE_SYSTEM namespace: "posix1e.acl_access", which holds +the access ACL, and "posix1e.acl_default" which holds the default ACL +for directories. If you're using UFS1 Extended Attributes, the following +commands may be used to create the necessary EA backing files for +ACLs in the filesystem root of each filesystem. In these examples, +the root filesystem is used; see README.extattr for more details. + + mkdir -p /.attribute/system + cd /.attribute/system + extattrctl initattr -p / 388 posix1e.acl_access + extattrctl initattr -p / 388 posix1e.acl_default + +On the next mount of the root filesystem, the attributes will be +automatically started, and ACLs will be enabled. diff --git a/Dump/ufs/ufs/README.extattr b/Dump/ufs/ufs/README.extattr new file mode 100644 index 0000000..eea7628 --- /dev/null +++ b/Dump/ufs/ufs/README.extattr @@ -0,0 +1,91 @@ +$FreeBSD: releng/11.2/sys/ufs/ufs/README.extattr 105417 2002-10-18 21:11:36Z rwatson $ + + UFS Extended Attributes Copyright + +The UFS Extended Attributes implementation is copyright Robert Watson, and +is made available under a Berkeley-style license. + + About UFS Extended Attributes + +Extended attributes allow the association of additional arbitrary +meta-data with files and directories. Extended attributes are defined in +the form name=value, where name is a nul-terminated string in the style +of a filename, and value is a binary blob of zero or more bytes. The UFS +extended attribute service layers support for extended attributes onto a +backing file, in the style of the quota implementation, meaning that it +requires no underlying format changes in the filesystem. This design +choice exchanges simplicity, usability and easy deployment for +performance. When defined, extended attribute names exist in a series of +disjoint namespaces: currently, two namespaces are defined: +EXTATTR_NAMESPACE_SYSTEM and EXTATTR_NAMESPACE_USER. The primary +distinction lies in the protection model: USER EAs are protected using the +normal inode protections, whereas SYSTEM EAs require privilege to access +or modify. + + Using UFS Extended Attributes + +Support for UFS extended attributes is natively available in UFS2, and +requires no special configuration. For reliability, administrative, +and performance reasons, if you plan to use extended attributes, it +is recommended that you use UFS2 in preference to UFS1. + +Support for UFS extended attributes may be enabled for UFS1 by adding: + + options UFS_EXTATTR + +to your kernel configuration file.
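Whichever way support is provided (natively on UFS2, or via UFS_EXTATTR on UFS1), userland programs read and write EAs through the extattr(2) family of system calls. The sketch below is illustrative only; the file path and the attribute name "example.note" are made-up placeholders:

#include <sys/types.h>
#include <sys/extattr.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative sketch only: attach a small user-namespace attribute to
 * a file and read it back.  The path and the attribute name
 * "example.note" are made-up placeholders.
 */
int
main(int argc, char *argv[])
{
	const char *path = (argc > 1) ? argv[1] : "somefile";
	const char *note = "hello";
	char buf[64];
	ssize_t len;

	if (extattr_set_file(path, EXTATTR_NAMESPACE_USER, "example.note",
	    note, strlen(note)) < 0) {
		perror("extattr_set_file");
		return (1);
	}
	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "example.note",
	    buf, sizeof(buf) - 1);
	if (len < 0) {
		perror("extattr_get_file");
		return (1);
	}
	buf[len] = '\0';
	printf("example.note = %s\n", buf);
	return (0);
}

If extended attribute support has not been started on the target filesystem, these calls simply fail with an error, which also makes them a convenient run-time probe.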
This allows UFS-based filesystems to +support extended attributes, but requires manual administration of EAs +using the extattrctl tool, including the starting of EA support for each +filesystem, and the enabling of individual attributes for the file +system. The extattrctl utility may be used to initialize backing files +before first use, to start and stop EA service on a filesystem, and to +enable and disable named attributes. The command lines for extattrctl +take the following forms: + + extattrctl start [path] + extattrctl stop [path] + extattrctl initattr [-f] [-p path] [attrsize] [attrfile] + extattrctl enable [path] [attrnamespace] [attrname] [attrfile] + extattrctl disable [path] [attrnamespace] [attrname] + +In each case, [path] is used to indicate the mounted filesystem on which +to perform the operation. [attrnamespace] refers to the namespace in +which the attribute is being manipulated, and may be "system" or "user". +The [attrname] is the attribute name to use for the operation. The +[attrfile] argument specifies the attribute backing file to use. When +using the "initattr" function to initialize a backing file, the maximum +size of attribute data must be defined in bytes using the [attrsize] +field. Optionally, the [-p path] argument may be used to indicate to +extattrctl that it should pre-allocate space for EA data, rather than +creating a sparse backing file. This prevents attribute operations from +failing in low disk-space conditions (which can be important when EAs are +used for security purposes), but pre-allocation will consume space +proportional to the product of the defined maximum attribute size and +number of attributes on the specified filesystem. + +Manual configuration increases administrative overhead, but also +introduces the possibility of race conditions during filesystem mount, if +EAs are used to support other features, as starting the EAs manually is +not atomic with the mount operation. To address this problem, an +additional kernel option may be defined to auto-start EAs on a UFS file +system based on special directories at mount-time: + + options UFS_EXTATTR_AUTOSTART + +If this option is defined, UFS will search for a ".attribute" +sub-directory of the filesystem root during the mount operation. If it +is found, EA support will be started for the filesystem. UFS will then +search for "system" and "user" sub-directories of the ".attribute" +directory for any potential backing files, and enable an EA for each valid +backing file with the name of the backing file as the attribute name. +For example, by creating the following tree, the two EAs, +posix1e.acl_access and posix1e.acl_default will be enabled in the system +namespace of the root filesystem, reserving space for attribute data: + + mkdir -p /.attribute/system + cd /.attribute/system + extattrctl initattr -p / 388 posix1e.acl_access + extattrctl initattr -p / 388 posix1e.acl_default + +On the next mount of the root filesystem, the attributes will be +automatically started. diff --git a/Dump/ufs/ufs/acl.h b/Dump/ufs/ufs/acl.h new file mode 100644 index 0000000..63b32dd --- /dev/null +++ b/Dump/ufs/ufs/acl.h @@ -0,0 +1,51 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: releng/11.2/sys/ufs/ufs/acl.h 200796 2009-12-21 19:39:10Z trasz $ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#ifndef _UFS_UFS_ACL_H_ +#define _UFS_UFS_ACL_H_ + +#ifdef _KERNEL + +int ufs_getacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td); +int ufs_setacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td); +void ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl); +void ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip); + +int ufs_getacl(struct vop_getacl_args *); +int ufs_setacl(struct vop_setacl_args *); +int ufs_aclcheck(struct vop_aclcheck_args *); + +#endif /* !_KERNEL */ + +#endif /* !_UFS_UFS_ACL_H_ */ diff --git a/Dump/ufs/ufs/dinode.h b/Dump/ufs/ufs/dinode.h new file mode 100644 index 0000000..386ac8c --- /dev/null +++ b/Dump/ufs/ufs/dinode.h @@ -0,0 +1,189 @@ +/*- + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc. under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dinode.h 8.3 (Berkeley) 1/21/94 + * $FreeBSD: releng/11.2/sys/ufs/ufs/dinode.h 257029 2013-10-24 00:33:29Z pfg $ + */ + +#ifndef _UFS_UFS_DINODE_H_ +#define _UFS_UFS_DINODE_H_ + +/* + * The root inode is the root of the filesystem. Inode 0 can't be used for + * normal purposes and historically bad blocks were linked to inode 1, thus + * the root inode is 2. (Inode 1 is no longer used for this purpose, however + * numerous dump tapes make this assumption, so we are stuck with it). + */ +#define ROOTINO ((ino_t)2) + +/* + * The Whiteout inode# is a dummy non-zero inode number which will + * never be allocated to a real file. It is used as a place holder + * in the directory entry which has been tagged as a DT_WHT entry. + * See the comments about ROOTINO above. + */ +#define WINO ((ino_t)1) + +/* + * The size of physical and logical block numbers and time fields in UFS. + */ +typedef int32_t ufs1_daddr_t; +typedef int64_t ufs2_daddr_t; +typedef int64_t ufs_lbn_t; +typedef int64_t ufs_time_t; + +/* File permissions. */ +#define IEXEC 0000100 /* Executable. */ +#define IWRITE 0000200 /* Writeable. 
*/ +#define IREAD 0000400 /* Readable. */ +#define ISVTX 0001000 /* Sticky bit. */ +#define ISGID 0002000 /* Set-gid. */ +#define ISUID 0004000 /* Set-uid. */ + +/* File types. */ +#define IFMT 0170000 /* Mask of file type. */ +#define IFIFO 0010000 /* Named pipe (fifo). */ +#define IFCHR 0020000 /* Character device. */ +#define IFDIR 0040000 /* Directory file. */ +#define IFBLK 0060000 /* Block device. */ +#define IFREG 0100000 /* Regular file. */ +#define IFLNK 0120000 /* Symbolic link. */ +#define IFSOCK 0140000 /* UNIX domain socket. */ +#define IFWHT 0160000 /* Whiteout. */ + +/* + * A dinode contains all the meta-data associated with a UFS2 file. + * This structure defines the on-disk format of a dinode. Since + * this structure describes an on-disk structure, all its fields + * are defined by types with precise widths. + */ + +#define NXADDR 2 /* External addresses in inode. */ +#define NDADDR 12 /* Direct addresses in inode. */ +#define NIADDR 3 /* Indirect addresses in inode. */ + +struct ufs2_dinode { + u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ + int16_t di_nlink; /* 2: File link count. */ + u_int32_t di_uid; /* 4: File owner. */ + u_int32_t di_gid; /* 8: File group. */ + u_int32_t di_blksize; /* 12: Inode blocksize. */ + u_int64_t di_size; /* 16: File byte count. */ + u_int64_t di_blocks; /* 24: Blocks actually held. */ + ufs_time_t di_atime; /* 32: Last access time. */ + ufs_time_t di_mtime; /* 40: Last modified time. */ + ufs_time_t di_ctime; /* 48: Last inode change time. */ + ufs_time_t di_birthtime; /* 56: Inode creation time. */ + int32_t di_mtimensec; /* 64: Last modified time. */ + int32_t di_atimensec; /* 68: Last access time. */ + int32_t di_ctimensec; /* 72: Last inode change time. */ + int32_t di_birthnsec; /* 76: Inode creation time. */ + u_int32_t di_gen; /* 80: Generation number. */ + u_int32_t di_kernflags; /* 84: Kernel flags. */ + u_int32_t di_flags; /* 88: Status flags (chflags). */ + u_int32_t di_extsize; /* 92: External attributes size. */ + ufs2_daddr_t di_extb[NXADDR];/* 96: External attributes block. */ + ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ + ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ + u_int64_t di_modrev; /* 232: i_modrev for NFSv4 */ + uint32_t di_freelink; /* 240: SUJ: Next unlinked inode. */ + uint32_t di_spare[3]; /* 244: Reserved; currently unused */ +}; + +/* + * The di_db fields may be overlaid with other information for + * file types that do not have associated disk storage. Block + * and character devices overlay the first data block with their + * dev_t value. Short symbolic links place their path in the + * di_db area. + */ +#define di_rdev di_db[0] + +/* + * A UFS1 dinode contains all the meta-data associated with a UFS1 file. + * This structure defines the on-disk format of a UFS1 dinode. Since + * this structure describes an on-disk structure, all its fields + * are defined by types with precise widths. + */ +struct ufs1_dinode { + u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ + int16_t di_nlink; /* 2: File link count. */ + uint32_t di_freelink; /* 4: SUJ: Next unlinked inode. */ + u_int64_t di_size; /* 8: File byte count. */ + int32_t di_atime; /* 16: Last access time. */ + int32_t di_atimensec; /* 20: Last access time. */ + int32_t di_mtime; /* 24: Last modified time. */ + int32_t di_mtimensec; /* 28: Last modified time. */ + int32_t di_ctime; /* 32: Last inode change time. */ + int32_t di_ctimensec; /* 36: Last inode change time. 
*/ + ufs1_daddr_t di_db[NDADDR]; /* 40: Direct disk blocks. */ + ufs1_daddr_t di_ib[NIADDR]; /* 88: Indirect disk blocks. */ + u_int32_t di_flags; /* 100: Status flags (chflags). */ + u_int32_t di_blocks; /* 104: Blocks actually held. */ + u_int32_t di_gen; /* 108: Generation number. */ + u_int32_t di_uid; /* 112: File owner. */ + u_int32_t di_gid; /* 116: File group. */ + u_int64_t di_modrev; /* 120: i_modrev for NFSv4 */ +}; + +#endif /* _UFS_UFS_DINODE_H_ */ diff --git a/Dump/ufs/ufs/dir.h b/Dump/ufs/ufs/dir.h new file mode 100644 index 0000000..77aa5c7 --- /dev/null +++ b/Dump/ufs/ufs/dir.h @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dir.h 8.2 (Berkeley) 1/21/94 + * $FreeBSD: releng/11.2/sys/ufs/ufs/dir.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_UFS_DIR_H_ +#define _UFS_UFS_DIR_H_ + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. So, we define the type doff_t as a 32-bit + * quantity to keep down the cost of doing lookup on a 32-bit machine. + */ +#define doff_t int32_t +#define MAXDIRSIZE (0x7fffffff) + +/* + * A directory consists of some number of blocks of DIRBLKSIZ + * bytes, where DIRBLKSIZ is chosen such that it can be transferred + * to disk in a single atomic operation (e.g. 512 bytes on most machines). + * + * Each DIRBLKSIZ byte block contains some number of directory entry + * structures, which are of variable length. Each directory entry has + * a struct direct at the front of it, containing its inode number, + * the length of the entry, and the length of the name contained in + * the entry. 
These are followed by the name padded to a 4 byte boundary + * with null bytes. All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + * + * The macro DIRSIZ(fmt, dp) gives the amount of space required to represent + * a directory entry. Free space in a directory is represented by + * entries which have dp->d_reclen > DIRSIZ(fmt, dp). All DIRBLKSIZ bytes + * in a directory block are claimed by the directory entries. This + * usually results in the last entry in a directory having a large + * dp->d_reclen. When entries are deleted from a directory, the + * space is returned to the previous entry in the same directory + * block by increasing its dp->d_reclen. If the first entry of + * a directory block is free, then its dp->d_ino is set to 0. + * Entries other than the first in a directory do not normally have + * dp->d_ino set to 0. + */ +#define DIRBLKSIZ DEV_BSIZE +#define MAXNAMLEN 255 + +struct direct { + u_int32_t d_ino; /* inode number of entry */ + u_int16_t d_reclen; /* length of this record */ + u_int8_t d_type; /* file type, see below */ + u_int8_t d_namlen; /* length of string in d_name */ + char d_name[MAXNAMLEN + 1];/* name with length <= MAXNAMLEN */ +}; + +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* + * Convert between stat structure types and directory types. + */ +#define IFTODT(mode) (((mode) & 0170000) >> 12) +#define DTTOIF(dirtype) ((dirtype) << 12) + +/* + * The DIRSIZ macro gives the minimum record length which will hold + * the directory entry. This requires the amount of space in struct direct + * without the d_name field, plus enough space for the name with a terminating + * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. + * + * + */ +#define DIRECTSIZ(namlen) \ + ((__offsetof(struct direct, d_name) + \ + ((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3) +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define DIRSIZ(oldfmt, dp) \ + ((oldfmt) ? DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen)) +#else +#define DIRSIZ(oldfmt, dp) \ + DIRECTSIZ((dp)->d_namlen) +#endif +#define OLDDIRFMT 1 +#define NEWDIRFMT 0 + +/* + * Template for manipulating directories. Should use struct direct's, + * but the name field is MAXNAMLEN - 1, and this just won't do. + */ +struct dirtemplate { + u_int32_t dot_ino; + int16_t dot_reclen; + u_int8_t dot_type; + u_int8_t dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_int32_t dotdot_ino; + int16_t dotdot_reclen; + u_int8_t dotdot_type; + u_int8_t dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; + +/* + * This is the old format of directories, sanz type element. + */ +struct odirtemplate { + u_int32_t dot_ino; + int16_t dot_reclen; + u_int16_t dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_int32_t dotdot_ino; + int16_t dotdot_reclen; + u_int16_t dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; +#endif /* !_DIR_H_ */ diff --git a/Dump/ufs/ufs/dirhash.h b/Dump/ufs/ufs/dirhash.h new file mode 100644 index 0000000..f58e2df --- /dev/null +++ b/Dump/ufs/ufs/dirhash.h @@ -0,0 +1,133 @@ +/*- + * Copyright (c) 2001 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: releng/11.2/sys/ufs/ufs/dirhash.h 298804 2016-04-29 20:43:51Z pfg $ + */ + +#ifndef _UFS_UFS_DIRHASH_H_ +#define _UFS_UFS_DIRHASH_H_ + +#include +#include + +/* + * For fast operations on large directories, we maintain a hash + * that maps the file name to the offset of the directory entry within + * the directory file. + * + * The hashing uses a dumb spillover to the next free slot on + * collisions, so we must keep the utilisation low to avoid + * long linear searches. Deleted entries that are not the last + * in a chain must be marked DIRHASH_DEL. + * + * We also maintain information about free space in each block + * to speed up creations. + */ +#define DIRHASH_EMPTY (-1) /* entry unused */ +#define DIRHASH_DEL (-2) /* deleted entry; may be part of chain */ + +#define DIRALIGN 4 +#define DH_NFSTATS (DIRECTSIZ(MAXNAMLEN + 1) / DIRALIGN) + /* max DIRALIGN words in a directory entry */ + +/* + * Dirhash uses a score mechanism to achieve a hybrid between a + * least-recently-used and a least-often-used algorithm for entry + * recycling. The score is incremented when a directory is used, and + * decremented when the directory is a candidate for recycling. When + * the score reaches zero, the hash is recycled. Hashes are linked + * together on a TAILQ list, and hashes with higher scores filter + * towards the tail (most recently used) end of the list. + * + * New hash entries are given an initial score of DH_SCOREINIT and are + * placed at the most-recently-used end of the list. This helps a lot + * in the worst-case scenario where every directory access is + * to a directory that is not hashed (i.e. the working set of hash + * candidates is much larger than the configured memory limit). In this + * case it limits the number of hash builds to 1/DH_SCOREINIT of the + * number of accesses. + */ +#define DH_SCOREINIT 8 /* initial dh_score when dirhash built */ +#define DH_SCOREMAX 64 /* max dh_score value */ + +/* + * The main hash table has 2 levels. It is an array of pointers to + * blocks of DH_NBLKOFF offsets.
+ */ +#define DH_BLKOFFSHIFT 8 +#define DH_NBLKOFF (1 << DH_BLKOFFSHIFT) +#define DH_BLKOFFMASK (DH_NBLKOFF - 1) + +#define DH_ENTRY(dh, slot) \ + ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK]) + +struct dirhash { + struct sx dh_lock; /* protects all fields except list & score */ + int dh_refcount; + + doff_t **dh_hash; /* the hash array (2-level) */ + int dh_narrays; /* number of entries in dh_hash */ + int dh_hlen; /* total slots in the 2-level hash array */ + int dh_hused; /* entries in use */ + int dh_memreq; /* Memory used. */ + + /* Free space statistics. XXX assumes DIRBLKSIZ is 512. */ + u_int8_t *dh_blkfree; /* free DIRALIGN words in each dir block */ + int dh_nblk; /* size of dh_blkfree array */ + int dh_dirblks; /* number of DIRBLKSIZ blocks in dir */ + int dh_firstfree[DH_NFSTATS + 1]; /* first blk with N words free */ + + doff_t dh_seqoff; /* sequential access optimisation offset */ + + int dh_score; /* access count for this dirhash */ + + int dh_onlist; /* true if on the ufsdirhash_list chain */ + + time_t dh_lastused; /* time the dirhash was last read or written*/ + + /* Protected by ufsdirhash_mtx. */ + TAILQ_ENTRY(dirhash) dh_list; /* chain of all dirhashes */ +}; + + +/* + * Dirhash functions. + */ +void ufsdirhash_init(void); +void ufsdirhash_uninit(void); +int ufsdirhash_build(struct inode *); +doff_t ufsdirhash_findfree(struct inode *, int, int *); +doff_t ufsdirhash_enduseful(struct inode *); +int ufsdirhash_lookup(struct inode *, char *, int, doff_t *, struct buf **, + doff_t *); +void ufsdirhash_newblk(struct inode *, doff_t); +void ufsdirhash_add(struct inode *, struct direct *, doff_t); +void ufsdirhash_remove(struct inode *, struct direct *, doff_t); +void ufsdirhash_move(struct inode *, struct direct *, doff_t, doff_t); +void ufsdirhash_dirtrunc(struct inode *, doff_t); +void ufsdirhash_free(struct inode *); + +void ufsdirhash_checkblock(struct inode *, char *, doff_t); + +#endif /* !_UFS_UFS_DIRHASH_H_ */ diff --git a/Dump/ufs/ufs/extattr.h b/Dump/ufs/ufs/extattr.h new file mode 100644 index 0000000..61a6939 --- /dev/null +++ b/Dump/ufs/ufs/extattr.h @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: releng/11.2/sys/ufs/ufs/extattr.h 306553 2016-10-01 09:19:43Z kib $ + */ +/* + * Developed by the TrustedBSD Project. + * Support for extended filesystem attributes. + */ + +#ifndef _UFS_UFS_EXTATTR_H_ +#define _UFS_UFS_EXTATTR_H_ + +#define UFS_EXTATTR_MAGIC 0x00b5d5ec +#define UFS_EXTATTR_VERSION 0x00000003 +#define UFS_EXTATTR_FSROOTSUBDIR ".attribute" +#define UFS_EXTATTR_SUBDIR_SYSTEM "system" +#define UFS_EXTATTR_SUBDIR_USER "user" +#define UFS_EXTATTR_MAXEXTATTRNAME 65 /* including null */ + +#define UFS_EXTATTR_ATTR_FLAG_INUSE 0x00000001 /* attr has been set */ +#define UFS_EXTATTR_PERM_KERNEL 0x00000000 +#define UFS_EXTATTR_PERM_ROOT 0x00000001 +#define UFS_EXTATTR_PERM_OWNER 0x00000002 +#define UFS_EXTATTR_PERM_ANYONE 0x00000003 + +#define UFS_EXTATTR_UEPM_INITIALIZED 0x00000001 +#define UFS_EXTATTR_UEPM_STARTED 0x00000002 + +#define UFS_EXTATTR_CMD_START 0x00000001 +#define UFS_EXTATTR_CMD_STOP 0x00000002 +#define UFS_EXTATTR_CMD_ENABLE 0x00000003 +#define UFS_EXTATTR_CMD_DISABLE 0x00000004 + +struct ufs_extattr_fileheader { + u_int uef_magic; /* magic number for sanity checking */ + u_int uef_version; /* version of attribute file */ + u_int uef_size; /* size of attributes, w/o header */ +}; + +struct ufs_extattr_header { + u_int ueh_flags; /* flags for attribute */ + u_int ueh_len; /* local defined length; <= uef_size */ + u_int32_t ueh_i_gen; /* generation number for sanity */ + /* data follows the header */ +}; + +/* + * This structure defines the required fields of an extended-attribute header. + */ +struct extattr { + int32_t ea_length; /* length of this attribute */ + int8_t ea_namespace; /* name space of this attribute */ + int8_t ea_contentpadlen; /* bytes of padding at end of attribute */ + int8_t ea_namelength; /* length of attribute name */ + char ea_name[1]; /* null-terminated attribute name */ + /* extended attribute content follows */ +}; + +/* + * These macros are used to access and manipulate an extended attribute: + * + * EXTATTR_NEXT(eap) returns a pointer to the next extended attribute + * following eap. + * EXTATTR_CONTENT(eap) returns a pointer to the extended attribute + * content referenced by eap. + * EXTATTR_CONTENT_SIZE(eap) returns the size of the extended attribute + * content referenced by eap. + * EXTATTR_SET_LENGTHS(eap, contentsize) called after initializing the + * attribute name to calculate and set the ea_length, ea_namelength, + * and ea_contentpadlen fields of the extended attribute structure. 
+ */ +#define EXTATTR_NEXT(eap) \ + ((struct extattr *)(((void *)(eap)) + (eap)->ea_length)) +#define EXTATTR_CONTENT(eap) (((void *)(eap)) + EXTATTR_BASE_LENGTH(eap)) +#define EXTATTR_CONTENT_SIZE(eap) \ + ((eap)->ea_length - EXTATTR_BASE_LENGTH(eap) - (eap)->ea_contentpadlen) +#define EXTATTR_BASE_LENGTH(eap) \ + ((sizeof(struct extattr) + (eap)->ea_namelength + 7) & ~7) +#define EXTATTR_SET_LENGTHS(eap, contentsize) do { \ + KASSERT(((eap)->ea_name[0] != 0), \ + ("Must initialize name before setting lengths")); \ + (eap)->ea_namelength = strlen((eap)->ea_name); \ + (eap)->ea_contentpadlen = ((contentsize) % 8) ? \ + 8 - ((contentsize) % 8) : 0; \ + (eap)->ea_length = EXTATTR_BASE_LENGTH(eap) + \ + (contentsize) + (eap)->ea_contentpadlen; \ +} while (0) + +#ifdef _KERNEL + +#include + +struct vnode; +LIST_HEAD(ufs_extattr_list_head, ufs_extattr_list_entry); +struct ufs_extattr_list_entry { + LIST_ENTRY(ufs_extattr_list_entry) uele_entries; + struct ufs_extattr_fileheader uele_fileheader; + int uele_attrnamespace; + char uele_attrname[UFS_EXTATTR_MAXEXTATTRNAME]; + struct vnode *uele_backing_vnode; +}; + +struct ucred; +struct ufs_extattr_per_mount { + struct sx uepm_lock; + struct ufs_extattr_list_head uepm_list; + struct ucred *uepm_ucred; + int uepm_flags; +}; + +struct vop_getextattr_args; +struct vop_deleteextattr_args; +struct vop_setextattr_args; + +void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm); +void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm); +int ufs_extattr_start(struct mount *mp, struct thread *td); +int ufs_extattr_autostart(struct mount *mp, struct thread *td); +int ufs_extattr_stop(struct mount *mp, struct thread *td); +int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename, + int attrnamespace, const char *attrname); +int ufs_getextattr(struct vop_getextattr_args *ap); +int ufs_deleteextattr(struct vop_deleteextattr_args *ap); +int ufs_setextattr(struct vop_setextattr_args *ap); +void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td); + +#else + +/* User-level definition of KASSERT for macros above */ +#define KASSERT(cond, str) do { \ + if (!(cond)) { printf("panic: "); printf(str); printf("\n"); exit(1); }\ +} while (0) + +#endif /* !_KERNEL */ + +#endif /* !_UFS_UFS_EXTATTR_H_ */ diff --git a/Dump/ufs/ufs/gjournal.h b/Dump/ufs/ufs/gjournal.h new file mode 100644 index 0000000..cd57fd8 --- /dev/null +++ b/Dump/ufs/ufs/gjournal.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2005-2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: releng/11.2/sys/ufs/ufs/gjournal.h 262678 2014-03-02 02:52:34Z pfg $ + */ + +#ifndef _UFS_UFS_GJOURNAL_H_ +#define _UFS_UFS_GJOURNAL_H_ + +/* + * GEOM journal function prototypes. + */ +void ufs_gjournal_orphan(struct vnode *fvp); +void ufs_gjournal_close(struct vnode *vp); +#endif /* !_UFS_UFS_GJOURNAL_H_ */ diff --git a/Dump/ufs/ufs/inode.h b/Dump/ufs/ufs/inode.h new file mode 100644 index 0000000..6c9bd06 --- /dev/null +++ b/Dump/ufs/ufs/inode.h @@ -0,0 +1,207 @@ +/*- + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.9 (Berkeley) 5/14/95 + * $FreeBSD: releng/11.2/sys/ufs/ufs/inode.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_UFS_INODE_H_ +#define _UFS_UFS_INODE_H_ + +#include +#include +#include + +/* + * This must agree with the definition in . + */ +#define doff_t int32_t + +/* + * The inode is used to describe each active (or recently active) file in the + * UFS filesystem. It is composed of two types of information. The first part + * is the information that is needed only while the file is active (such as + * the identity of the file and linkage to speed its lookup). 
The second part + * is the permanent meta-data associated with the file which is read in + * from the permanent dinode from long term storage when the file becomes + * active, and is put back when the file is no longer being used. + * + * An inode may only be changed while holding either the exclusive + * vnode lock or the shared vnode lock and the vnode interlock. We use + * the latter only for "read" and "get" operations that require + * changing i_flag, or a timestamp. This locking protocol allows executing + * those operations without having to upgrade the vnode lock from shared to + * exclusive. + */ +struct inode { + TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */ + struct vnode *i_vnode;/* Vnode associated with this inode. */ + struct ufsmount *i_ump;/* Ufsmount point associated with this inode. */ + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + union { + struct dirhash *dirhash; /* Hashing for large directories. */ + daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ + } i_un; + /* + * The real copy of the on-disk inode. + */ + union { + struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ + struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ + } dinode_u; + + ino_t i_number; /* The identity of the inode. */ + u_int32_t i_flag; /* flags, see below */ + int i_effnlink; /* i_nlink when I/O completes */ + + + /* + * Side effects; used during directory lookup. + */ + int32_t i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + + int i_nextclustercg; /* last cg searched for cluster */ + + /* + * Data for extended attribute modification. + */ + u_char *i_ea_area; /* Pointer to malloced copy of EA area */ + unsigned i_ea_len; /* Length of i_ea_area */ + int i_ea_error; /* First errno in transaction */ + int i_ea_refs; /* Number of users of EA area */ + + /* + * Copies from the on-disk dinode itself. + */ + u_int64_t i_size; /* File byte count. */ + u_int64_t i_gen; /* Generation number. */ + u_int32_t i_flags; /* Status flags (chflags). */ + u_int32_t i_uid; /* File owner. */ + u_int32_t i_gid; /* File group. */ + u_int16_t i_mode; /* IFMT, permissions; see below. */ + int16_t i_nlink; /* File link count. */ +}; +/* + * These flags are kept in i_flag. + */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_UPDATE 0x0004 /* Modification time update request. */ +#define IN_MODIFIED 0x0008 /* Inode has been modified. */ +#define IN_NEEDSYNC 0x0010 /* Inode requires fsync. */ +#define IN_LAZYMOD 0x0020 /* Modified, but don't write yet. */ +#define IN_LAZYACCESS 0x0040 /* Process IN_ACCESS after the + suspension finished */ +#define IN_EA_LOCKED 0x0080 +#define IN_EA_LOCKWAIT 0x0100 + +#define IN_TRUNCATED 0x0200 /* Journaled truncation pending. 
*/ + +#define IN_UFS2 0x0400 /* UFS2 vs UFS1 */ + +#define i_dirhash i_un.dirhash +#define i_snapblklist i_un.snapblklist +#define i_din1 dinode_u.din1 +#define i_din2 dinode_u.din2 + +#ifdef _KERNEL + +#define ITOUMP(ip) ((ip)->i_ump) +#define ITODEV(ip) (ITOUMP(ip)->um_dev) +#define ITODEVVP(ip) (ITOUMP(ip)->um_devvp) +#define ITOFS(ip) (ITOUMP(ip)->um_fs) +#define ITOVFS(ip) ((ip)->i_vnode->v_mount) + +static inline _Bool +I_IS_UFS1(const struct inode *ip) +{ + + return ((ip->i_flag & IN_UFS2) == 0); +} + +static inline _Bool +I_IS_UFS2(const struct inode *ip) +{ + + return ((ip->i_flag & IN_UFS2) != 0); +} + +/* + * The DIP macro is used to access fields in the dinode that are + * not cached in the inode itself. + */ +#define DIP(ip, field) (I_IS_UFS1(ip) ? (ip)->i_din1->d##field : \ + (ip)->i_din2->d##field) +#define DIP_SET(ip, field, val) do { \ + if (I_IS_UFS1(ip)) \ + (ip)->i_din1->d##field = (val); \ + else \ + (ip)->i_din2->d##field = (val); \ + } while (0) + +#define SHORTLINK(ip) (I_IS_UFS1(ip) ? \ + (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db) +#define IS_SNAPSHOT(ip) ((ip)->i_flags & SF_SNAPSHOT) + +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + ufs2_daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ +}; + +/* Convert between inode pointers and vnode pointers. */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +/* Determine if soft dependencies are being done */ +#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & (MNT_SOFTDEP | MNT_SUJ)) +#define MOUNTEDSOFTDEP(mp) ((mp)->mnt_flag & (MNT_SOFTDEP | MNT_SUJ)) +#define DOINGSUJ(vp) ((vp)->v_mount->mnt_flag & MNT_SUJ) +#define MOUNTEDSUJ(mp) ((mp)->mnt_flag & MNT_SUJ) + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_int16_t ufid_len; /* Length of structure. */ + u_int16_t ufid_pad; /* Force 32-bit alignment. */ + uint32_t ufid_ino; /* File number (ino). */ + uint32_t ufid_gen; /* Generation number. */ +}; +#endif /* _KERNEL */ + +#endif /* !_UFS_UFS_INODE_H_ */ diff --git a/Dump/ufs/ufs/quota.h b/Dump/ufs/ufs/quota.h new file mode 100644 index 0000000..71cbb70 --- /dev/null +++ b/Dump/ufs/ufs/quota.h @@ -0,0 +1,259 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)quota.h 8.3 (Berkeley) 8/19/94 + * $FreeBSD: releng/11.2/sys/ufs/ufs/quota.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_UFS_QUOTA_H_ +#define _UFS_UFS_QUOTA_H_ + +/* + * Definitions for disk quotas imposed on the average user + * (big brother finally hits UNIX). + * + * The following constants define the amount of time given a user before the + * soft limits are treated as hard limits (usually resulting in an allocation + * failure). The timer is started when the user crosses their soft limit, it + * is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME (7*24*60*60) /* seconds in 1 week */ +#define MAX_DQ_TIME (7*24*60*60) /* seconds in 1 week */ + +/* + * The following constants define the usage of the quota file array in the + * ufsmount structure and dquot array in the inode structure. The semantics + * of the elements of these arrays are defined in the routine getinoquota; + * the remainder of the quota code treats them generically and need not be + * inspected when changing the size of the array. + */ +#define MAXQUOTAS 2 +#define USRQUOTA 0 /* element used for user quotas */ +#define GRPQUOTA 1 /* element used for group quotas */ + +/* + * Definitions for the default names of the quotas files. + */ +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "undefined", \ +} +#define QUOTAFILENAME "quota" +#define QUOTAGROUP "operator" + +/* + * Command definitions for the 'quotactl' system call. The commands are + * broken into a main command defined below and a subcommand that is used + * to convey the type of quota that is being manipulated (see above). + */ +#define SUBCMDMASK 0x00ff +#define SUBCMDSHIFT 8 +#define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) + +#define Q_QUOTAON 0x0100 /* enable quotas */ +#define Q_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_GETQUOTA32 0x0300 /* get limits and usage (32-bit version) */ +#define Q_SETQUOTA32 0x0400 /* set limits and usage (32-bit version) */ +#define Q_SETUSE32 0x0500 /* set usage (32-bit version) */ +#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ +#define Q_GETQUOTA 0x0700 /* get limits and usage (64-bit version) */ +#define Q_SETQUOTA 0x0800 /* set limits and usage (64-bit version) */ +#define Q_SETUSE 0x0900 /* set usage (64-bit version) */ +#define Q_GETQUOTASIZE 0x0A00 /* get bit-size of quota file fields */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. The setquota system call establishes + * the vnode for each quota file (a pointer is retained in the ufsmount + * structure). 
+ */ +struct dqblk32 { + u_int32_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ + u_int32_t dqb_bsoftlimit; /* preferred limit on disk blks */ + u_int32_t dqb_curblocks; /* current block count */ + u_int32_t dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_int32_t dqb_isoftlimit; /* preferred inode limit */ + u_int32_t dqb_curinodes; /* current # allocated inodes */ + int32_t dqb_btime; /* time limit for excessive disk use */ + int32_t dqb_itime; /* time limit for excessive files */ +}; + +struct dqblk64 { + u_int64_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ + u_int64_t dqb_bsoftlimit; /* preferred limit on disk blks */ + u_int64_t dqb_curblocks; /* current block count */ + u_int64_t dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_int64_t dqb_isoftlimit; /* preferred inode limit */ + u_int64_t dqb_curinodes; /* current # allocated inodes */ + int64_t dqb_btime; /* time limit for excessive disk use */ + int64_t dqb_itime; /* time limit for excessive files */ +}; + +#define dqblk dqblk64 + +#define Q_DQHDR64_MAGIC "QUOTA64" +#define Q_DQHDR64_VERSION 0x20081104 + +struct dqhdr64 { + char dqh_magic[8]; /* Q_DQHDR64_MAGIC */ + uint32_t dqh_version; /* Q_DQHDR64_VERSION */ + uint32_t dqh_hdrlen; /* header length */ + uint32_t dqh_reclen; /* record length */ + char dqh_unused[44]; /* reserved for future extension */ +}; + +#ifdef _KERNEL + +#include + +/* + * The following structure records disk usage for a user or group on a + * filesystem. There is one allocated for each quota that exists on any + * filesystem for the current user or group. A cache is kept of recently + * used entries. + * (h) protected by dqhlock + */ +struct dquot { + LIST_ENTRY(dquot) dq_hash; /* (h) hash list */ + TAILQ_ENTRY(dquot) dq_freelist; /* (h) free list */ + struct mtx dq_lock; /* lock for concurrency */ + u_int16_t dq_flags; /* flags, see below */ + u_int16_t dq_type; /* quota type of this dquot */ + u_int32_t dq_cnt; /* (h) count of active references */ + u_int32_t dq_id; /* identifier this applies to */ + struct ufsmount *dq_ump; /* (h) filesystem that this is + taken from */ + struct dqblk64 dq_dqb; /* actual usage & quotas */ +}; +/* + * Flag values. + */ +#define DQ_LOCK 0x01 /* this quota locked (no MODS) */ +#define DQ_WANT 0x02 /* wakeup on unlock */ +#define DQ_MOD 0x04 /* this quota modified since read */ +#define DQ_FAKE 0x08 /* no limits here, just usage */ +#define DQ_BLKS 0x10 /* has been warned about blk limit */ +#define DQ_INODS 0x20 /* has been warned about inode limit */ +/* + * Shorthand notation. + */ +#define dq_bhardlimit dq_dqb.dqb_bhardlimit +#define dq_bsoftlimit dq_dqb.dqb_bsoftlimit +#define dq_curblocks dq_dqb.dqb_curblocks +#define dq_ihardlimit dq_dqb.dqb_ihardlimit +#define dq_isoftlimit dq_dqb.dqb_isoftlimit +#define dq_curinodes dq_dqb.dqb_curinodes +#define dq_btime dq_dqb.dqb_btime +#define dq_itime dq_dqb.dqb_itime + +/* + * If the system has never checked for a quota for this file, then it is + * set to NODQUOT. Once a write attempt is made the inode pointer is set + * to reference a dquot structure. + */ +#define NODQUOT NULL + +/* + * Flags to chkdq() and chkiq() + */ +#define FORCE 0x01 /* force usage changes independent of limits */ +#define CHOWN 0x02 /* (advisory) change initiated by chown */ + +/* + * Macros to avoid subroutine calls to trivial functions. 
+ */ +#ifdef DIAGNOSTIC +#define DQREF(dq) dqref(dq) +#else +#define DQREF(dq) (dq)->dq_cnt++ +#endif + +#define DQI_LOCK(dq) mtx_lock(&(dq)->dq_lock) +#define DQI_UNLOCK(dq) mtx_unlock(&(dq)->dq_lock) + +#define DQI_WAIT(dq, prio, msg) do { \ + while ((dq)->dq_flags & DQ_LOCK) { \ + (dq)->dq_flags |= DQ_WANT; \ + (void) msleep((dq), \ + &(dq)->dq_lock, (prio), (msg), 0); \ + } \ +} while (0) + +#define DQI_WAKEUP(dq) do { \ + if ((dq)->dq_flags & DQ_WANT) \ + wakeup((dq)); \ + (dq)->dq_flags &= ~(DQ_WANT|DQ_LOCK); \ +} while (0) + +struct inode; +struct mount; +struct thread; +struct ucred; +struct vnode; + +int chkdq(struct inode *, int64_t, struct ucred *, int); +int chkiq(struct inode *, int, struct ucred *, int); +void dqinit(void); +void dqrele(struct vnode *, struct dquot *); +void dquninit(void); +int getinoquota(struct inode *); +int qsync(struct mount *); +int qsyncvp(struct vnode *); +int quotaoff(struct thread *, struct mount *, int); +int quotaon(struct thread *, struct mount *, int, void *); +int getquota32(struct thread *, struct mount *, u_long, int, void *); +int setquota32(struct thread *, struct mount *, u_long, int, void *); +int setuse32(struct thread *, struct mount *, u_long, int, void *); +int getquota(struct thread *, struct mount *, u_long, int, void *); +int setquota(struct thread *, struct mount *, u_long, int, void *); +int setuse(struct thread *, struct mount *, u_long, int, void *); +int getquotasize(struct thread *, struct mount *, u_long, int, void *); +vfs_quotactl_t ufs_quotactl; + +#ifdef SOFTUPDATES +int quotaref(struct vnode *, struct dquot **); +void quotarele(struct dquot **); +void quotaadj(struct dquot **, struct ufsmount *, int64_t); +#endif /* SOFTUPDATES */ + +#else /* !_KERNEL */ + +#include + +__BEGIN_DECLS +int quotactl(const char *, int, int, void *); +__END_DECLS + +#endif /* _KERNEL */ + +#endif /* !_UFS_UFS_QUOTA_H_ */ diff --git a/Dump/ufs/ufs/ufs_acl.c b/Dump/ufs/ufs/ufs_acl.c new file mode 100644 index 0000000..5c7b11a --- /dev/null +++ b/Dump/ufs/ufs/ufs_acl.c @@ -0,0 +1,698 @@ +/*- + * Copyright (c) 1999-2003 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +/* + * Support for POSIX.1e access control lists: UFS-specific support functions. + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_acl.c 306553 2016-10-01 09:19:43Z kib $"); + +#include "opt_ufs.h" +#include "opt_quota.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef UFS_ACL + +FEATURE(ufs_acl, "ACL support for UFS"); + +/* + * Synchronize an ACL and an inode by copying over appropriate inode fields + * to the passed ACL. Assumes an ACL that would satisfy acl_posix1e_check(), + * and may panic if not. + */ +void +ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl) +{ + struct acl_entry *acl_mask, *acl_group_obj; + int i; + + /* + * Update ACL_USER_OBJ, ACL_OTHER, but simply identify ACL_MASK + * and ACL_GROUP_OBJ for use after we know whether ACL_MASK is + * present. + */ + acl_mask = NULL; + acl_group_obj = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( + ACL_USER_OBJ, ip->i_mode); + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_GROUP_OBJ: + acl_group_obj = &acl->acl_entry[i]; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_OTHER: + acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( + ACL_OTHER, ip->i_mode); + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + break; + + default: + panic("ufs_sync_acl_from_inode(): bad ae_tag"); + } + } + + if (acl_group_obj == NULL) + panic("ufs_sync_acl_from_inode(): no ACL_GROUP_OBJ"); + + if (acl_mask == NULL) { + /* + * There is no ACL_MASK, so update ACL_GROUP_OBJ. + */ + acl_group_obj->ae_perm = acl_posix1e_mode_to_perm( + ACL_GROUP_OBJ, ip->i_mode); + } else { + /* + * Update the ACL_MASK entry instead of ACL_GROUP_OBJ. + */ + acl_mask->ae_perm = acl_posix1e_mode_to_perm(ACL_GROUP_OBJ, + ip->i_mode); + } +} + +/* + * Calculate what the inode mode should look like based on an authoritative + * ACL for the inode. Replace only the fields in the inode that the ACL + * can represent. + */ +void +ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip) +{ + + ip->i_mode &= ACL_PRESERVE_MASK; + ip->i_mode |= acl_posix1e_acl_to_mode(acl); + DIP_SET(ip, i_mode, ip->i_mode); +} + +/* + * Retrieve NFSv4 ACL, skipping access checks. Must be used in UFS code + * instead of VOP_GETACL() when we don't want to be restricted by the user + * not having ACL_READ_ACL permission, e.g. when calculating inherited ACL + * or in ufs_vnops.c:ufs_accessx(). + */ +int +ufs_getacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td) +{ + int error, len; + struct inode *ip = VTOI(vp); + + len = sizeof(*aclp); + bzero(aclp, len); + + error = vn_extattr_get(vp, IO_NODELOCKED, + NFS4_ACL_EXTATTR_NAMESPACE, NFS4_ACL_EXTATTR_NAME, + &len, (char *) aclp, td); + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + if (error == ENOATTR) { + /* + * Legitimately no ACL set on object, purely + * emulate it through the inode. + */ + acl_nfs4_sync_acl_from_mode(aclp, ip->i_mode, ip->i_uid); + + return (0); + } + + if (error) + return (error); + + if (len != sizeof(*aclp)) { + /* + * A short (or long) read, meaning that for + * some reason the ACL is corrupted. 
Return + * EPERM since the object DAC protections + * are unsafe. + */ + printf("ufs_getacl_nfs4(): Loaded invalid ACL (" + "%d bytes), inumber %ju on %s\n", len, + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); + + return (EPERM); + } + + error = acl_nfs4_check(aclp, vp->v_type == VDIR); + if (error) { + printf("ufs_getacl_nfs4(): Loaded invalid ACL " + "(failed acl_nfs4_check), inumber %ju on %s\n", + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); + + return (EPERM); + } + + return (0); +} + +static int +ufs_getacl_nfs4(struct vop_getacl_args *ap) +{ + int error; + + if ((ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) == 0) + return (EINVAL); + + error = VOP_ACCESSX(ap->a_vp, VREAD_ACL, ap->a_td->td_ucred, ap->a_td); + if (error) + return (error); + + error = ufs_getacl_nfs4_internal(ap->a_vp, ap->a_aclp, ap->a_td); + + return (error); +} + +/* + * Read POSIX.1e ACL from an EA. Return error if its not found + * or if any other error has occurred. + */ +static int +ufs_get_oldacl(acl_type_t type, struct oldacl *old, struct vnode *vp, + struct thread *td) +{ + int error, len; + struct inode *ip = VTOI(vp); + + len = sizeof(*old); + + switch (type) { + case ACL_TYPE_ACCESS: + error = vn_extattr_get(vp, IO_NODELOCKED, + POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, + POSIX1E_ACL_ACCESS_EXTATTR_NAME, &len, (char *) old, + td); + break; + case ACL_TYPE_DEFAULT: + if (vp->v_type != VDIR) + return (EINVAL); + error = vn_extattr_get(vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, &len, (char *) old, + td); + break; + default: + return (EINVAL); + } + + if (error != 0) + return (error); + + if (len != sizeof(*old)) { + /* + * A short (or long) read, meaning that for some reason + * the ACL is corrupted. Return EPERM since the object + * DAC protections are unsafe. + */ + printf("ufs_get_oldacl(): Loaded invalid ACL " + "(len = %d), inumber %ju on %s\n", len, + (uintmax_t)ip->i_number, ITOFS(ip)->fs_fsmnt); + return (EPERM); + } + + return (0); +} + +/* + * Retrieve the ACL on a file. + * + * As part of the ACL is stored in the inode, and the rest in an EA, + * assemble both into a final ACL product. Right now this is not done + * very efficiently. + */ +static int +ufs_getacl_posix1e(struct vop_getacl_args *ap) +{ + struct inode *ip = VTOI(ap->a_vp); + int error; + struct oldacl *old; + + /* + * XXX: If ufs_getacl() should work on file systems not supporting + * ACLs, remove this check. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EINVAL); + + old = malloc(sizeof(*old), M_ACL, M_WAITOK | M_ZERO); + + /* + * Attempt to retrieve the ACL from the extended attributes. + */ + error = ufs_get_oldacl(ap->a_type, old, ap->a_vp, ap->a_td); + switch (error) { + /* + * XXX: If ufs_getacl() should work on filesystems + * without the EA configured, add case EOPNOTSUPP here. + */ + case ENOATTR: + switch (ap->a_type) { + case ACL_TYPE_ACCESS: + /* + * Legitimately no ACL set on object, purely + * emulate it through the inode. These fields will + * be updated when the ACL is synchronized with + * the inode later. 
+ */ + old->acl_cnt = 3; + old->acl_entry[0].ae_tag = ACL_USER_OBJ; + old->acl_entry[0].ae_id = ACL_UNDEFINED_ID; + old->acl_entry[0].ae_perm = ACL_PERM_NONE; + old->acl_entry[1].ae_tag = ACL_GROUP_OBJ; + old->acl_entry[1].ae_id = ACL_UNDEFINED_ID; + old->acl_entry[1].ae_perm = ACL_PERM_NONE; + old->acl_entry[2].ae_tag = ACL_OTHER; + old->acl_entry[2].ae_id = ACL_UNDEFINED_ID; + old->acl_entry[2].ae_perm = ACL_PERM_NONE; + break; + + case ACL_TYPE_DEFAULT: + /* + * Unlike ACL_TYPE_ACCESS, there is no relationship + * between the inode contents and the ACL, and it is + * therefore possible for the request for the ACL + * to fail since the ACL is undefined. In this + * situation, return success and an empty ACL, + * as required by POSIX.1e. + */ + old->acl_cnt = 0; + break; + } + /* FALLTHROUGH */ + case 0: + error = acl_copy_oldacl_into_acl(old, ap->a_aclp); + if (error != 0) + break; + + if (ap->a_type == ACL_TYPE_ACCESS) + ufs_sync_acl_from_inode(ip, ap->a_aclp); + default: + break; + } + + free(old, M_ACL); + return (error); +} + +int +ufs_getacl(ap) + struct vop_getacl_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + + if ((ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) == 0) + return (EOPNOTSUPP); + + if (ap->a_type == ACL_TYPE_NFS4) + return (ufs_getacl_nfs4(ap)); + + return (ufs_getacl_posix1e(ap)); +} + +/* + * Set NFSv4 ACL without doing any access checking. This is required + * e.g. by the UFS code that implements ACL inheritance, or from + * ufs_vnops.c:ufs_chmod(), as some of the checks have to be skipped + * in that case, and others are redundant. + */ +int +ufs_setacl_nfs4_internal(struct vnode *vp, struct acl *aclp, struct thread *td) +{ + int error; + mode_t mode; + struct inode *ip = VTOI(vp); + + KASSERT(acl_nfs4_check(aclp, vp->v_type == VDIR) == 0, + ("invalid ACL passed to ufs_setacl_nfs4_internal")); + + if (acl_nfs4_is_trivial(aclp, ip->i_uid)) { + error = vn_extattr_rm(vp, IO_NODELOCKED, + NFS4_ACL_EXTATTR_NAMESPACE, NFS4_ACL_EXTATTR_NAME, td); + + /* + * An attempt to remove ACL from a file that didn't have + * any extended entries is not an error. + */ + if (error == ENOATTR) + error = 0; + + } else { + error = vn_extattr_set(vp, IO_NODELOCKED, + NFS4_ACL_EXTATTR_NAMESPACE, NFS4_ACL_EXTATTR_NAME, + sizeof(*aclp), (char *) aclp, td); + } + + /* + * Map lack of attribute definition in UFS_EXTATTR into lack of + * support for ACLs on the filesystem. + */ + if (error == ENOATTR) + return (EOPNOTSUPP); + + if (error) + return (error); + + mode = ip->i_mode; + + acl_nfs4_sync_mode_from_acl(&mode, aclp); + + ip->i_mode &= ACL_PRESERVE_MASK; + ip->i_mode |= mode; + DIP_SET(ip, i_mode, ip->i_mode); + ip->i_flag |= IN_CHANGE; + + VN_KNOTE_UNLOCKED(vp, NOTE_ATTRIB); + + error = UFS_UPDATE(vp, 0); + return (error); +} + +static int +ufs_setacl_nfs4(struct vop_setacl_args *ap) +{ + int error; + struct inode *ip = VTOI(ap->a_vp); + + if ((ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) == 0) + return (EINVAL); + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + if (ap->a_aclp == NULL) + return (EINVAL); + + error = VOP_ACLCHECK(ap->a_vp, ap->a_type, ap->a_aclp, ap->a_cred, + ap->a_td); + if (error) + return (error); + + /* + * Authorize the ACL operation. + */ + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + + /* + * Must hold VWRITE_ACL or have appropriate privilege. 
+ */ + if ((error = VOP_ACCESSX(ap->a_vp, VWRITE_ACL, ap->a_cred, ap->a_td))) + return (error); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries. + * Make sure it has enough room for that - splitting every entry + * into two and appending "canonical six" entries at the end. + */ + if (ap->a_aclp->acl_cnt > (ACL_MAX_ENTRIES - 6) / 2) + return (ENOSPC); + + error = ufs_setacl_nfs4_internal(ap->a_vp, ap->a_aclp, ap->a_td); + + return (error); +} + +/* + * Set the ACL on a file. + * + * As part of the ACL is stored in the inode, and the rest in an EA, + * this is necessarily non-atomic, and has complex authorization. + * As ufs_setacl() includes elements of ufs_chown() and ufs_chmod(), + * a fair number of different access checks may be required to go ahead + * with the operation at all. + */ +static int +ufs_setacl_posix1e(struct vop_setacl_args *ap) +{ + struct inode *ip = VTOI(ap->a_vp); + int error; + struct oldacl *old; + + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EINVAL); + + /* + * If this is a set operation rather than a delete operation, + * invoke VOP_ACLCHECK() on the passed ACL to determine if it is + * valid for the target. This will include a check on ap->a_type. + */ + if (ap->a_aclp != NULL) { + /* + * Set operation. + */ + error = VOP_ACLCHECK(ap->a_vp, ap->a_type, ap->a_aclp, + ap->a_cred, ap->a_td); + if (error != 0) + return (error); + } else { + /* + * Delete operation. + * POSIX.1e allows only deletion of the default ACL on a + * directory (ACL_TYPE_DEFAULT). + */ + if (ap->a_type != ACL_TYPE_DEFAULT) + return (EINVAL); + if (ap->a_vp->v_type != VDIR) + return (ENOTDIR); + } + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + /* + * Authorize the ACL operation. + */ + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + + /* + * Must hold VADMIN (be file owner) or have appropriate privilege. + */ + if ((error = VOP_ACCESS(ap->a_vp, VADMIN, ap->a_cred, ap->a_td))) + return (error); + + switch(ap->a_type) { + case ACL_TYPE_ACCESS: + old = malloc(sizeof(*old), M_ACL, M_WAITOK | M_ZERO); + error = acl_copy_acl_into_oldacl(ap->a_aclp, old); + if (error == 0) { + error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, + POSIX1E_ACL_ACCESS_EXTATTR_NAME, sizeof(*old), + (char *) old, ap->a_td); + } + free(old, M_ACL); + break; + + case ACL_TYPE_DEFAULT: + if (ap->a_aclp == NULL) { + error = vn_extattr_rm(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, ap->a_td); + /* + * Attempting to delete a non-present default ACL + * will return success for portability purposes. + * (TRIX) + * + * XXX: Note that since we can't distinguish + * "that EA is not supported" from "that EA is not + * defined", the success case here overlaps the + * the ENOATTR->EOPNOTSUPP case below. + */ + if (error == ENOATTR) + error = 0; + } else { + old = malloc(sizeof(*old), M_ACL, M_WAITOK | M_ZERO); + error = acl_copy_acl_into_oldacl(ap->a_aclp, old); + if (error == 0) { + error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, + sizeof(*old), (char *) old, ap->a_td); + } + free(old, M_ACL); + } + break; + + default: + error = EINVAL; + } + /* + * Map lack of attribute definition in UFS_EXTATTR into lack of + * support for ACLs on the filesystem. 
+ */ + if (error == ENOATTR) + return (EOPNOTSUPP); + if (error != 0) + return (error); + + if (ap->a_type == ACL_TYPE_ACCESS) { + /* + * Now that the EA is successfully updated, update the + * inode and mark it as changed. + */ + ufs_sync_inode_from_acl(ap->a_aclp, ip); + ip->i_flag |= IN_CHANGE; + error = UFS_UPDATE(ap->a_vp, 0); + } + + VN_KNOTE_UNLOCKED(ap->a_vp, NOTE_ATTRIB); + return (error); +} + +int +ufs_setacl(ap) + struct vop_setacl_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + if ((ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) == 0) + return (EOPNOTSUPP); + + if (ap->a_type == ACL_TYPE_NFS4) + return (ufs_setacl_nfs4(ap)); + + return (ufs_setacl_posix1e(ap)); +} + +static int +ufs_aclcheck_nfs4(struct vop_aclcheck_args *ap) +{ + int is_directory = 0; + + if ((ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) == 0) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries. + * Make sure it has enough room for that - splitting every entry + * into two and appending "canonical six" entries at the end. + */ + if (ap->a_aclp->acl_cnt > (ACL_MAX_ENTRIES - 6) / 2) + return (ENOSPC); + + if (ap->a_vp->v_type == VDIR) + is_directory = 1; + + return (acl_nfs4_check(ap->a_aclp, is_directory)); +} + +static int +ufs_aclcheck_posix1e(struct vop_aclcheck_args *ap) +{ + + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EINVAL); + + /* + * Verify we understand this type of ACL, and that it applies + * to this kind of object. + * Rely on the acl_posix1e_check() routine to verify the contents. + */ + switch(ap->a_type) { + case ACL_TYPE_ACCESS: + break; + + case ACL_TYPE_DEFAULT: + if (ap->a_vp->v_type != VDIR) + return (EINVAL); + break; + + default: + return (EINVAL); + } + + if (ap->a_aclp->acl_cnt > OLDACL_MAX_ENTRIES) + return (EINVAL); + + return (acl_posix1e_check(ap->a_aclp)); +} + +/* + * Check the validity of an ACL for a file. + */ +int +ufs_aclcheck(ap) + struct vop_aclcheck_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + + if ((ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) == 0) + return (EOPNOTSUPP); + + if (ap->a_type == ACL_TYPE_NFS4) + return (ufs_aclcheck_nfs4(ap)); + + return (ufs_aclcheck_posix1e(ap)); +} + +#endif /* !UFS_ACL */ diff --git a/Dump/ufs/ufs/ufs_bmap.c b/Dump/ufs/ufs/ufs_bmap.c new file mode 100644 index 0000000..501529d --- /dev/null +++ b/Dump/ufs/ufs/ufs_bmap.c @@ -0,0 +1,384 @@ +/*- + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_bmap.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Bmap converts the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + ufs2_daddr_t blkno; + int error; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_bop != NULL) + *ap->a_bop = &VFSTOUFS(ap->a_vp->v_mount)->um_devvp->v_bufobj; + if (ap->a_bnp == NULL) + return (0); + + error = ufs_bmaparray(ap->a_vp, ap->a_bn, &blkno, NULL, + ap->a_runp, ap->a_runb); + *ap->a_bnp = blkno; + return (error); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. Indirect blocks are addressed by the negative + * address of the first data block to which they point. Double indirect blocks + * are addressed by one less than the address of the first indirect block to + * which they point. Triple indirect blocks are addressed by one less than + * the address of the first double indirect block to which they point. + * + * ufs_bmaparray does the bmap conversion, and if requested returns the + * array of logical blocks which must be traversed to get to a block. + * Each entry contains the offset into that block that gets you to the + * next block and the disk address of the block (if it is assigned). 
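+ * For example, with the addressing scheme above the single indirect block,
+ * which covers data blocks NDADDR through NDADDR + MNINDIR(ump) - 1, is
+ * cached at logical block -NDADDR, and the double indirect block at
+ * -(NDADDR + MNINDIR(ump)) - 1.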
+ */ + +int +ufs_bmaparray(vp, bn, bnp, nbp, runp, runb) + struct vnode *vp; + ufs2_daddr_t bn; + ufs2_daddr_t *bnp; + struct buf *nbp; + int *runp; + int *runb; +{ + struct inode *ip; + struct buf *bp; + struct ufsmount *ump; + struct mount *mp; + struct indir a[NIADDR+1], *ap; + ufs2_daddr_t daddr; + ufs_lbn_t metalbn; + int error, num, maxrun = 0; + int *nump; + + ap = NULL; + ip = VTOI(vp); + mp = vp->v_mount; + ump = VFSTOUFS(mp); + + if (runp) { + maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1; + *runp = 0; + } + + if (runb) { + *runb = 0; + } + + + ap = a; + nump = &num; + error = ufs_getlbns(vp, bn, ap, nump); + if (error) + return (error); + + num = *nump; + if (num == 0) { + if (bn >= 0 && bn < NDADDR) { + *bnp = blkptrtodb(ump, DIP(ip, i_db[bn])); + } else if (bn < 0 && bn >= -NXADDR) { + *bnp = blkptrtodb(ump, ip->i_din2->di_extb[-1 - bn]); + if (*bnp == 0) + *bnp = -1; + if (nbp == NULL) + panic("ufs_bmaparray: mapping ext data"); + nbp->b_xflags |= BX_ALTDATA; + return (0); + } else { + panic("ufs_bmaparray: blkno out of range"); + } + /* + * Since this is FFS independent code, we are out of + * scope for the definitions of BLK_NOCOPY and + * BLK_SNAP, but we do know that they will fall in + * the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts + * are made to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & SF_SNAPSHOT) && DIP(ip, i_db[bn]) > 0 && + DIP(ip, i_db[bn]) < ump->um_seqinc) { + *bnp = -1; + } else if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } else if (runp) { + ufs2_daddr_t bnb = bn; + for (++bn; bn < NDADDR && *runp < maxrun && + is_sequential(ump, DIP(ip, i_db[bn - 1]), + DIP(ip, i_db[bn])); + ++bn, ++*runp); + bn = bnb; + if (runb && (bn > 0)) { + for (--bn; (bn >= 0) && (*runb < maxrun) && + is_sequential(ump, DIP(ip, i_db[bn]), + DIP(ip, i_db[bn+1])); + --bn, ++*runb); + } + } + return (0); + } + + + /* Get disk address out of indirect block array */ + daddr = DIP(ip, i_ib[ap->in_off]); + + for (bp = NULL, ++ap; --num; ++ap) { + /* + * Exit the loop if there is no disk address assigned yet and + * the indirect block isn't in the cache, or if we were + * looking for an indirect block and we've found it. + */ + + metalbn = ap->in_lbn; + if ((daddr == 0 && !incore(&vp->v_bufobj, metalbn)) || metalbn == bn) + break; + /* + * If we get here, we've either got the block in the cache + * or we have a disk address for it, go fetch it.
+ */ + if (bp) + bqrelse(bp); + + bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0, 0); + if ((bp->b_flags & B_CACHE) == 0) { +#ifdef INVARIANTS + if (!daddr) + panic("ufs_bmaparray: indirect block not in cache"); +#endif + bp->b_blkno = blkptrtodb(ump, daddr); + bp->b_iocmd = BIO_READ; + bp->b_flags &= ~B_INVAL; + bp->b_ioflags &= ~BIO_ERROR; + vfs_busy_pages(bp, 0); + bp->b_iooffset = dbtob(bp->b_blkno); + bstrategy(bp); +#ifdef RACCT + if (racct_enable) { + PROC_LOCK(curproc); + racct_add_buf(curproc, bp, 0); + PROC_UNLOCK(curproc); + } +#endif /* RACCT */ + curthread->td_ru.ru_inblock++; + error = bufwait(bp); + if (error) { + brelse(bp); + return (error); + } + } + + if (I_IS_UFS1(ip)) { + daddr = ((ufs1_daddr_t *)bp->b_data)[ap->in_off]; + if (num == 1 && daddr && runp) { + for (bn = ap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ((ufs1_daddr_t *)bp->b_data)[bn - 1], + ((ufs1_daddr_t *)bp->b_data)[bn]); + ++bn, ++*runp); + bn = ap->in_off; + if (runb && bn) { + for (--bn; bn >= 0 && *runb < maxrun && + is_sequential(ump, + ((ufs1_daddr_t *)bp->b_data)[bn], + ((ufs1_daddr_t *)bp->b_data)[bn+1]); + --bn, ++*runb); + } + } + continue; + } + daddr = ((ufs2_daddr_t *)bp->b_data)[ap->in_off]; + if (num == 1 && daddr && runp) { + for (bn = ap->in_off + 1; + bn < MNINDIR(ump) && *runp < maxrun && + is_sequential(ump, + ((ufs2_daddr_t *)bp->b_data)[bn - 1], + ((ufs2_daddr_t *)bp->b_data)[bn]); + ++bn, ++*runp); + bn = ap->in_off; + if (runb && bn) { + for (--bn; bn >= 0 && *runb < maxrun && + is_sequential(ump, + ((ufs2_daddr_t *)bp->b_data)[bn], + ((ufs2_daddr_t *)bp->b_data)[bn + 1]); + --bn, ++*runb); + } + } + } + if (bp) + bqrelse(bp); + + /* + * Since this is FFS independent code, we are out of scope for the + * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they + * will fall in the range 1..um_seqinc, so we use that test and + * return a request for a zeroed out buffer if attempts are made + * to read a BLK_NOCOPY or BLK_SNAP block. + */ + if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){ + *bnp = -1; + return (0); + } + *bnp = blkptrtodb(ump, daddr); + if (*bnp == 0) { + if (ip->i_flags & SF_SNAPSHOT) + *bnp = blkptrtodb(ump, bn * ump->um_seqinc); + else + *bnp = -1; + } + return (0); +} + +/* + * Create an array of logical block number/offset pairs which represent the + * path of indirect blocks required to access a data block. The first "pair" + * contains the logical block number of the appropriate single, double or + * triple indirect block and the offset into the inode indirect block array. + * Note, the logical block number of the inode single/double/triple indirect + * block appears twice in the array, once with the offset into the i_ib and + * once with the offset into the page itself. + */ +int +ufs_getlbns(vp, bn, ap, nump) + struct vnode *vp; + ufs2_daddr_t bn; + struct indir *ap; + int *nump; +{ + ufs2_daddr_t blockcnt; + ufs_lbn_t metalbn, realbn; + struct ufsmount *ump; + int i, numlevels, off; + + ump = VFSTOUFS(vp->v_mount); + if (nump) + *nump = 0; + numlevels = 0; + realbn = bn; + if (bn < 0) + bn = -bn; + + /* The first NDADDR blocks are direct blocks. */ + if (bn < NDADDR) + return (0); + + /* + * Determine the number of levels of indirection. After this loop + * is done, blockcnt indicates the number of data blocks possible + * at the previous level of indirection, and NIADDR - i is the number + * of levels of indirection needed to locate the requested block. 
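+ * For example, a block that lies under the single indirect block makes the
+ * loop exit on its first pass (i == NIADDR, blockcnt == MNINDIR(ump)), so
+ * the first offset recorded below is NIADDR - i == 0, selecting i_ib[0].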
+ */ + for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { + if (i == 0) + return (EFBIG); + blockcnt *= MNINDIR(ump); + if (bn < blockcnt) + break; + } + + /* Calculate the address of the first meta-block. */ + if (realbn >= 0) + metalbn = -(realbn - bn + NIADDR - i); + else + metalbn = -(-realbn - bn + NIADDR - i); + + /* + * At each iteration, off is the offset into the bap array which is + * an array of disk addresses at the current level of indirection. + * The logical block number and the offset in that block are stored + * into the argument array. + */ + ap->in_lbn = metalbn; + ap->in_off = off = NIADDR - i; + ap++; + for (++numlevels; i <= NIADDR; i++) { + /* If searching for a meta-data block, quit when found. */ + if (metalbn == realbn) + break; + + blockcnt /= MNINDIR(ump); + off = (bn / blockcnt) % MNINDIR(ump); + + ++numlevels; + ap->in_lbn = metalbn; + ap->in_off = off; + ++ap; + + metalbn -= -1 + off * blockcnt; + } + if (nump) + *nump = numlevels; + return (0); +} diff --git a/Dump/ufs/ufs/ufs_dirhash.c b/Dump/ufs/ufs/ufs_dirhash.c new file mode 100644 index 0000000..18f7cc9 --- /dev/null +++ b/Dump/ufs/ufs/ufs_dirhash.c @@ -0,0 +1,1324 @@ +/*- + * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This implements a hash-based lookup scheme for UFS directories. + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_dirhash.c 326845 2017-12-14 11:41:12Z kib $"); + +#include "opt_ufs.h" + +#ifdef UFS_DIRHASH + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1)) +#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1)) +#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) +#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? 
DH_NFSTATS : (n)) + +static MALLOC_DEFINE(M_DIRHASH, "ufs_dirhash", "UFS directory hash tables"); + +static int ufs_mindirhashsize = DIRBLKSIZ * 5; +SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW, + &ufs_mindirhashsize, + 0, "minimum directory size in bytes for which to use hashed lookup"); +static int ufs_dirhashmaxmem = 2 * 1024 * 1024; /* NOTE: initial value. It is + tuned in ufsdirhash_init() */ +SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem, + 0, "maximum allowed dirhash memory usage"); +static int ufs_dirhashmem; +SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem, + 0, "current dirhash memory usage"); +static int ufs_dirhashcheck = 0; +SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck, + 0, "enable extra sanity tests"); +static int ufs_dirhashlowmemcount = 0; +SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_lowmemcount, CTLFLAG_RD, + &ufs_dirhashlowmemcount, 0, "number of times low memory hook called"); +static int ufs_dirhashreclaimpercent = 10; +static int ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_vfs_ufs, OID_AUTO, dirhash_reclaimpercent, + CTLTYPE_INT | CTLFLAG_RW, 0, 0, ufsdirhash_set_reclaimpercent, "I", + "set percentage of dirhash cache to be removed in low VM events"); + + +static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen); +static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff); +static void ufsdirhash_delslot(struct dirhash *dh, int slot); +static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, + doff_t offset); +static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset); +static int ufsdirhash_recycle(int wanted); +static void ufsdirhash_lowmem(void); +static void ufsdirhash_free_locked(struct inode *ip); + +static uma_zone_t ufsdirhash_zone; + +#define DIRHASHLIST_LOCK() mtx_lock(&ufsdirhash_mtx) +#define DIRHASHLIST_UNLOCK() mtx_unlock(&ufsdirhash_mtx) +#define DIRHASH_BLKALLOC_WAITOK() uma_zalloc(ufsdirhash_zone, M_WAITOK) +#define DIRHASH_BLKFREE(ptr) uma_zfree(ufsdirhash_zone, (ptr)) +#define DIRHASH_ASSERT_LOCKED(dh) \ + sx_assert(&(dh)->dh_lock, SA_LOCKED) + +/* Dirhash list; recently-used entries are near the tail. */ +static TAILQ_HEAD(, dirhash) ufsdirhash_list; + +/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */ +static struct mtx ufsdirhash_mtx; + +/* + * Locking: + * + * The relationship between inode and dirhash is protected either by an + * exclusive vnode lock or the vnode interlock where a shared vnode lock + * may be used. The dirhash_mtx is acquired after the dirhash lock. To + * handle teardown races, code wishing to lock the dirhash for an inode + * when using a shared vnode lock must obtain a private reference on the + * dirhash while holding the vnode interlock. They can drop it once they + * have obtained the dirhash lock and verified that the dirhash wasn't + * recycled while they waited for the dirhash lock. + * + * ufsdirhash_build() acquires a shared lock on the dirhash when it is + * successful. This lock is released after a call to ufsdirhash_lookup(). + * + * Functions requiring exclusive access use ufsdirhash_acquire() which may + * free a dirhash structure that was recycled by ufsdirhash_recycle(). + * + * The dirhash lock may be held across io operations. + * + * WITNESS reports a lock order reversal between the "bufwait" lock + * and the "dirhash" lock. However, this specific reversal will not + * cause a deadlock. 
To get a deadlock, one would have to lock a + * buffer followed by the dirhash while a second thread locked a + * buffer while holding the dirhash lock. The second order can happen + * under a shared or exclusive vnode lock for the associated directory + * in lookup(). The first order, however, can only happen under an + * exclusive vnode lock (e.g. unlink(), rename(), etc.). Thus, for + * a thread to be doing a "bufwait" -> "dirhash" order, it has to hold + * an exclusive vnode lock. That exclusive vnode lock will prevent + * any other threads from doing a "dirhash" -> "bufwait" order. + */ + +static void +ufsdirhash_hold(struct dirhash *dh) +{ + + refcount_acquire(&dh->dh_refcount); +} + +static void +ufsdirhash_drop(struct dirhash *dh) +{ + + if (refcount_release(&dh->dh_refcount)) { + sx_destroy(&dh->dh_lock); + free(dh, M_DIRHASH); + } +} + +/* + * Release the lock on a dirhash. + */ +static void +ufsdirhash_release(struct dirhash *dh) +{ + + sx_unlock(&dh->dh_lock); +} + +/* + * Either acquire an existing hash locked shared or create a new hash and + * return it exclusively locked. May return NULL if the allocation fails. + * + * The vnode interlock is used to protect the i_dirhash pointer from + * simultaneous access while only a shared vnode lock is held. + */ +static struct dirhash * +ufsdirhash_create(struct inode *ip) +{ + struct dirhash *ndh; + struct dirhash *dh; + struct vnode *vp; + bool excl; + + ndh = dh = NULL; + vp = ip->i_vnode; + excl = false; + for (;;) { + /* Racy check for i_dirhash to prefetch a dirhash structure. */ + if (ip->i_dirhash == NULL && ndh == NULL) { + ndh = malloc(sizeof *dh, M_DIRHASH, + M_NOWAIT | M_ZERO); + if (ndh == NULL) + return (NULL); + refcount_init(&ndh->dh_refcount, 1); + + /* + * The DUPOK is to prevent warnings from the + * sx_slock() a few lines down which is safe + * since the duplicate lock in that case is + * the one for this dirhash we are creating + * now which has no external references until + * after this function returns. + */ + sx_init_flags(&ndh->dh_lock, "dirhash", SX_DUPOK); + sx_xlock(&ndh->dh_lock); + } + /* + * Check i_dirhash. If it's NULL just try to use a + * preallocated structure. If none exists loop and try again. + */ + VI_LOCK(vp); + dh = ip->i_dirhash; + if (dh == NULL) { + ip->i_dirhash = ndh; + VI_UNLOCK(vp); + if (ndh == NULL) + continue; + return (ndh); + } + ufsdirhash_hold(dh); + VI_UNLOCK(vp); + + /* Acquire a lock on existing hashes. */ + if (excl) + sx_xlock(&dh->dh_lock); + else + sx_slock(&dh->dh_lock); + + /* The hash could've been recycled while we were waiting. */ + VI_LOCK(vp); + if (ip->i_dirhash != dh) { + VI_UNLOCK(vp); + ufsdirhash_release(dh); + ufsdirhash_drop(dh); + continue; + } + VI_UNLOCK(vp); + ufsdirhash_drop(dh); + + /* If the hash is still valid we've succeeded. */ + if (dh->dh_hash != NULL) + break; + /* + * If the hash is NULL it has been recycled. Try to upgrade + * so we can recreate it. If we fail the upgrade, drop our + * lock and try again. + */ + if (excl || sx_try_upgrade(&dh->dh_lock)) + break; + sx_sunlock(&dh->dh_lock); + excl = true; + } + /* Free the preallocated structure if it was not necessary. */ + if (ndh) { + ufsdirhash_release(ndh); + ufsdirhash_drop(ndh); + } + return (dh); +} + +/* + * Acquire an exclusive lock on an existing hash. Requires an exclusive + * vnode lock to protect the i_dirhash pointer. hashes that have been + * recycled are reclaimed here and NULL is returned. 
+ */ +static struct dirhash * +ufsdirhash_acquire(struct inode *ip) +{ + struct dirhash *dh; + + ASSERT_VOP_ELOCKED(ip->i_vnode, __FUNCTION__); + + dh = ip->i_dirhash; + if (dh == NULL) + return (NULL); + sx_xlock(&dh->dh_lock); + if (dh->dh_hash != NULL) + return (dh); + ufsdirhash_free_locked(ip); + return (NULL); +} + +/* + * Acquire exclusively and free the hash pointed to by ip. Works with a + * shared or exclusive vnode lock. + */ +void +ufsdirhash_free(struct inode *ip) +{ + struct dirhash *dh; + struct vnode *vp; + + vp = ip->i_vnode; + for (;;) { + /* Grab a reference on this inode's dirhash if it has one. */ + VI_LOCK(vp); + dh = ip->i_dirhash; + if (dh == NULL) { + VI_UNLOCK(vp); + return; + } + ufsdirhash_hold(dh); + VI_UNLOCK(vp); + + /* Exclusively lock the dirhash. */ + sx_xlock(&dh->dh_lock); + + /* If this dirhash still belongs to this inode, then free it. */ + VI_LOCK(vp); + if (ip->i_dirhash == dh) { + VI_UNLOCK(vp); + ufsdirhash_drop(dh); + break; + } + VI_UNLOCK(vp); + + /* + * This inode's dirhash has changed while we were + * waiting for the dirhash lock, so try again. + */ + ufsdirhash_release(dh); + ufsdirhash_drop(dh); + } + ufsdirhash_free_locked(ip); +} + +/* + * Attempt to build up a hash table for the directory contents in + * inode 'ip'. Returns 0 on success, or -1 of the operation failed. + */ +int +ufsdirhash_build(struct inode *ip) +{ + struct dirhash *dh; + struct buf *bp = NULL; + struct direct *ep; + struct vnode *vp; + doff_t bmask, pos; + int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot; + + /* Take care of a decreased sysctl value. */ + while (ufs_dirhashmem > ufs_dirhashmaxmem) { + if (ufsdirhash_recycle(0) != 0) + return (-1); + /* Recycled enough memory, so unlock the list. */ + DIRHASHLIST_UNLOCK(); + } + + /* Check if we can/should use dirhash. */ + if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode) || + ip->i_effnlink == 0) { + if (ip->i_dirhash) + ufsdirhash_free(ip); + return (-1); + } + dh = ufsdirhash_create(ip); + if (dh == NULL) + return (-1); + if (dh->dh_hash != NULL) + return (0); + + vp = ip->i_vnode; + /* Allocate 50% more entries than this dir size could ever need. */ + KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size")); + nslots = ip->i_size / DIRECTSIZ(1); + nslots = (nslots * 3 + 1) / 2; + narrays = howmany(nslots, DH_NBLKOFF); + nslots = narrays * DH_NBLKOFF; + dirblocks = howmany(ip->i_size, DIRBLKSIZ); + nblocks = (dirblocks * 3 + 1) / 2; + memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) + + narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) + + nblocks * sizeof(*dh->dh_blkfree); + DIRHASHLIST_LOCK(); + if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) { + DIRHASHLIST_UNLOCK(); + if (memreqd > ufs_dirhashmaxmem / 2) + goto fail; + /* Try to free some space. */ + if (ufsdirhash_recycle(memreqd) != 0) + goto fail; + /* Enough was freed, and list has been locked. */ + } + ufs_dirhashmem += memreqd; + DIRHASHLIST_UNLOCK(); + + /* Initialise the hash table and block statistics. */ + dh->dh_memreq = memreqd; + dh->dh_narrays = narrays; + dh->dh_hlen = nslots; + dh->dh_nblk = nblocks; + dh->dh_dirblks = dirblocks; + for (i = 0; i < DH_NFSTATS; i++) + dh->dh_firstfree[i] = -1; + dh->dh_firstfree[DH_NFSTATS] = 0; + dh->dh_hused = 0; + dh->dh_seqoff = -1; + dh->dh_score = DH_SCOREINIT; + dh->dh_lastused = time_second; + + /* + * Use non-blocking mallocs so that we will revert to a linear + * lookup on failure rather than potentially blocking forever. 
+ */ + dh->dh_hash = malloc(narrays * sizeof(dh->dh_hash[0]), + M_DIRHASH, M_NOWAIT | M_ZERO); + if (dh->dh_hash == NULL) + goto fail; + dh->dh_blkfree = malloc(nblocks * sizeof(dh->dh_blkfree[0]), + M_DIRHASH, M_NOWAIT); + if (dh->dh_blkfree == NULL) + goto fail; + for (i = 0; i < narrays; i++) { + if ((dh->dh_hash[i] = DIRHASH_BLKALLOC_WAITOK()) == NULL) + goto fail; + for (j = 0; j < DH_NBLKOFF; j++) + dh->dh_hash[i][j] = DIRHASH_EMPTY; + } + for (i = 0; i < dirblocks; i++) + dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN; + bmask = vp->v_mount->mnt_stat.f_iosize - 1; + pos = 0; + while (pos < ip->i_size) { + /* If necessary, get the next directory block. */ + if ((pos & bmask) == 0) { + if (bp != NULL) + brelse(bp); + if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0) + goto fail; + } + + /* Add this entry to the hash. */ + ep = (struct direct *)((char *)bp->b_data + (pos & bmask)); + if (ep->d_reclen == 0 || ep->d_reclen > + DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) { + /* Corrupted directory. */ + brelse(bp); + goto fail; + } + if (ep->d_ino != 0) { + /* Add the entry (simplified ufsdirhash_add). */ + slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen); + while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + dh->dh_hused++; + DH_ENTRY(dh, slot) = pos; + ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep)); + } + pos += ep->d_reclen; + } + + if (bp != NULL) + brelse(bp); + DIRHASHLIST_LOCK(); + TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 1; + DIRHASHLIST_UNLOCK(); + sx_downgrade(&dh->dh_lock); + return (0); + +fail: + ufsdirhash_free_locked(ip); + return (-1); +} + +/* + * Free any hash table associated with inode 'ip'. + */ +static void +ufsdirhash_free_locked(struct inode *ip) +{ + struct dirhash *dh; + struct vnode *vp; + int i; + + DIRHASH_ASSERT_LOCKED(ip->i_dirhash); + + /* + * Clear the pointer in the inode to prevent new threads from + * finding the dead structure. + */ + vp = ip->i_vnode; + VI_LOCK(vp); + dh = ip->i_dirhash; + ip->i_dirhash = NULL; + VI_UNLOCK(vp); + + /* + * Remove the hash from the list since we are going to free its + * memory. + */ + DIRHASHLIST_LOCK(); + if (dh->dh_onlist) + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + ufs_dirhashmem -= dh->dh_memreq; + DIRHASHLIST_UNLOCK(); + + /* + * At this point, any waiters for the lock should hold their + * own reference on the dirhash structure. They will drop + * that reference once they grab the vnode interlock and see + * that ip->i_dirhash is NULL. + */ + sx_xunlock(&dh->dh_lock); + + /* + * Handle partially recycled as well as fully constructed hashes. + */ + if (dh->dh_hash != NULL) { + for (i = 0; i < dh->dh_narrays; i++) + if (dh->dh_hash[i] != NULL) + DIRHASH_BLKFREE(dh->dh_hash[i]); + free(dh->dh_hash, M_DIRHASH); + if (dh->dh_blkfree != NULL) + free(dh->dh_blkfree, M_DIRHASH); + } + + /* + * Drop the inode's reference to the data structure. + */ + ufsdirhash_drop(dh); +} + +/* + * Find the offset of the specified name within the given inode. + * Returns 0 on success, ENOENT if the entry does not exist, or + * EJUSTRETURN if the caller should revert to a linear search. + * + * If successful, the directory offset is stored in *offp, and a + * pointer to a struct buf containing the entry is stored in *bpp. If + * prevoffp is non-NULL, the offset of the previous entry within + * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry + * is the first in a block, the start of the block is used). + * + * Must be called with the hash locked. 
Returns with the hash unlocked. + */ +int +ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp, + struct buf **bpp, doff_t *prevoffp) +{ + struct dirhash *dh, *dh_next; + struct direct *dp; + struct vnode *vp; + struct buf *bp; + doff_t blkoff, bmask, offset, prevoff, seqoff; + int i, slot; + int error; + + dh = ip->i_dirhash; + KASSERT(dh != NULL && dh->dh_hash != NULL, + ("ufsdirhash_lookup: Invalid dirhash %p\n", dh)); + DIRHASH_ASSERT_LOCKED(dh); + /* + * Move this dirhash towards the end of the list if it has a + * score higher than the next entry, and acquire the dh_lock. + */ + DIRHASHLIST_LOCK(); + if (TAILQ_NEXT(dh, dh_list) != NULL) { + /* + * If the new score will be greater than that of the next + * entry, then move this entry past it. With both mutexes + * held, dh_next won't go away, but its dh_score could + * change; that's not important since it is just a hint. + */ + if ((dh_next = TAILQ_NEXT(dh, dh_list)) != NULL && + dh->dh_score >= dh_next->dh_score) { + KASSERT(dh->dh_onlist, ("dirhash: not on list")); + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh, + dh_list); + } + } + /* Update the score. */ + if (dh->dh_score < DH_SCOREMAX) + dh->dh_score++; + + /* Update last used time. */ + dh->dh_lastused = time_second; + DIRHASHLIST_UNLOCK(); + + vp = ip->i_vnode; + bmask = vp->v_mount->mnt_stat.f_iosize - 1; + blkoff = -1; + bp = NULL; + seqoff = dh->dh_seqoff; +restart: + slot = ufsdirhash_hash(dh, name, namelen); + + if (seqoff != -1) { + /* + * Sequential access optimisation. seqoff contains the + * offset of the directory entry immediately following + * the last entry that was looked up. Check if this offset + * appears in the hash chain for the name we are looking for. + */ + for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY; + i = WRAPINCR(i, dh->dh_hlen)) + if (offset == seqoff) + break; + if (offset == seqoff) { + /* + * We found an entry with the expected offset. This + * is probably the entry we want, but if not, the + * code below will retry. + */ + slot = i; + } else + seqoff = -1; + } + + for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; + slot = WRAPINCR(slot, dh->dh_hlen)) { + if (offset == DIRHASH_DEL) + continue; + if (offset < 0 || offset >= ip->i_size) + panic("ufsdirhash_lookup: bad offset in hash array"); + if ((offset & ~bmask) != blkoff) { + if (bp != NULL) + brelse(bp); + blkoff = offset & ~bmask; + if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0) { + error = EJUSTRETURN; + goto fail; + } + } + KASSERT(bp != NULL, ("no buffer allocated")); + dp = (struct direct *)(bp->b_data + (offset & bmask)); + if (dp->d_reclen == 0 || dp->d_reclen > + DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) { + /* Corrupted directory. */ + error = EJUSTRETURN; + goto fail; + } + if (dp->d_namlen == namelen && + bcmp(dp->d_name, name, namelen) == 0) { + /* Found. Get the prev offset if needed. */ + if (prevoffp != NULL) { + if (offset & (DIRBLKSIZ - 1)) { + prevoff = ufsdirhash_getprev(dp, + offset); + if (prevoff == -1) { + error = EJUSTRETURN; + goto fail; + } + } else + prevoff = offset; + *prevoffp = prevoff; + } + + /* Update offset. */ + dh->dh_seqoff = offset + DIRSIZ(0, dp); + *bpp = bp; + *offp = offset; + ufsdirhash_release(dh); + return (0); + } + + /* + * When the name doesn't match in the sequential + * optimization case, go back and search normally. 
+ */ + if (seqoff != -1) { + seqoff = -1; + goto restart; + } + } + error = ENOENT; +fail: + ufsdirhash_release(dh); + if (bp != NULL) + brelse(bp); + return (error); +} + +/* + * Find a directory block with room for 'slotneeded' bytes. Returns + * the offset of the directory entry that begins the free space. + * This will either be the offset of an existing entry that has free + * space at the end, or the offset of an entry with d_ino == 0 at + * the start of a DIRBLKSIZ block. + * + * To use the space, the caller may need to compact existing entries in + * the directory. The total number of bytes in all of the entries involved + * in the compaction is stored in *slotsize. In other words, all of + * the entries that must be compacted are exactly contained in the + * region beginning at the returned offset and spanning *slotsize bytes. + * + * Returns -1 if no space was found, indicating that the directory + * must be extended. + */ +doff_t +ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize) +{ + struct direct *dp; + struct dirhash *dh; + struct buf *bp; + doff_t pos, slotstart; + int dirblock, error, freebytes, i; + + dh = ip->i_dirhash; + KASSERT(dh != NULL && dh->dh_hash != NULL, + ("ufsdirhash_findfree: Invalid dirhash %p\n", dh)); + DIRHASH_ASSERT_LOCKED(dh); + + /* Find a directory block with the desired free space. */ + dirblock = -1; + for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++) + if ((dirblock = dh->dh_firstfree[i]) != -1) + break; + if (dirblock == -1) + return (-1); + + KASSERT(dirblock < dh->dh_nblk && + dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN), + ("ufsdirhash_findfree: bad stats")); + pos = dirblock * DIRBLKSIZ; + error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp); + if (error) + return (-1); + + /* Find the first entry with free space. */ + for (i = 0; i < DIRBLKSIZ; ) { + if (dp->d_reclen == 0) { + brelse(bp); + return (-1); + } + if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp)) + break; + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > DIRBLKSIZ) { + brelse(bp); + return (-1); + } + slotstart = pos + i; + + /* Find the range of entries needed to get enough space */ + freebytes = 0; + while (i < DIRBLKSIZ && freebytes < slotneeded) { + freebytes += dp->d_reclen; + if (dp->d_ino != 0) + freebytes -= DIRSIZ(0, dp); + if (dp->d_reclen == 0) { + brelse(bp); + return (-1); + } + i += dp->d_reclen; + dp = (struct direct *)((char *)dp + dp->d_reclen); + } + if (i > DIRBLKSIZ) { + brelse(bp); + return (-1); + } + if (freebytes < slotneeded) + panic("ufsdirhash_findfree: free mismatch"); + brelse(bp); + *slotsize = pos + i - slotstart; + return (slotstart); +} + +/* + * Return the start of the unused space at the end of a directory, or + * -1 if there are no trailing unused blocks. + */ +doff_t +ufsdirhash_enduseful(struct inode *ip) +{ + + struct dirhash *dh; + int i; + + dh = ip->i_dirhash; + DIRHASH_ASSERT_LOCKED(dh); + KASSERT(dh != NULL && dh->dh_hash != NULL, + ("ufsdirhash_enduseful: Invalid dirhash %p\n", dh)); + + if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) + return (-1); + + for (i = dh->dh_dirblks - 1; i >= 0; i--) + if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) + break; + + return ((doff_t)(i + 1) * DIRBLKSIZ); +} + +/* + * Insert information into the hash about a new directory entry. dirp + * points to a struct direct containing the entry, and offset specifies + * the offset of this entry. 
+ */ +void +ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, + ("ufsdirhash_add: bad offset")); + /* + * Normal hash usage is < 66%. If the usage gets too high then + * remove the hash entirely and let it be rebuilt later. + */ + if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) { + ufsdirhash_free_locked(ip); + return; + } + + /* Find a free hash slot (empty or deleted), and add the entry. */ + slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen); + while (DH_ENTRY(dh, slot) >= 0) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY) + dh->dh_hused++; + DH_ENTRY(dh, slot) = offset; + + /* Update last used time. */ + dh->dh_lastused = time_second; + + /* Update the per-block summary info. */ + ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp)); + ufsdirhash_release(dh); +} + +/* + * Remove the specified directory entry from the hash. The entry to remove + * is defined by the name in `dirp', which must exist at the specified + * `offset' within the directory. + */ +void +ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset) +{ + struct dirhash *dh; + int slot; + + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ, + ("ufsdirhash_remove: bad offset")); + /* Find the entry */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset); + + /* Remove the hash entry. */ + ufsdirhash_delslot(dh, slot); + + /* Update the per-block summary info. */ + ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp)); + ufsdirhash_release(dh); +} + +/* + * Change the offset associated with a directory entry in the hash. Used + * when compacting directory blocks. + */ +void +ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff, + doff_t newoff) +{ + struct dirhash *dh; + int slot; + + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ && + newoff < dh->dh_dirblks * DIRBLKSIZ, + ("ufsdirhash_move: bad offset")); + /* Find the entry, and update the offset. */ + slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff); + DH_ENTRY(dh, slot) = newoff; + ufsdirhash_release(dh); +} + +/* + * Inform dirhash that the directory has grown by one block that + * begins at offset (i.e. the new length is offset + DIRBLKSIZ). + */ +void +ufsdirhash_newblk(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block; + + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ, + ("ufsdirhash_newblk: bad offset")); + block = offset / DIRBLKSIZ; + if (block >= dh->dh_nblk) { + /* Out of space; must rebuild. */ + ufsdirhash_free_locked(ip); + return; + } + dh->dh_dirblks = block + 1; + + /* Account for the new free block. */ + dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN; + if (dh->dh_firstfree[DH_NFSTATS] == -1) + dh->dh_firstfree[DH_NFSTATS] = block; + ufsdirhash_release(dh); +} + +/* + * Inform dirhash that the directory is being truncated. 
+ */ +void +ufsdirhash_dirtrunc(struct inode *ip, doff_t offset) +{ + struct dirhash *dh; + int block, i; + + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ, + ("ufsdirhash_dirtrunc: bad offset")); + block = howmany(offset, DIRBLKSIZ); + /* + * If the directory shrinks to less than 1/8 of dh_nblk blocks + * (about 20% of its original size due to the 50% extra added in + * ufsdirhash_build) then free it, and let the caller rebuild + * if necessary. + */ + if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) { + ufsdirhash_free_locked(ip); + return; + } + + /* + * Remove any `first free' information pertaining to the + * truncated blocks. All blocks we're removing should be + * completely unused. + */ + if (dh->dh_firstfree[DH_NFSTATS] >= block) + dh->dh_firstfree[DH_NFSTATS] = -1; + for (i = block; i < dh->dh_dirblks; i++) + if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN) + panic("ufsdirhash_dirtrunc: blocks in use"); + for (i = 0; i < DH_NFSTATS; i++) + if (dh->dh_firstfree[i] >= block) + panic("ufsdirhash_dirtrunc: first free corrupt"); + dh->dh_dirblks = block; + ufsdirhash_release(dh); +} + +/* + * Debugging function to check that the dirhash information about + * a directory block matches its actual contents. Panics if a mismatch + * is detected. + * + * On entry, `buf' should point to the start of an in-core + * DIRBLKSIZ-sized directory block, and `offset' should contain the + * offset from the start of the directory of that block. + */ +void +ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset) +{ + struct dirhash *dh; + struct direct *dp; + int block, ffslot, i, nfree; + + if (!ufs_dirhashcheck) + return; + if ((dh = ufsdirhash_acquire(ip)) == NULL) + return; + + block = offset / DIRBLKSIZ; + if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks) + panic("ufsdirhash_checkblock: bad offset"); + + nfree = 0; + for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) { + dp = (struct direct *)(buf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ) + panic("ufsdirhash_checkblock: bad dir"); + + if (dp->d_ino == 0) { +#if 0 + /* + * XXX entries with d_ino == 0 should only occur + * at the start of a DIRBLKSIZ block. However the + * ufs code is tolerant of such entries at other + * offsets, and fsck does not fix them. + */ + if (i != 0) + panic("ufsdirhash_checkblock: bad dir inode"); +#endif + nfree += dp->d_reclen; + continue; + } + + /* Check that the entry exists (will panic if it doesn't). */ + ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i); + + nfree += dp->d_reclen - DIRSIZ(0, dp); + } + if (i != DIRBLKSIZ) + panic("ufsdirhash_checkblock: bad dir end"); + + if (dh->dh_blkfree[block] * DIRALIGN != nfree) + panic("ufsdirhash_checkblock: bad free count"); + + ffslot = BLKFREE2IDX(nfree / DIRALIGN); + for (i = 0; i <= DH_NFSTATS; i++) + if (dh->dh_firstfree[i] == block && i != ffslot) + panic("ufsdirhash_checkblock: bad first-free"); + if (dh->dh_firstfree[ffslot] == -1) + panic("ufsdirhash_checkblock: missing first-free entry"); + ufsdirhash_release(dh); +} + +/* + * Hash the specified filename into a dirhash slot. + */ +static int +ufsdirhash_hash(struct dirhash *dh, char *name, int namelen) +{ + u_int32_t hash; + + /* + * We hash the name and then some other bit of data that is + * invariant over the dirhash's lifetime. Otherwise names + * differing only in the last byte are placed close to one + * another in the table, which is bad for linear probing. 
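+ * The invariant data used here is the address of the dirhash structure
+ * itself, folded in by the second fnv_32_buf() call below.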
+ */ + hash = fnv_32_buf(name, namelen, FNV1_32_INIT); + hash = fnv_32_buf(&dh, sizeof(dh), hash); + return (hash % dh->dh_hlen); +} + +/* + * Adjust the number of free bytes in the block containing `offset' + * by the value specified by `diff'. + * + * The caller must ensure we have exclusive access to `dh'; normally + * that means that dh_lock should be held, but this is also called + * from ufsdirhash_build() where exclusive access can be assumed. + */ +static void +ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff) +{ + int block, i, nfidx, ofidx; + + /* Update the per-block summary info. */ + block = offset / DIRBLKSIZ; + KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks, + ("dirhash bad offset")); + ofidx = BLKFREE2IDX(dh->dh_blkfree[block]); + dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN); + nfidx = BLKFREE2IDX(dh->dh_blkfree[block]); + + /* Update the `first free' list if necessary. */ + if (ofidx != nfidx) { + /* If removing, scan forward for the next block. */ + if (dh->dh_firstfree[ofidx] == block) { + for (i = block + 1; i < dh->dh_dirblks; i++) + if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx) + break; + dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1; + } + + /* Make this the new `first free' if necessary */ + if (dh->dh_firstfree[nfidx] > block || + dh->dh_firstfree[nfidx] == -1) + dh->dh_firstfree[nfidx] = block; + } +} + +/* + * Find the specified name which should have the specified offset. + * Returns a slot number, and panics on failure. + * + * `dh' must be locked on entry and remains so on return. + */ +static int +ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset) +{ + int slot; + + DIRHASH_ASSERT_LOCKED(dh); + + /* Find the entry. */ + KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full")); + slot = ufsdirhash_hash(dh, name, namelen); + while (DH_ENTRY(dh, slot) != offset && + DH_ENTRY(dh, slot) != DIRHASH_EMPTY) + slot = WRAPINCR(slot, dh->dh_hlen); + if (DH_ENTRY(dh, slot) != offset) + panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); + + return (slot); +} + +/* + * Remove the entry corresponding to the specified slot from the hash array. + * + * `dh' must be locked on entry and remains so on return. + */ +static void +ufsdirhash_delslot(struct dirhash *dh, int slot) +{ + int i; + + DIRHASH_ASSERT_LOCKED(dh); + + /* Mark the entry as deleted. */ + DH_ENTRY(dh, slot) = DIRHASH_DEL; + + /* If this is the end of a chain of DIRHASH_DEL slots, remove them. */ + for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; ) + i = WRAPINCR(i, dh->dh_hlen); + if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) { + i = WRAPDECR(i, dh->dh_hlen); + while (DH_ENTRY(dh, i) == DIRHASH_DEL) { + DH_ENTRY(dh, i) = DIRHASH_EMPTY; + dh->dh_hused--; + i = WRAPDECR(i, dh->dh_hlen); + } + KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen")); + } +} + +/* + * Given a directory entry and its offset, find the offset of the + * previous entry in the same DIRBLKSIZ-sized block. Returns an + * offset, or -1 if there is no previous entry in the block or some + * other problem occurred. + */ +static doff_t +ufsdirhash_getprev(struct direct *dirp, doff_t offset) +{ + struct direct *dp; + char *blkbuf; + doff_t blkoff, prevoff; + int entrypos, i; + + blkoff = rounddown2(offset, DIRBLKSIZ); /* offset of start of block */ + entrypos = offset & (DIRBLKSIZ - 1); /* entry relative to block */ + blkbuf = (char *)dirp - entrypos; + prevoff = blkoff; + + /* If `offset' is the start of a block, there is no previous entry. 
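+    Otherwise the block is scanned from its start, following d_reclen
+    links, until the entry just before `offset' is reached; with made-up
+    sizes, entries at block-relative offsets 0, 12 and 36 mean getprev()
+    of the entry at 36 walks 0 -> 12 and returns blkoff + 12.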
*/ + if (entrypos == 0) + return (-1); + + /* Scan from the start of the block until we get to the entry. */ + for (i = 0; i < entrypos; i += dp->d_reclen) { + dp = (struct direct *)(blkbuf + i); + if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos) + return (-1); /* Corrupted directory. */ + prevoff = blkoff + i; + } + return (prevoff); +} + +/* + * Delete the given dirhash and reclaim its memory. Assumes that + * ufsdirhash_list is locked, and leaves it locked. Also assumes + * that dh is locked. Returns the amount of memory freed. + */ +static int +ufsdirhash_destroy(struct dirhash *dh) +{ + doff_t **hash; + u_int8_t *blkfree; + int i, mem, narrays; + + KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list")); + + /* Remove it from the list and detach its memory. */ + TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list); + dh->dh_onlist = 0; + hash = dh->dh_hash; + dh->dh_hash = NULL; + blkfree = dh->dh_blkfree; + dh->dh_blkfree = NULL; + narrays = dh->dh_narrays; + mem = dh->dh_memreq; + dh->dh_memreq = 0; + + /* Unlock dirhash and free the detached memory. */ + ufsdirhash_release(dh); + for (i = 0; i < narrays; i++) + DIRHASH_BLKFREE(hash[i]); + free(hash, M_DIRHASH); + free(blkfree, M_DIRHASH); + + /* Account for the returned memory. */ + ufs_dirhashmem -= mem; + + return (mem); +} + +/* + * Try to free up `wanted' bytes by stealing memory from existing + * dirhashes. Returns zero with list locked if successful. + */ +static int +ufsdirhash_recycle(int wanted) +{ + struct dirhash *dh; + + DIRHASHLIST_LOCK(); + dh = TAILQ_FIRST(&ufsdirhash_list); + while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) { + /* Decrement the score; only recycle if it becomes zero. */ + if (dh == NULL || --dh->dh_score > 0) { + DIRHASHLIST_UNLOCK(); + return (-1); + } + /* + * If we can't lock it, it's in use and we don't want to + * recycle it anyway. + */ + if (!sx_try_xlock(&dh->dh_lock)) { + dh = TAILQ_NEXT(dh, dh_list); + continue; + } + + ufsdirhash_destroy(dh); + + /* Repeat if necessary. */ + dh = TAILQ_FIRST(&ufsdirhash_list); + } + /* Success; return with list locked. */ + return (0); +} + +/* + * Callback that frees some dirhashes when the system is low on virtual memory. + */ +static void +ufsdirhash_lowmem() +{ + struct dirhash *dh, *dh_temp; + int memfreed, memwanted; + + ufs_dirhashlowmemcount++; + memfreed = 0; + memwanted = ufs_dirhashmem * ufs_dirhashreclaimpercent / 100; + + DIRHASHLIST_LOCK(); + + /* + * Reclaim up to memwanted from the oldest dirhashes. This will allow + * us to make some progress when the system is running out of memory + * without compromising the dynamic nature of the maximum age. If the + * situation does not improve, lowmem will eventually be retriggered + * and free some other entry in the cache. The entries on the head of + * the list should be the oldest. If during list traversal we can't + * get a lock on the dirhash, it will be skipped.
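+ * For example, with 2 MB of dirhash memory in use and
+ * ufs_dirhashreclaimpercent set to 10, roughly 200 kB would be released
+ * here, oldest dirhashes first.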
+ */ + TAILQ_FOREACH_SAFE(dh, &ufsdirhash_list, dh_list, dh_temp) { + if (sx_try_xlock(&dh->dh_lock)) + memfreed += ufsdirhash_destroy(dh); + if (memfreed >= memwanted) + break; + } + DIRHASHLIST_UNLOCK(); +} + +static int +ufsdirhash_set_reclaimpercent(SYSCTL_HANDLER_ARGS) +{ + int error, v; + + v = ufs_dirhashreclaimpercent; + error = sysctl_handle_int(oidp, &v, v, req); + if (error) + return (error); + if (req->newptr == NULL) + return (error); + if (v == ufs_dirhashreclaimpercent) + return (0); + + /* Refuse invalid percentages */ + if (v < 0 || v > 100) + return (EINVAL); + ufs_dirhashreclaimpercent = v; + return (0); +} + +void +ufsdirhash_init() +{ + ufs_dirhashmaxmem = lmax(roundup(hibufspace / 64, PAGE_SIZE), + 2 * 1024 * 1024); + + ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF); + TAILQ_INIT(&ufsdirhash_list); + + /* Register a callback function to handle low memory signals */ + EVENTHANDLER_REGISTER(vm_lowmem, ufsdirhash_lowmem, NULL, + EVENTHANDLER_PRI_FIRST); +} + +void +ufsdirhash_uninit() +{ + KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit")); + uma_zdestroy(ufsdirhash_zone); + mtx_destroy(&ufsdirhash_mtx); +} + +#endif /* UFS_DIRHASH */ diff --git a/Dump/ufs/ufs/ufs_extattr.c b/Dump/ufs/ufs/ufs_extattr.c new file mode 100644 index 0000000..bb3bcca --- /dev/null +++ b/Dump/ufs/ufs/ufs_extattr.c @@ -0,0 +1,1300 @@ +/*- + * Copyright (c) 1999-2002 Robert N. M. Watson + * Copyright (c) 2002-2003 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * This software was developed for the FreeBSD Project in part by Network + * Associates Laboratories, the Security Research Division of Network + * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), + * as part of the DARPA CHATS research program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Support for filesystem extended attribute: UFS-specific support functions. 
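+ *
+ * In this UFS1-era scheme each enabled attribute is backed by a regular
+ * file (see ufs_extattr_enable() below): the backing file starts with a
+ * struct ufs_extattr_fileheader and then holds one fixed-size slot per
+ * inode number, each slot being a struct ufs_extattr_header followed by
+ * up to uef_size bytes of attribute data (see ufs_extattr_get() and
+ * ufs_extattr_set() below).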
+ */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_extattr.c 298463 2016-04-22 08:09:27Z ngie $"); + +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef UFS_EXTATTR + +FEATURE(ufs_extattr, "ufs extended attribute support"); + +static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); + +static int ufs_extattr_sync = 0; +SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, + 0, ""); + +static int ufs_extattr_valid_attrname(int attrnamespace, + const char *attrname); +static int ufs_extattr_enable_with_open(struct ufsmount *ump, + struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td); +static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, + struct thread *td); +static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct thread *td); +static int ufs_extattr_get(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, size_t *size, + struct ucred *cred, struct thread *td); +static int ufs_extattr_set(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, struct ucred *cred, + struct thread *td); +static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, + const char *name, struct ucred *cred, struct thread *td); +#ifdef UFS_EXTATTR_AUTOSTART +static int ufs_extattr_autostart_locked(struct mount *mp, + struct thread *td); +#endif +static int ufs_extattr_start_locked(struct ufsmount *ump, + struct thread *td); + +/* + * Per-FS attribute lock protecting attribute operations. + * + * XXXRW: Perhaps something more fine-grained would be appropriate, but at + * the end of the day we're going to contend on the vnode lock for the + * backing file anyway. + */ +static void +ufs_extattr_uepm_lock(struct ufsmount *ump) +{ + + sx_xlock(&ump->um_extattr.uepm_lock); +} + +static void +ufs_extattr_uepm_unlock(struct ufsmount *ump) +{ + + sx_xunlock(&ump->um_extattr.uepm_lock); +} + +/*- + * Determine whether the name passed is a valid name for an actual + * attribute. + * + * Invalid currently consists of: + * NULL pointer for attrname + * zero-length attrname (used to retrieve application attribute list) + */ +static int +ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) +{ + + if (attrname == NULL) + return (0); + if (strlen(attrname) == 0) + return (0); + return (1); +} + +/* + * Locate an attribute given a name and mountpoint. + * Must be holding uepm lock for the mount point. + */ +static struct ufs_extattr_list_entry * +ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, + const char *attrname) +{ + struct ufs_extattr_list_entry *search_attribute; + + sx_assert(&ump->um_extattr.uepm_lock, SA_XLOCKED); + + for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); + search_attribute != NULL; + search_attribute = LIST_NEXT(search_attribute, uele_entries)) { + if (!(strncmp(attrname, search_attribute->uele_attrname, + UFS_EXTATTR_MAXEXTATTRNAME)) && + (attrnamespace == search_attribute->uele_attrnamespace)) { + return (search_attribute); + } + } + + return (0); +} + +/* + * Initialize per-FS structures supporting extended attributes. Do not + * start extended attributes yet. 
+ */ +void +ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm) +{ + + uepm->uepm_flags = 0; + LIST_INIT(&uepm->uepm_list); + sx_init(&uepm->uepm_lock, "ufs_extattr_sx"); + uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED; +} + +/* + * Destroy per-FS structures supporting extended attributes. Assumes + * that EAs have already been stopped, and will panic if not. + */ +void +ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm) +{ + + if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + panic("ufs_extattr_uepm_destroy: not initialized"); + + if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + panic("ufs_extattr_uepm_destroy: called while still started"); + + /* + * It's not clear that either order for the next two lines is + * ideal, and it should never be a problem if this is only called + * during unmount, and with vfs_busy(). + */ + uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED; + sx_destroy(&uepm->uepm_lock); +} + +/* + * Start extended attribute support on an FS. + */ +int +ufs_extattr_start(struct mount *mp, struct thread *td) +{ + struct ufsmount *ump; + int error = 0; + + ump = VFSTOUFS(mp); + + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_start_locked(ump, td); + ufs_extattr_uepm_unlock(ump); + return (error); +} + +static int +ufs_extattr_start_locked(struct ufsmount *ump, struct thread *td) +{ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + return (EOPNOTSUPP); + if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) + return (EBUSY); + + ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED; + ump->um_extattr.uepm_ucred = crhold(td->td_ucred); + return (0); +} + +#ifdef UFS_EXTATTR_AUTOSTART +/* + * Helper routine: given a locked parent directory and filename, return + * the locked vnode of the inode associated with the name. Will not + * follow symlinks, may return any type of vnode. Lock on parent will + * be released even in the event of a failure. In the event that the + * target is the parent (i.e., "."), there will be two references and + * one lock, requiring the caller to possibly special-case. + */ +#define UE_GETDIR_LOCKPARENT 1 +#define UE_GETDIR_LOCKPARENT_DONT 2 +static int +ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, + struct vnode **vp, struct thread *td) +{ + struct vop_cachedlookup_args vargs; + struct componentname cnp; + struct vnode *target_vp; + int error; + + bzero(&cnp, sizeof(cnp)); + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN; + if (lockparent == UE_GETDIR_LOCKPARENT) + cnp.cn_flags |= LOCKPARENT; + cnp.cn_lkflags = LK_EXCLUSIVE; + cnp.cn_thread = td; + cnp.cn_cred = td->td_ucred; + cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + cnp.cn_nameptr = cnp.cn_pnbuf; + error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, + (size_t *) &cnp.cn_namelen); + if (error) { + if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { + VOP_UNLOCK(start_dvp, 0); + } + uma_zfree(namei_zone, cnp.cn_pnbuf); + printf("ufs_extattr_lookup: copystr failed\n"); + return (error); + } + cnp.cn_namelen--; /* trim nul termination */ + vargs.a_gen.a_desc = NULL; + vargs.a_dvp = start_dvp; + vargs.a_vpp = &target_vp; + vargs.a_cnp = &cnp; + error = ufs_lookup(&vargs); + uma_zfree(namei_zone, cnp.cn_pnbuf); + if (error) { + /* + * Error condition, may have to release the lock on the parent + * if ufs_lookup() didn't. + */ + if (lockparent == UE_GETDIR_LOCKPARENT_DONT) + VOP_UNLOCK(start_dvp, 0); + + /* + * Check that ufs_lookup() didn't release the lock when we + * didn't want it to. 
+ */ + if (lockparent == UE_GETDIR_LOCKPARENT) + ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); + + return (error); + } +/* + if (target_vp == start_dvp) + panic("ufs_extattr_lookup: target_vp == start_dvp"); +*/ + + if (target_vp != start_dvp && lockparent == UE_GETDIR_LOCKPARENT_DONT) + VOP_UNLOCK(start_dvp, 0); + + if (lockparent == UE_GETDIR_LOCKPARENT) + ASSERT_VOP_LOCKED(start_dvp, "ufs_extattr_lookup"); + + /* printf("ufs_extattr_lookup: success\n"); */ + *vp = target_vp; + return (0); +} +#endif /* !UFS_EXTATTR_AUTOSTART */ + +/* + * Enable an EA using the passed filesystem, backing vnode, attribute name, + * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp + * to be locked when passed in. The vnode will be returned unlocked, + * regardless of success/failure of the function. As a result, the caller + * will always need to vrele(), but not vput(). + */ +static int +ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, + int attrnamespace, const char *attrname, struct thread *td) +{ + int error; + + error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, NULL); + if (error) { + printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " + "with %d\n", error); + VOP_UNLOCK(vp, 0); + return (error); + } + + VOP_ADD_WRITECOUNT(vp, 1); + CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d", __func__, vp, + vp->v_writecount); + + vref(vp); + + VOP_UNLOCK(vp, 0); + + error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); + if (error != 0) + vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + return (error); +} + +#ifdef UFS_EXTATTR_AUTOSTART +/* + * Given a locked directory vnode, iterate over the names in the directory + * and use ufs_extattr_lookup() to retrieve locked vnodes of potential + * attribute files. Then invoke ufs_extattr_enable_with_open() on each + * to attempt to start the attribute. Leaves the directory locked on + * exit. 
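+ *
+ * During autostart this is applied to the "system" and "user"
+ * subdirectories of UFS_EXTATTR_FSROOTSUBDIR (conventionally ".attribute")
+ * at the filesystem root, so -- to pick a purely hypothetical name -- a
+ * backing file .attribute/system/md5 would be enabled as a
+ * system-namespace attribute called "md5".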
+ */ +static int +ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, + int attrnamespace, struct thread *td) +{ + struct vop_readdir_args vargs; + struct dirent *dp, *edp; + struct vnode *attr_vp; + struct uio auio; + struct iovec aiov; + char *dirbuf; + int error, eofflag = 0; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + dirbuf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK); + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + + vargs.a_gen.a_desc = NULL; + vargs.a_vp = dvp; + vargs.a_uio = &auio; + vargs.a_cred = td->td_ucred; + vargs.a_eofflag = &eofflag; + vargs.a_ncookies = NULL; + vargs.a_cookies = NULL; + + while (!eofflag) { + auio.uio_resid = DIRBLKSIZ; + aiov.iov_base = dirbuf; + aiov.iov_len = DIRBLKSIZ; + error = ufs_readdir(&vargs); + if (error) { + printf("ufs_extattr_iterate_directory: ufs_readdir " + "%d\n", error); + return (error); + } + + edp = (struct dirent *)&dirbuf[DIRBLKSIZ - auio.uio_resid]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + if (dp->d_reclen == 0) + break; + error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, + dp->d_name, &attr_vp, td); + if (error) { + printf("ufs_extattr_iterate_directory: lookup " + "%s %d\n", dp->d_name, error); + } else if (attr_vp == dvp) { + vrele(attr_vp); + } else if (attr_vp->v_type != VREG) { + vput(attr_vp); + } else { + error = ufs_extattr_enable_with_open(ump, + attr_vp, attrnamespace, dp->d_name, td); + vrele(attr_vp); + if (error) { + printf("ufs_extattr_iterate_directory: " + "enable %s %d\n", dp->d_name, + error); + } else if (bootverbose) { + printf("UFS autostarted EA %s\n", + dp->d_name); + } + } + dp = (struct dirent *) ((char *)dp + dp->d_reclen); + if (dp >= edp) + break; + } + } + free(dirbuf, M_TEMP); + + return (0); +} + +/* + * Auto-start of extended attributes, to be executed (optionally) at + * mount-time. + */ +int +ufs_extattr_autostart(struct mount *mp, struct thread *td) +{ + struct ufsmount *ump; + int error; + + ump = VFSTOUFS(mp); + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_autostart_locked(mp, td); + ufs_extattr_uepm_unlock(ump); + return (error); +} + +static int +ufs_extattr_autostart_locked(struct mount *mp, struct thread *td) +{ + struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + /* + * UFS_EXTATTR applies only to UFS1, as UFS2 uses native extended + * attributes, so don't autostart. + */ + if (ump->um_fstype != UFS1) + return (0); + + /* + * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? + * If so, automatically start EA's. + */ + error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp); + if (error) { + printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", + error); + return (error); + } + + error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, + UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); + if (error) { + /* rvp ref'd but now unlocked */ + vrele(rvp); + return (error); + } + if (rvp == attr_dvp) { + /* Should never happen. 
*/ + vput(rvp); + vrele(attr_dvp); + return (EINVAL); + } + vrele(rvp); + + if (attr_dvp->v_type != VDIR) { + printf("ufs_extattr_autostart: %s != VDIR\n", + UFS_EXTATTR_FSROOTSUBDIR); + goto return_vput_attr_dvp; + } + + error = ufs_extattr_start_locked(ump, td); + if (error) { + printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", + error); + goto return_vput_attr_dvp; + } + + /* + * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, + * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, + * and start with appropriate type. Failures in either don't + * result in an over-all failure. attr_dvp is left locked to + * be cleaned up on exit. + */ + error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, + UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); + if (!error) { + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + vput(attr_system_dvp); + } + + error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, + UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); + if (!error) { + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_user_dvp, EXTATTR_NAMESPACE_USER, td); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + vput(attr_user_dvp); + } + + /* Mask startup failures in sub-directories. */ + error = 0; + +return_vput_attr_dvp: + vput(attr_dvp); + + return (error); +} +#endif /* !UFS_EXTATTR_AUTOSTART */ + +/* + * Stop extended attribute support on an FS. + */ +int +ufs_extattr_stop(struct mount *mp, struct thread *td) +{ + struct ufs_extattr_list_entry *uele; + struct ufsmount *ump = VFSTOUFS(mp); + int error = 0; + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + error = EOPNOTSUPP; + goto unlock; + } + + while ((uele = LIST_FIRST(&ump->um_extattr.uepm_list)) != NULL) { + ufs_extattr_disable(ump, uele->uele_attrnamespace, + uele->uele_attrname, td); + } + + ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; + + crfree(ump->um_extattr.uepm_ucred); + ump->um_extattr.uepm_ucred = NULL; + +unlock: + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Enable a named attribute on the specified filesystem; provide an + * unlocked backing vnode to hold the attribute data. 
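+ * The backing vnode must already carry a valid struct
+ * ufs_extattr_fileheader (its magic and version are checked below); such
+ * files are normally prepared ahead of time, e.g. with extattrctl(8).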
+ */ +static int +ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct iovec aiov; + struct uio auio; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + if (backing_vnode->v_type != VREG) + return (EINVAL); + + attribute = malloc(sizeof(struct ufs_extattr_list_entry), + M_UFS_EXTATTR, M_WAITOK); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + error = EOPNOTSUPP; + goto free_exit; + } + + if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) { + error = EEXIST; + goto free_exit; + } + + strncpy(attribute->uele_attrname, attrname, + UFS_EXTATTR_MAXEXTATTRNAME); + attribute->uele_attrnamespace = attrnamespace; + bzero(&attribute->uele_fileheader, + sizeof(struct ufs_extattr_fileheader)); + + attribute->uele_backing_vnode = backing_vnode; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t) &attribute->uele_fileheader; + aiov.iov_len = sizeof(struct ufs_extattr_fileheader); + auio.uio_resid = sizeof(struct ufs_extattr_fileheader); + auio.uio_offset = (off_t) 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = td; + + vn_lock(backing_vnode, LK_SHARED | LK_RETRY); + error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED, + ump->um_extattr.uepm_ucred); + + if (error) + goto unlock_free_exit; + + if (auio.uio_resid != 0) { + printf("ufs_extattr_enable: malformed attribute header\n"); + error = EINVAL; + goto unlock_free_exit; + } + + if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) { + printf("ufs_extattr_enable: invalid attribute header magic\n"); + error = EINVAL; + goto unlock_free_exit; + } + + if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) { + printf("ufs_extattr_enable: incorrect attribute header " + "version\n"); + error = EINVAL; + goto unlock_free_exit; + } + + ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable"); + LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, + uele_entries); + + VOP_UNLOCK(backing_vnode, 0); + return (0); + +unlock_free_exit: + VOP_UNLOCK(backing_vnode, 0); + +free_exit: + free(attribute, M_UFS_EXTATTR); + return (error); +} + +/* + * Disable extended attribute support on an FS. + */ +static int +ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct thread *td) +{ + struct ufs_extattr_list_entry *uele; + int error = 0; + + if (!ufs_extattr_valid_attrname(attrnamespace, attrname)) + return (EINVAL); + + uele = ufs_extattr_find_attr(ump, attrnamespace, attrname); + if (!uele) + return (ENOATTR); + + LIST_REMOVE(uele, uele_entries); + + vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY); + ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable"); + VOP_UNLOCK(uele->uele_backing_vnode, 0); + error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, + td->td_ucred, td); + + free(uele, M_UFS_EXTATTR); + + return (error); +} + +/* + * VFS call to manage extended attributes in UFS. If filename_vp is + * non-NULL, it must be passed in locked, and regardless of errors in + * processing, will be unlocked. + */ +int +ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, + int attrnamespace, const char *attrname) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct thread *td = curthread; + int error; + + /* + * Processes with privilege, but in jail, are not allowed to + * configure extended attributes. 
+ */ + error = priv_check(td, PRIV_UFS_EXTATTRCTL); + if (error) { + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp, 0); + return (error); + } + + /* + * We only allow extattrctl(2) on UFS1 file systems, as UFS2 uses + * native extended attributes. + */ + if (ump->um_fstype != UFS1) { + if (filename_vp != NULL) + VOP_UNLOCK(filename_vp, 0); + return (EOPNOTSUPP); + } + + switch(cmd) { + case UFS_EXTATTR_CMD_START: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp, 0); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + error = ufs_extattr_start(mp, td); + + return (error); + + case UFS_EXTATTR_CMD_STOP: + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp, 0); + return (EINVAL); + } + if (attrname != NULL) + return (EINVAL); + + error = ufs_extattr_stop(mp, td); + + return (error); + + case UFS_EXTATTR_CMD_ENABLE: + + if (filename_vp == NULL) + return (EINVAL); + if (attrname == NULL) { + VOP_UNLOCK(filename_vp, 0); + return (EINVAL); + } + + /* + * ufs_extattr_enable_with_open() will always unlock the + * vnode, regardless of failure. + */ + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_enable_with_open(ump, filename_vp, + attrnamespace, attrname, td); + ufs_extattr_uepm_unlock(ump); + + return (error); + + case UFS_EXTATTR_CMD_DISABLE: + + if (filename_vp != NULL) { + VOP_UNLOCK(filename_vp, 0); + return (EINVAL); + } + if (attrname == NULL) + return (EINVAL); + + ufs_extattr_uepm_lock(ump); + error = ufs_extattr_disable(ump, attrnamespace, attrname, + td); + ufs_extattr_uepm_unlock(ump); + + return (error); + + default: + return (EINVAL); + } +} + +/* + * Vnode operating to retrieve a named extended attribute. + */ +int +ufs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with retrieving a named attribute--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + size_t len, old_len; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + if (strlen(name) == 0) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, VREAD); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Allow only offsets of zero to encourage the read/replace + * extended attribute semantic. Otherwise we can't guarantee + * atomicity, as we don't provide locks for extended attributes. 
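+ *
+ * The backing file computed just below is laid out as one fixed-size
+ * record per inode number:
+ *
+ *	base_offset = sizeof(struct ufs_extattr_fileheader) +
+ *	    ip->i_number * (sizeof(struct ufs_extattr_header) + uef_size);
+ *
+ * so, purely as an example, with uef_size = 1024 the record for inode 10
+ * starts 10 * (sizeof(struct ufs_extattr_header) + 1024) bytes past the
+ * file header and holds an attribute header followed by up to 1024 bytes
+ * of attribute data.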
+ */ + if (uio != NULL && uio->uio_offset != 0) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Read in the data header to see if the data is defined, and if so + * how much. + */ + bzero(&ueh, sizeof(struct ufs_extattr_header)); + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_READ; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + /* + * Acquire locks. + * + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY); + + error = VOP_READ(attribute->uele_backing_vnode, &local_aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + /* Defined? */ + if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { + error = ENOATTR; + goto vopunlock_exit; + } + + /* Valid for the current inode generation? */ + if (ueh.ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number + * than the attribute data. For now, the best solution + * is to coerce this to undefined, and let it get cleaned + * up by the next write or extattrctl clean. + */ + printf("ufs_extattr_get (%s): inode number inconsistency (%d, %ju)\n", + mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (uintmax_t)ip->i_gen); + error = ENOATTR; + goto vopunlock_exit; + } + + /* Local size consistency check. */ + if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { + error = ENXIO; + goto vopunlock_exit; + } + + /* Return full data size if caller requested it. */ + if (size != NULL) + *size = ueh.ueh_len; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* Allow for offset into the attribute data. */ + uio->uio_offset = base_offset + sizeof(struct + ufs_extattr_header); + + /* + * Figure out maximum to transfer -- use buffer size and + * local data limit. + */ + len = MIN(uio->uio_resid, ueh.ueh_len); + old_len = uio->uio_resid; + uio->uio_resid = len; + + error = VOP_READ(attribute->uele_backing_vnode, uio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + uio->uio_resid = old_len - (len - uio->uio_resid); + } + +vopunlock_exit: + + if (uio != NULL) + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode, 0); + + return (error); +} + +/* + * Vnode operation to remove a named attribute. + */ +int +ufs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_cred, ap->a_td); + + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Vnode operation to set a named attribute. 
+ */ +int +ufs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + /* + * XXX: No longer a supported way to delete extended attributes. + */ + if (ap->a_uio == NULL) + return (EINVAL); + + ufs_extattr_uepm_lock(ump); + + error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_cred, ap->a_td); + + ufs_extattr_uepm_unlock(ump); + + return (error); +} + +/* + * Real work associated with setting a vnode's extended attributes; + * assumes that the attribute lock has already been grabbed. + */ +static int +ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Early rejection of invalid offsets/length. + * Reject: any offset but 0 (replace) + * Any size greater than attribute size limit + */ + if (uio->uio_offset != 0 || + uio->uio_resid > attribute->uele_fileheader.uef_size) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Write out a data header for the data. + */ + ueh.ueh_len = uio->uio_resid; + ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; + ueh.ueh_i_gen = ip->i_gen; + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + /* + * Acquire locks. + * + * Don't need to get a lock on the backing file if the setattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) { + error = ENXIO; + goto vopunlock_exit; + } + + /* + * Write out user data. 
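+ * (The header above and the data here go out as two separate VOP_WRITE
+ * calls; when the debug.ufs_extattr_sync sysctl is non-zero, both are
+ * issued with IO_SYNC.)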
+ */ + uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, + ump->um_extattr.uepm_ucred); + +vopunlock_exit: + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode, 0); + + return (error); +} + +/* + * Real work associated with removing an extended attribute from a vnode. + * Assumes the attribute lock has already been grabbed. + */ +static int +ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, + struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, VWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Check to see if currently defined. + */ + bzero(&ueh, sizeof(struct ufs_extattr_header)); + + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_READ; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + /* + * Don't need to get the lock on the backing vnode if the vnode we're + * modifying is it, as we already hold the lock. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY); + + error = VOP_READ(attribute->uele_backing_vnode, &local_aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + /* Defined? */ + if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { + error = ENOATTR; + goto vopunlock_exit; + } + + /* Valid for the current inode generation? */ + if (ueh.ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number than + * the attribute data. For now, the best solution is to + * coerce this to undefined, and let it get cleaned up by + * the next write or extattrctl clean. + */ + printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", + mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); + error = ENOATTR; + goto vopunlock_exit; + } + + /* Flag it as not in use. 
*/ + ueh.ueh_flags = 0; + ueh.ueh_len = 0; + + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) + error = ENXIO; + +vopunlock_exit: + VOP_UNLOCK(attribute->uele_backing_vnode, 0); + + return (error); +} + +/* + * Called by UFS when an inode is no longer active and should have its + * attributes stripped. + */ +void +ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) +{ + struct ufs_extattr_list_entry *uele; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + /* + * In that case, we cannot lock. We should not have any active vnodes + * on the fs if this is not yet initialized but is going to be, so + * this can go unlocked. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + return; + + ufs_extattr_uepm_lock(ump); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + ufs_extattr_uepm_unlock(ump); + return; + } + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) + ufs_extattr_rm(vp, uele->uele_attrnamespace, + uele->uele_attrname, NULL, td); + + ufs_extattr_uepm_unlock(ump); +} + +#endif /* !UFS_EXTATTR */ diff --git a/Dump/ufs/ufs/ufs_extern.h b/Dump/ufs/ufs/ufs_extern.h new file mode 100644 index 0000000..ea2ee8a --- /dev/null +++ b/Dump/ufs/ufs/ufs_extern.h @@ -0,0 +1,127 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_extern.h 8.10 (Berkeley) 5/14/95 + * $FreeBSD: releng/11.2/sys/ufs/ufs/ufs_extern.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_UFS_EXTERN_H_ +#define _UFS_UFS_EXTERN_H_ + +struct componentname; +struct direct; +struct indir; +struct inode; +struct mount; +struct thread; +struct sockaddr; +struct ucred; +struct ufid; +struct vfsconf; +struct vnode; +struct vop_bmap_args; +struct vop_cachedlookup_args; +struct vop_generic_args; +struct vop_inactive_args; +struct vop_reclaim_args; + +extern struct vop_vector ufs_fifoops; +extern struct vop_vector ufs_vnodeops; + +int ufs_bmap(struct vop_bmap_args *); +int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, + struct buf *, int *, int *); +int ufs_fhtovp(struct mount *, struct ufid *, int, struct vnode **); +int ufs_checkpath(ino_t, ino_t, struct inode *, struct ucred *, ino_t *); +void ufs_dirbad(struct inode *, doff_t, char *); +int ufs_dirbadentry(struct vnode *, struct direct *, int); +int ufs_dirempty(struct inode *, ino_t, struct ucred *); +int ufs_extread(struct vop_read_args *); +int ufs_extwrite(struct vop_write_args *); +void ufs_makedirentry(struct inode *, struct componentname *, + struct direct *); +int ufs_direnter(struct vnode *, struct vnode *, struct direct *, + struct componentname *, struct buf *, int); +int ufs_dirremove(struct vnode *, struct inode *, int, int); +int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_lookup_ino(struct vnode *, struct vnode **, struct componentname *, + ino_t *); +int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); +int ufs_inactive(struct vop_inactive_args *); +int ufs_init(struct vfsconf *); +void ufs_itimes(struct vnode *vp); +int ufs_lookup(struct vop_cachedlookup_args *); +void ufs_prepare_reclaim(struct vnode *vp); +int ufs_readdir(struct vop_readdir_args *); +int ufs_reclaim(struct vop_reclaim_args *); +void ffs_snapgone(struct inode *); +vfs_root_t ufs_root; +int ufs_uninit(struct vfsconf *); +int ufs_vinit(struct mount *, struct vop_vector *, struct vnode **); + +#include +SYSCTL_DECL(_vfs_ufs); + +/* + * Soft update function prototypes. + */ +int softdep_setup_directory_add(struct buf *, struct inode *, off_t, + ino_t, struct buf *, int); +void softdep_change_directoryentry_offset(struct buf *, struct inode *, + caddr_t, caddr_t, caddr_t, int); +void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); +void softdep_setup_directory_change(struct buf *, struct inode *, + struct inode *, ino_t, int); +void softdep_change_linkcnt(struct inode *); +int softdep_slowdown(struct vnode *); +void softdep_setup_create(struct inode *, struct inode *); +void softdep_setup_dotdot_link(struct inode *, struct inode *); +void softdep_setup_link(struct inode *, struct inode *); +void softdep_setup_mkdir(struct inode *, struct inode *); +void softdep_setup_rmdir(struct inode *, struct inode *); +void softdep_setup_unlink(struct inode *, struct inode *); +void softdep_revert_create(struct inode *, struct inode *); +void softdep_revert_link(struct inode *, struct inode *); +void softdep_revert_mkdir(struct inode *, struct inode *); +void softdep_revert_rmdir(struct inode *, struct inode *); + +/* + * Flags to low-level allocation routines. The low 16-bits are reserved + * for IO_ flags from vnode.h. + * + * Note: The general vfs code typically limits the sequential heuristic + * count to 127. 
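+ * For illustration, a caller that has observed seqcount sequential
+ * transfers typically passes (seqcount << BA_SEQSHIFT), clamped to
+ * BA_SEQMAX, in the flags word handed to the low-level allocation
+ * routines, alongside bits such as BA_CLRBUF.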
See sequential_heuristic() in kern/vfs_vnops.c + */ +#define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ +#define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ +#define BA_UNMAPPED 0x00040000 /* Do not mmap resulted buffer. */ +#define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. */ +#define BA_SEQSHIFT 24 +#define BA_SEQMAX 0x7F + +#endif /* !_UFS_UFS_EXTERN_H_ */ diff --git a/Dump/ufs/ufs/ufs_gjournal.c b/Dump/ufs/ufs/ufs_gjournal.c new file mode 100644 index 0000000..fd4c584 --- /dev/null +++ b/Dump/ufs/ufs/ufs_gjournal.c @@ -0,0 +1,141 @@ +/*- + * Copyright (c) 2005-2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_gjournal.c 306627 2016-10-03 09:37:56Z kib $"); + +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +/* + * Change the number of unreferenced inodes. 
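+ * The count is kept both per cylinder group (cg_unrefs) and
+ * filesystem-wide (fs_unrefs): ufs_gjournal_orphan() below raises it when
+ * a still-open inode loses its last link, and ufs_gjournal_close() lowers
+ * it once that inode is finally released, presumably so the journal
+ * cleanup code can tell how many orphaned inodes may need to be reclaimed
+ * after a crash.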
+ */ +static int +ufs_gjournal_modref(struct vnode *vp, int count) +{ + struct cg *cgp; + struct buf *bp; + ufs2_daddr_t cgbno; + int error, cg; + struct cdev *dev; + struct inode *ip; + struct ufsmount *ump; + struct fs *fs; + struct vnode *devvp; + ino_t ino; + + ip = VTOI(vp); + ump = VFSTOUFS(vp->v_mount); + fs = ump->um_fs; + devvp = ump->um_devvp; + ino = ip->i_number; + + cg = ino_to_cg(fs, ino); + if (devvp->v_type == VREG) { + /* devvp is a snapshot */ + dev = VFSTOUFS(devvp->v_mount)->um_devvp->v_rdev; + cgbno = fragstoblks(fs, cgtod(fs, cg)); + } else if (devvp->v_type == VCHR) { + /* devvp is a normal disk device */ + dev = devvp->v_rdev; + cgbno = fsbtodb(fs, cgtod(fs, cg)); + } else { + bp = NULL; + return (EIO); + } + if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) + panic("ufs_gjournal_modref: range: dev = %s, ino = %lu, fs = %s", + devtoname(dev), (u_long)ino, fs->fs_fsmnt); + if ((error = bread(devvp, cgbno, (int)fs->fs_cgsize, NOCRED, &bp))) { + brelse(bp); + return (error); + } + cgp = (struct cg *)bp->b_data; + if (!cg_chkmagic(cgp)) { + brelse(bp); + return (0); + } + bp->b_xflags |= BX_BKGRDWRITE; + cgp->cg_unrefs += count; + UFS_LOCK(ump); + fs->fs_unrefs += count; + fs->fs_fmod = 1; + ACTIVECLEAR(fs, cg); + UFS_UNLOCK(ump); + bdwrite(bp); + return (0); +} + +void +ufs_gjournal_orphan(struct vnode *vp) +{ + struct inode *ip; + + if (vp->v_mount->mnt_gjprovider == NULL) + return; + if (vp->v_usecount < 2 || (vp->v_vflag & VV_DELETED)) + return; + ip = VTOI(vp); + if ((vp->v_type == VDIR && ip->i_nlink > 2) || + (vp->v_type != VDIR && ip->i_nlink > 1)) { + return; + } + vp->v_vflag |= VV_DELETED; + + ufs_gjournal_modref(vp, 1); +} + +void +ufs_gjournal_close(struct vnode *vp) +{ + struct inode *ip; + + if (vp->v_mount->mnt_gjprovider == NULL) + return; + if (!(vp->v_vflag & VV_DELETED)) + return; + ip = VTOI(vp); + if (ip->i_nlink > 0) + return; + ufs_gjournal_modref(vp, -1); +} diff --git a/Dump/ufs/ufs/ufs_inode.c b/Dump/ufs/ufs/ufs_inode.c new file mode 100644 index 0000000..46a11d9 --- /dev/null +++ b/Dump/ufs/ufs/ufs_inode.c @@ -0,0 +1,237 @@ +/*- + * Copyright (c) 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_inode.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_quota.h" +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#include +#endif +#ifdef UFS_GJOURNAL +#include +#endif + +/* + * Last reference to an inode. If necessary, write or delete it. + */ +int +ufs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + mode_t mode; + int error = 0; + off_t isize; + struct mount *mp; + + mp = NULL; + /* + * Ignore inodes related to stale file handles. + */ + if (ip->i_mode == 0) + goto out; +#ifdef UFS_GJOURNAL + ufs_gjournal_close(vp); +#endif +#ifdef QUOTA + /* + * Before moving off the active list, we must be sure that + * any modified quotas have been pushed since these will no + * longer be checked once the vnode is on the inactive list. + */ + qsyncvp(vp); +#endif + if ((ip->i_effnlink == 0 && DOINGSOFTDEP(vp)) || + (ip->i_nlink <= 0 && !UFS_RDONLY(ip))) { + loop: + if (vn_start_secondary_write(vp, &mp, V_NOWAIT) != 0) { + /* Cannot delete file while file system is suspended */ + if ((vp->v_iflag & VI_DOOMED) != 0) { + /* Cannot return before file is deleted */ + (void) vn_start_secondary_write(vp, &mp, + V_WAIT); + } else { + MNT_ILOCK(mp); + if ((mp->mnt_kern_flag & + (MNTK_SUSPEND2 | MNTK_SUSPENDED)) == 0) { + MNT_IUNLOCK(mp); + goto loop; + } + /* + * Fail to inactivate vnode now and + * let ffs_snapshot() clean up after + * it has resumed the file system. + */ + VI_LOCK(vp); + vp->v_iflag |= VI_OWEINACT; + VI_UNLOCK(vp); + MNT_IUNLOCK(mp); + return (0); + } + } + } + isize = ip->i_size; + if (I_IS_UFS2(ip)) + isize += ip->i_din2->di_extsize; + if (ip->i_effnlink <= 0 && isize && !UFS_RDONLY(ip)) + error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, NOCRED); + if (ip->i_nlink <= 0 && ip->i_mode && !UFS_RDONLY(ip)) { +#ifdef QUOTA + if (!getinoquota(ip)) + (void)chkiq(ip, -1, NOCRED, FORCE); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_vnode_inactive(vp, ap->a_td); +#endif + /* + * Setting the mode to zero needs to wait for the inode + * to be written just as does a change to the link count. + * So, rather than creating a new entry point to do the + * same thing, we just use softdep_change_linkcnt(). 
+ */ + DIP_SET(ip, i_rdev, 0); + mode = ip->i_mode; + ip->i_mode = 0; + DIP_SET(ip, i_mode, 0); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + UFS_VFREE(vp, ip->i_number, mode); + } + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && + mp == NULL && + vn_start_secondary_write(vp, &mp, V_NOWAIT)) { + mp = NULL; + ip->i_flag &= ~IN_ACCESS; + } else { + if (mp == NULL) + (void) vn_start_secondary_write(vp, &mp, + V_WAIT); + UFS_UPDATE(vp, 0); + } + } +out: + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (ip->i_mode == 0) + vrecycle(vp); + if (mp != NULL) + vn_finished_secondary_write(mp); + return (error); +} + +void +ufs_prepare_reclaim(struct vnode *vp) +{ + struct inode *ip; +#ifdef QUOTA + int i; +#endif + + ip = VTOI(vp); + + vnode_destroy_vobject(vp); +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) { + if (ip->i_dquot[i] != NODQUOT) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } +#endif +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif +} + +/* + * Reclaim an inode so that it can be used for other purposes. + */ +int +ufs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + ufs_prepare_reclaim(vp); + + if (ip->i_flag & IN_LAZYMOD) + ip->i_flag |= IN_MODIFIED; + UFS_UPDATE(vp, 0); + /* + * Remove the inode from its hash chain. + */ + vfs_hash_remove(vp); + + /* + * Lock the clearing of v_data so ffs_lock() can inspect it + * prior to obtaining the lock. + */ + VI_LOCK(vp); + vp->v_data = 0; + VI_UNLOCK(vp); + UFS_IFREE(ITOUMP(ip), ip); + return (0); +} diff --git a/Dump/ufs/ufs/ufs_lookup.c b/Dump/ufs/ufs/ufs_lookup.c new file mode 100644 index 0000000..5c9967b --- /dev/null +++ b/Dump/ufs/ufs/ufs_lookup.c @@ -0,0 +1,1486 @@ +/*- + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_lookup.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_ufs.h" +#include "opt_quota.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include + +#ifdef DIAGNOSTIC +static int dirchk = 1; +#else +static int dirchk = 0; +#endif + +SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); + +/* true if old FS format...*/ +#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) + +static int +ufs_delete_denied(struct vnode *vdp, struct vnode *tdp, struct ucred *cred, + struct thread *td) +{ + int error; + +#ifdef UFS_ACL + /* + * NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt + * + * 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD + */ + + /* + * XXX: Is this check required? + */ + error = VOP_ACCESS(vdp, VEXEC, cred, td); + if (error) + return (error); + + error = VOP_ACCESSX(tdp, VDELETE, cred, td); + if (error == 0) + return (0); + + error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred, td); + if (error == 0) + return (0); + + error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred, td); + if (error) + return (error); + +#endif /* !UFS_ACL */ + + /* + * Standard Unix access control - delete access requires VWRITE. + */ + error = VOP_ACCESS(vdp, VWRITE, cred, td); + if (error) + return (error); + + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((VTOI(vdp)->i_mode & ISVTX) && + VOP_ACCESS(vdp, VADMIN, cred, td) && + VOP_ACCESS(tdp, VADMIN, cred, td)) + return (EPERM); + + return (0); +} + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the filesystem is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. + * + * This routine is actually used as VOP_CACHEDLOOKUP method, and the + * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP + * method. 
+ * + * vfs_cache_lookup() performs the following for us: + * check that it is a directory + * check accessibility of directory + * check for modification attempts on read-only mounts + * if name found in cache + * if at end of path and deleting or creating + * drop it + * else + * return name. + * return VOP_CACHEDLOOKUP() + * + * Overall outline of ufs_lookup: + * + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(ap) + struct vop_cachedlookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + + return (ufs_lookup_ino(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL)); +} + +int +ufs_lookup_ino(struct vnode *vdp, struct vnode **vpp, struct componentname *cnp, + ino_t *dd_ino) +{ + struct inode *dp; /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + doff_t i_diroff; /* cached i_diroff value. */ + doff_t i_offset; /* cached i_offset value. */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int namlen, error; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + ino_t ino, ino1; + int ltype; + + if (vpp != NULL) + *vpp = NULL; + + dp = VTOI(vdp); + if (dp->i_effnlink == 0) + return (ENOENT); + + /* + * Create a vm object if vmiodirenable is enabled. + * Alternatively we could call vnode_create_vobject + * in VFS_VGET but we could end up creating objects + * that are never used. + */ + vnode_create_vobject(vdp, DIP(dp, i_size), cnp->cn_thread); + + bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + +#ifdef DEBUG_VFS_LOCKS + /* + * Assert that the directory vnode is locked, and locked + * exclusively for the last component lookup for modifying + * operations. + * + * The directory-modifying operations need to save + * intermediate state in the inode between namei() call and + * actual directory manipulations. See fields in the struct + * inode marked as 'used during directory lookup'. We must + * ensure that upgrade in namei() does not happen, since + * upgrade might need to unlock vdp. If quotas are enabled, + * getinoquota() also requires exclusive lock to modify inode. 
+ */ + ASSERT_VOP_LOCKED(vdp, "ufs_lookup1"); + if ((nameiop == CREATE || nameiop == DELETE || nameiop == RENAME) && + (flags & (LOCKPARENT | ISLASTCN)) == (LOCKPARENT | ISLASTCN)) + ASSERT_VOP_ELOCKED(vdp, "ufs_lookup2"); +#endif + +restart: + bp = NULL; + slotoffset = -1; + + /* + * We now have a segment name to search for, and a directory to search. + * + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + ino = 0; + i_diroff = dp->i_diroff; + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = DIRECTSIZ(cnp->cn_namelen); + } + +#ifdef UFS_DIRHASH + /* + * Use dirhash for fast operations on large directories. The logic + * to determine whether to hash the directory is contained within + * ufsdirhash_build(); a zero return means that it decided to hash + * this directory and it successfully built up the hash table. + */ + if (ufsdirhash_build(dp) == 0) { + /* Look for a free slot if needed. */ + enduseful = dp->i_size; + if (slotstatus != FOUND) { + slotoffset = ufsdirhash_findfree(dp, slotneeded, + &slotsize); + if (slotoffset >= 0) { + slotstatus = COMPACT; + enduseful = ufsdirhash_enduseful(dp); + if (enduseful < 0) + enduseful = dp->i_size; + } + } + /* Look up the component. */ + numdirpasses = 1; + entryoffsetinblock = 0; /* silence compiler warning */ + switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, + &i_offset, &bp, nameiop == DELETE ? &prevoff : NULL)) { + case 0: + ep = (struct direct *)((char *)bp->b_data + + (i_offset & bmask)); + goto foundentry; + case ENOENT: + i_offset = roundup2(dp->i_size, DIRBLKSIZ); + goto notfound; + default: + /* Something failed; just do a linear search. */ + break; + } + } +#endif /* UFS_DIRHASH */ + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + if (nameiop != LOOKUP || i_diroff == 0 || i_diroff >= dp->i_size) { + entryoffsetinblock = 0; + i_offset = 0; + numdirpasses = 1; + } else { + i_offset = i_diroff; + if ((entryoffsetinblock = i_offset & bmask) && + (error = UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = i_offset; + endsearch = roundup2(dp->i_size, DIRBLKSIZ); + enduseful = 0; + +searchloop: + while (i_offset < endsearch) { + /* + * If necessary, get the next directory block. + */ + if ((i_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp); + error = + UFS_BLKATOFF(vdp, (off_t)i_offset, NULL, &bp); + if (error) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZE + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. 
Complete checks can be run by patching + * "dirchk" to be true. + */ + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || ep->d_reclen > + DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, i_offset, "mangled entry"); + i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); + i_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = ep->d_reclen; + + if (ep->d_ino != 0) + size -= DIRSIZ(OFSFMT(vdp), ep); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = i_offset; + slotsize = ep->d_reclen; + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = i_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = i_offset + + ep->d_reclen - slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->d_ino) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(vdp)) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +# else + namlen = ep->d_namlen; +# endif + if (namlen == cnp->cn_namelen && + (cnp->cn_nameptr[0] == ep->d_name[0]) && + !bcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { +#ifdef UFS_DIRHASH +foundentry: +#endif + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. + */ + if (vdp->v_mount->mnt_maxsymlinklen > 0 && + ep->d_type == DT_WHT) { + slotstatus = FOUND; + slotoffset = i_offset; + slotsize = ep->d_reclen; + enduseful = dp->i_size; + cnp->cn_flags |= ISWHITEOUT; + numdirpasses--; + goto notfound; + } + ino = ep->d_ino; + goto found; + } + } + prevoff = i_offset; + i_offset += ep->d_reclen; + entryoffsetinblock += ep->d_reclen; + if (ep->d_ino) + enduseful = i_offset; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + i_offset = 0; + endsearch = i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME || + (nameiop == DELETE && + (cnp->cn_flags & DOWHITEOUT) && + (cnp->cn_flags & ISWHITEOUT))) && + (flags & ISLASTCN) && dp->i_effnlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + * + * XXX: Fix the comment above. + */ + if (flags & WILLBEDIR) + error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); + else + error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); + if (error) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set dp->i_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from dp->i_offset to + * dp->i_offset + dp->i_count. 
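+ * For example, if the scan had to add up 24 bytes of reusable space
+ * spread across the entries between offsets 1536 and 1600, it leaves
+ * dp->i_offset = 1536 and dp->i_count = 64, and ufs_direnter() later
+ * compacts that range to make the room actually contiguous.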
+ */ + if (slotstatus == NONE) { + dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); + dp->i_count = 0; + enduseful = dp->i_offset; + } else if (nameiop == DELETE) { + dp->i_offset = slotoffset; + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + } else { + dp->i_offset = slotoffset; + dp->i_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * The pathname buffer is saved so that the name + * can be obtained later. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + cnp->cn_flags |= SAVENAME; + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(vdp, NULL, cnp); + return (ENOENT); + +found: + if (dd_ino != NULL) + *dd_ino = ino; + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { + ufs_dirbad(dp, i_offset, "i_size too small"); + dp->i_size = i_offset + DIRSIZ(OFSFMT(vdp), ep); + DIP_SET(dp, i_size, dp->i_size); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + } + brelse(bp); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. + */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = rounddown2(i_offset, DIRBLKSIZ); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + if (flags & LOCKPARENT) + ASSERT_VOP_ELOCKED(vdp, __FUNCTION__); + /* + * Return pointer to current entry in dp->i_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in dp->i_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + * + * Technically we shouldn't be setting these in the + * WANTPARENT case (first lookup in rename()), but any + * lookups that will result in directory changes will + * overwrite these. + */ + dp->i_offset = i_offset; + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } + if (dp->i_number == ino) { + VREF(vdp); + *vpp = vdp; + vput(tdp); + return (0); + } + + *vpp = tdp; + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && (flags & ISLASTCN)) { + if (flags & WILLBEDIR) + error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); + else + error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); + if (error) + return (error); + /* + * Careful about locking second inode. 
+ * This can only occur if the target is ".". + */ + dp->i_offset = i_offset; + if (dp->i_number == ino) + return (EISDIR); + if (dd_ino != NULL) + return (0); + if ((error = VFS_VGET(vdp->v_mount, ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + + error = ufs_delete_denied(vdp, tdp, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } + +#ifdef SunOS_doesnt_do_that + /* + * The only purpose of this check is to return the correct + * error. Assume that we want to rename directory "a" + * to a file "b", and that we have no ACL_WRITE_DATA on + * a containing directory, but we _do_ have ACL_APPEND_DATA. + * In that case, the VOP_ACCESS check above will return 0, + * and the operation will fail with ENOTDIR instead + * of EACCESS. + */ + if (tdp->v_type == VDIR) + error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred, cnp->cn_thread); + else + error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); + if (error) { + vput(tdp); + return (error); + } +#endif + + *vpp = tdp; + cnp->cn_flags |= SAVENAME; + return (0); + } + if (dd_ino != NULL) + return (0); + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the filesystem has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + error = vn_vget_ino(pdp, ino, cnp->cn_lkflags, &tdp); + if (error) + return (error); + + /* + * Recheck that ".." entry in the vdp directory points + * to the inode we looked up before vdp lock was + * dropped. + */ + error = ufs_lookup_ino(pdp, NULL, cnp, &ino1); + if (error) { + vput(tdp); + return (error); + } + if (ino1 != ino) { + vput(tdp); + goto restart; + } + + *vpp = tdp; + } else if (dp->i_number == ino) { + VREF(vdp); /* we want ourself, ie "." */ + /* + * When we lookup "." we still can be asked to lock it + * differently. + */ + ltype = cnp->cn_lkflags & LK_TYPE_MASK; + if (ltype != VOP_ISLOCKED(vdp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(vdp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(vdp, LK_DOWNGRADE | LK_RETRY); + /* + * Relock for the "." case may left us with + * reclaimed vnode. + */ + if (vdp->v_iflag & VI_DOOMED) { + vrele(vdp); + return (ENOENT); + } + } + *vpp = vdp; + } else { + error = VFS_VGET(pdp->v_mount, ino, cnp->cn_lkflags, &tdp); + if (error) + return (error); + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. 
+ */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +void +ufs_dirbad(ip, offset, how) + struct inode *ip; + doff_t offset; + char *how; +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + if ((mp->mnt_flag & MNT_RDONLY) == 0) + panic("ufs_dirbad: %s: bad dir ino %ju at offset %ld: %s", + mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, + (long)offset, how); + else + (void)printf("%s: bad dir ino %ju at offset %ld: %s\n", + mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number, + (long)offset, how); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +int +ufs_dirbadentry(dp, ep, entryoffsetinblock) + struct vnode *dp; + struct direct *ep; + int entryoffsetinblock; +{ + int i, namlen; + +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(dp)) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +# else + namlen = ep->d_namlen; +# endif + if ((ep->d_reclen & 0x3) != 0 || + ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > MAXNAMLEN) { + /*return (1); */ + printf("First bad\n"); + goto bad; + } + if (ep->d_ino == 0) + return (0); + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (0); +bad: + return (1); +} + +/* + * Construct a new directory entry after a call to namei, using the + * parameters that it left in the componentname argument cnp. The + * argument ip is the inode to which the new directory entry will refer. + */ +void +ufs_makedirentry(ip, cnp, newdirp) + struct inode *ip; + struct componentname *cnp; + struct direct *newdirp; +{ + +#ifdef INVARIANTS + if ((cnp->cn_flags & SAVENAME) == 0) + panic("ufs_makedirentry: missing name"); +#endif + newdirp->d_ino = ip->i_number; + newdirp->d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1); + if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) + newdirp->d_type = IFTODT(ip->i_mode); + else { + newdirp->d_type = 0; +# if (BYTE_ORDER == LITTLE_ENDIAN) + { u_char tmp = newdirp->d_namlen; + newdirp->d_namlen = newdirp->d_type; + newdirp->d_type = tmp; } +# endif + } +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument dirp is the new directory + * entry contents. Dvp is a pointer to the directory to be written, + * which was left locked by namei. Remaining parameters (dp->i_offset, + * dp->i_count) indicate how the space for the new entry is to be obtained. + * Non-null bp indicates that a directory is being created (for the + * soft dependency code). 
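+ * Two cases are handled below: when dp->i_count is zero the entry is
+ * written into a fresh DIRBLKSIZ block allocated at dp->i_offset;
+ * otherwise the entries in [dp->i_offset, dp->i_offset + dp->i_count)
+ * are compacted toward the front of that range and the new entry takes
+ * the free space that collects at its end.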
+ */ +int +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp, isrename) + struct vnode *dvp; + struct vnode *tvp; + struct direct *dirp; + struct componentname *cnp; + struct buf *newdirbp; + int isrename; +{ + struct ucred *cr; + struct thread *td; + int newentrysize; + struct inode *dp; + struct buf *bp; + u_int dsize; + struct direct *ep, *nep; + u_int64_t old_isize; + int error, ret, blkoff, loc, spacefree, flags, namlen; + char *dirbuf; + + td = curthread; /* XXX */ + cr = td->td_ucred; + + dp = VTOI(dvp); + newentrysize = DIRSIZ(OFSFMT(dvp), dirp); + + if (dp->i_count == 0) { + /* + * If dp->i_count is 0, then namei could find no + * space in the directory. Here, dp->i_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. + */ + if (dp->i_offset & (DIRBLKSIZ - 1)) + panic("ufs_direnter: newblk"); + flags = BA_CLRBUF; + if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) + flags |= IO_SYNC; +#ifdef QUOTA + if ((error = getinoquota(dp)) != 0) { + if (DOINGSOFTDEP(dvp) && newdirbp != NULL) + bdwrite(newdirbp); + return (error); + } +#endif + old_isize = dp->i_size; + vnode_pager_setsize(dvp, (u_long)dp->i_offset + DIRBLKSIZ); + if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, + cr, flags, &bp)) != 0) { + if (DOINGSOFTDEP(dvp) && newdirbp != NULL) + bdwrite(newdirbp); + vnode_pager_setsize(dvp, (u_long)old_isize); + return (error); + } + dp->i_size = dp->i_offset + DIRBLKSIZ; + DIP_SET(dp, i_size, dp->i_size); + dp->i_endoff = dp->i_size; + dp->i_flag |= IN_CHANGE | IN_UPDATE; + dirp->d_reclen = DIRBLKSIZ; + blkoff = dp->i_offset & + (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); + bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + ufsdirhash_newblk(dp, dp->i_offset); + ufsdirhash_add(dp, dirp, dp->i_offset); + ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, + dp->i_offset); + } +#endif + if (DOINGSOFTDEP(dvp)) { + /* + * Ensure that the entire newly allocated block is a + * valid directory so that future growth within the + * block does not have to ensure that the block is + * written before the inode. + */ + blkoff += DIRBLKSIZ; + while (blkoff < bp->b_bcount) { + ((struct direct *) + (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; + blkoff += DIRBLKSIZ; + } + if (softdep_setup_directory_add(bp, dp, dp->i_offset, + dirp->d_ino, newdirbp, 1)) + dp->i_flag |= IN_NEEDSYNC; + if (newdirbp) + bdwrite(newdirbp); + bdwrite(bp); + if ((dp->i_flag & IN_NEEDSYNC) == 0) + return (UFS_UPDATE(dvp, 0)); + /* + * We have just allocated a directory block in an + * indirect block. We must prevent holes in the + * directory created if directory entries are + * written out of order. To accomplish this we + * fsync when we extend a directory into indirects. + * During rename it's not safe to drop the tvp lock + * so sync must be delayed until it is. + * + * This synchronous step could be removed if fsck and + * the kernel were taught to fill in sparse + * directories rather than panic. 
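+ * (A hole in a directory reads back as zeroes, so a later scan would
+ * see d_reclen == 0 and ufs_dirbad() would panic on a read-write
+ * mount; hence the fsync whenever a directory grows into an indirect
+ * block.)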
+ */ + if (isrename) + return (0); + if (tvp != NULL) + VOP_UNLOCK(tvp, 0); + (void) VOP_FSYNC(dvp, MNT_WAIT, td); + if (tvp != NULL) + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); + return (error); + } + if (DOINGASYNC(dvp)) { + bdwrite(bp); + return (UFS_UPDATE(dvp, 0)); + } + error = bwrite(bp); + ret = UFS_UPDATE(dvp, 1); + if (error == 0) + return (ret); + return (error); + } + + /* + * If dp->i_count is non-zero, then namei found space for the new + * entry in the range dp->i_offset to dp->i_offset + dp->i_count + * in the directory. To use this space, we may have to compact + * the entries located there, by copying them together towards the + * beginning of the block, leaving the free space in one usable + * chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZE. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. + */ + if (dp->i_offset + dp->i_count > dp->i_size) { + dp->i_size = dp->i_offset + dp->i_count; + DIP_SET(dp, i_size, dp->i_size); + } + /* + * Get the block containing the space for the new directory entry. + */ + error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); + if (error) { + if (DOINGSOFTDEP(dvp) && newdirbp != NULL) + bdwrite(newdirbp); + return (error); + } + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the space. + */ + ep = (struct direct *)dirbuf; + dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; + spacefree = ep->d_reclen - dsize; + for (loc = ep->d_reclen; loc < dp->i_count; ) { + nep = (struct direct *)(dirbuf + loc); + + /* Trim the existing slot (NB: dsize may be zero). */ + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + + /* Read nep->d_reclen now as the bcopy() may clobber it. */ + loc += nep->d_reclen; + if (nep->d_ino == 0) { + /* + * A mid-block unused entry. Such entries are + * never created by the kernel, but fsck_ffs + * can create them (and it doesn't fix them). + * + * Add up the free space, and initialise the + * relocated entry since we don't bcopy it. + */ + spacefree += nep->d_reclen; + ep->d_ino = 0; + dsize = 0; + continue; + } + dsize = DIRSIZ(OFSFMT(dvp), nep); + spacefree += nep->d_reclen - dsize; +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_move(dp, nep, + dp->i_offset + ((char *)nep - dirbuf), + dp->i_offset + ((char *)ep - dirbuf)); +#endif + if (DOINGSOFTDEP(dvp)) + softdep_change_directoryentry_offset(bp, dp, dirbuf, + (caddr_t)nep, (caddr_t)ep, dsize); + else + bcopy((caddr_t)nep, (caddr_t)ep, dsize); + } + /* + * Here, `ep' points to a directory entry containing `dsize' in-use + * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, + * then the entry is completely unused (dsize == 0). The value + * of ep->d_reclen is always indeterminate. + * + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. 
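+ * As a sketch with made-up sizes: if the range held three live entries
+ * whose records were 20, 32 and 40 bytes long but whose contents only
+ * needed 16, 12 and 24 bytes, the loop above packs them together and
+ * leaves spacefree = 4 + 20 + 16 = 40 unused bytes after the last one,
+ * which is where the new entry is copied.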
+ */ +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(dvp)) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +# else + namlen = ep->d_namlen; +# endif + if (ep->d_ino == 0 || + (ep->d_ino == WINO && namlen == dirp->d_namlen && + bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + dirp->d_reclen = spacefree + dsize; + } else { + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + dirp->d_reclen = spacefree; + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL && (ep->d_ino == 0 || + dirp->d_reclen == spacefree)) + ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); +#endif + bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, dirbuf - + (dp->i_offset & (DIRBLKSIZ - 1)), + rounddown2(dp->i_offset, DIRBLKSIZ)); +#endif + + if (DOINGSOFTDEP(dvp)) { + (void) softdep_setup_directory_add(bp, dp, + dp->i_offset + (caddr_t)ep - dirbuf, + dirp->d_ino, newdirbp, 0); + if (newdirbp != NULL) + bdwrite(newdirbp); + bdwrite(bp); + } else { + if (DOINGASYNC(dvp)) { + bdwrite(bp); + error = 0; + } else { + error = bwrite(bp); + } + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If all went well, and the directory can be shortened, proceed + * with the truncation. Note that we have to unlock the inode for + * the entry that we just entered, as the truncation may need to + * lock other inodes which can lead to deadlock if we also hold a + * lock on the newly entered node. + */ + if (isrename == 0 && error == 0 && + dp->i_endoff && dp->i_endoff < dp->i_size) { + if (tvp != NULL) + VOP_UNLOCK(tvp, 0); + error = UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, + IO_NORMAL | (DOINGASYNC(dvp) ? 0 : IO_SYNC), cr); + if (error != 0) + vn_printf(dvp, + "ufs_direnter: failed to truncate, error %d\n", + error); +#ifdef UFS_DIRHASH + if (error == 0 && dp->i_dirhash != NULL) + ufsdirhash_dirtrunc(dp, dp->i_endoff); +#endif + error = 0; + if (tvp != NULL) + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY); + } + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the parameters which it left in nameidata. The entry + * dp->i_offset contains the offset into the directory of the + * entry to be eliminated. The dp->i_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + */ +int +ufs_dirremove(dvp, ip, flags, isrmdir) + struct vnode *dvp; + struct inode *ip; + int flags; + int isrmdir; +{ + struct inode *dp; + struct direct *ep, *rep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + + /* + * Adjust the link count early so softdep can block if necessary. + */ + if (ip) { + ip->i_effnlink--; + if (DOINGSOFTDEP(dvp)) { + softdep_setup_unlink(dp, ip); + } else { + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + } + if (flags & DOWHITEOUT) { + /* + * Whiteout entry: set d_ino to WINO. 
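+ * (A whiteout keeps the name present but dead: d_ino becomes the
+ * reserved WINO value and d_type becomes DT_WHT, which lets union
+ * mounts hide a name that still exists in a lower layer.)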
+ */ + if ((error = + UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) + return (error); + ep->d_ino = WINO; + ep->d_type = DT_WHT; + goto out; + } + + if ((error = UFS_BLKATOFF(dvp, + (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) + return (error); + + /* Set 'rep' to the entry being removed. */ + if (dp->i_count == 0) + rep = ep; + else + rep = (struct direct *)((char *)ep + ep->d_reclen); +#ifdef UFS_DIRHASH + /* + * Remove the dirhash entry. This is complicated by the fact + * that `ep' is the previous entry when dp->i_count != 0. + */ + if (dp->i_dirhash != NULL) + ufsdirhash_remove(dp, rep, dp->i_offset); +#endif + if (ip && rep->d_ino != ip->i_number) + panic("ufs_dirremove: ip %ju does not match dirent ino %ju\n", + (uintmax_t)ip->i_number, (uintmax_t)rep->d_ino); + if (dp->i_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + ep->d_ino = 0; + } else { + /* + * Collapse new free space into previous entry. + */ + ep->d_reclen += rep->d_reclen; + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, (char *)ep - + ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), + rounddown2(dp->i_offset, DIRBLKSIZ)); +#endif +out: + error = 0; + if (DOINGSOFTDEP(dvp)) { + if (ip) + softdep_setup_remove(bp, dp, ip, isrmdir); + if (softdep_slowdown(dvp)) + error = bwrite(bp); + else + bdwrite(bp); + } else { + if (flags & DOWHITEOUT) + error = bwrite(bp); + else if (DOINGASYNC(dvp) && dp->i_count != 0) + bdwrite(bp); + else + error = bwrite(bp); + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if (ip != NULL && (ip->i_flags & SF_SNAPSHOT) != 0 && + ip->i_effnlink == 0) + UFS_SNAPGONE(ip); + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. + */ +int +ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) + struct inode *dp, *oip; + ino_t newinum; + int newtype; + int isrmdir; +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + /* + * Drop the link before we lock the buf so softdep can block if + * necessary. + */ + oip->i_effnlink--; + if (DOINGSOFTDEP(vdp)) { + softdep_setup_unlink(dp, oip); + } else { + oip->i_nlink--; + DIP_SET(oip, i_nlink, oip->i_nlink); + oip->i_flag |= IN_CHANGE; + } + + error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); + if (error) + return (error); + if (ep->d_namlen == 2 && ep->d_name[1] == '.' && ep->d_name[0] == '.' && + ep->d_ino != oip->i_number) { + brelse(bp); + return (EIDRM); + } + ep->d_ino = newinum; + if (!OFSFMT(vdp)) + ep->d_type = newtype; + if (DOINGSOFTDEP(vdp)) { + softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); + bdwrite(bp); + } else { + if (DOINGASYNC(vdp)) { + bdwrite(bp); + error = 0; + } else { + error = bwrite(bp); + } + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ + if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0) + UFS_SNAPGONE(oip); + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. 
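+ * A directory counts as empty when every entry is either unused
+ * (d_ino of zero or WINO) or one of "." and "..".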
+ * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(ip, parentino, cred) + struct inode *ip; + ino_t parentino; + struct ucred *cred; +{ + doff_t off; + struct dirtemplate dbuf; + struct direct *dp = (struct direct *)&dbuf; + int error, namlen; + ssize_t count; +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; off += dp->d_reclen) { + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, + off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, + NOCRED, &count, (struct thread *)0); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. + */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0 || dp->d_ino == WINO) + continue; + /* accept only "." and ".." */ +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(ITOV(ip))) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +# else + namlen = dp->d_namlen; +# endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1 && dp->d_ino == ip->i_number) + continue; + if (dp->d_name[1] == '.' && dp->d_ino == parentino) + continue; + return (0); + } + return (1); +} + +static int +ufs_dir_dd_ino(struct vnode *vp, struct ucred *cred, ino_t *dd_ino, + struct vnode **dd_vp) +{ + struct dirtemplate dirbuf; + struct vnode *ddvp; + int error, namlen; + + ASSERT_VOP_LOCKED(vp, "ufs_dir_dd_ino"); + if (vp->v_type != VDIR) + return (ENOTDIR); + /* + * First check to see if we have it in the name cache. + */ + if ((ddvp = vn_dir_dd_ino(vp)) != NULL) { + KASSERT(ddvp->v_mount == vp->v_mount, + ("ufs_dir_dd_ino: Unexpected mount point crossing")); + *dd_ino = VTOI(ddvp)->i_number; + *dd_vp = ddvp; + return (0); + } + /* + * Have to read the directory. + */ + error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, NULL, NULL); + if (error != 0) + return (error); +#if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(vp)) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +#else + namlen = dirbuf.dotdot_namlen; +#endif + if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || + dirbuf.dotdot_name[1] != '.') + return (ENOTDIR); + *dd_ino = dirbuf.dotdot_ino; + *dd_vp = NULL; + return (0); +} + +/* + * Check if source directory is in the path of the target directory. 
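+ * This guards rename(2): moving a directory underneath one of its own
+ * descendants (e.g. renaming /a to /a/b/c) would disconnect the
+ * subtree, so the routine walks ".." upward from the target and fails
+ * with EINVAL if it reaches the source before hitting the root or the
+ * new parent.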
+ */ +int +ufs_checkpath(ino_t source_ino, ino_t parent_ino, struct inode *target, struct ucred *cred, ino_t *wait_ino) +{ + struct mount *mp; + struct vnode *tvp, *vp, *vp1; + int error; + ino_t dd_ino; + + vp = tvp = ITOV(target); + mp = vp->v_mount; + *wait_ino = 0; + if (target->i_number == source_ino) + return (EEXIST); + if (target->i_number == parent_ino) + return (0); + if (target->i_number == ROOTINO) + return (0); + for (;;) { + error = ufs_dir_dd_ino(vp, cred, &dd_ino, &vp1); + if (error != 0) + break; + if (dd_ino == source_ino) { + error = EINVAL; + break; + } + if (dd_ino == ROOTINO) + break; + if (dd_ino == parent_ino) + break; + if (vp1 == NULL) { + error = VFS_VGET(mp, dd_ino, LK_SHARED | LK_NOWAIT, + &vp1); + if (error != 0) { + *wait_ino = dd_ino; + break; + } + } + KASSERT(dd_ino == VTOI(vp1)->i_number, + ("directory %ju reparented\n", + (uintmax_t)VTOI(vp1)->i_number)); + if (vp != tvp) + vput(vp); + vp = vp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (vp1 != NULL) + vput(vp1); + if (vp != tvp) + vput(vp); + return (error); +} diff --git a/Dump/ufs/ufs/ufs_quota.c b/Dump/ufs/ufs/ufs_quota.c new file mode 100644 index 0000000..550bb9c --- /dev/null +++ b/Dump/ufs/ufs/ufs_quota.c @@ -0,0 +1,1855 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_quota.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_ffs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +CTASSERT(sizeof(struct dqblk64) == sizeof(struct dqhdr64)); + +static int unprivileged_get_quota = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, + &unprivileged_get_quota, 0, + "Unprivileged processes may retrieve quotas for other uids and gids"); + +static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries"); + +/* + * Quota name to error message mapping. + */ +static char *quotatypes[] = INITQFNAMES; + +static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *); +static int chkiqchg(struct inode *, int, struct ucred *, int, int *); +static int dqopen(struct vnode *, struct ufsmount *, int); +static int dqget(struct vnode *, + u_long, struct ufsmount *, int, struct dquot **); +static int dqsync(struct vnode *, struct dquot *); +static int dqflush(struct vnode *); +static int quotaoff1(struct thread *td, struct mount *mp, int type); +static int quotaoff_inchange(struct thread *td, struct mount *mp, int type); + +/* conversion functions - from_to() */ +static void dqb32_dq(const struct dqblk32 *, struct dquot *); +static void dqb64_dq(const struct dqblk64 *, struct dquot *); +static void dq_dqb32(const struct dquot *, struct dqblk32 *); +static void dq_dqb64(const struct dquot *, struct dqblk64 *); +static void dqb32_dqb64(const struct dqblk32 *, struct dqblk64 *); +static void dqb64_dqb32(const struct dqblk64 *, struct dqblk32 *); + +#ifdef DIAGNOSTIC +static void dqref(struct dquot *); +static void chkdquot(struct inode *); +#endif + +/* + * Set up the quotas for an inode. + * + * This routine completely defines the semantics of quotas. + * If other criterion want to be used to establish quotas, the + * MAXQUOTAS value in quota.h should be increased, and the + * additional dquots set up here. + */ +int +getinoquota(struct inode *ip) +{ + struct ufsmount *ump; + struct vnode *vp; + int error; + + vp = ITOV(ip); + + /* + * Disk quotas must be turned off for system files. Currently + * snapshot and quota files. + */ + if ((vp->v_vflag & VV_SYSTEM) != 0) + return (0); + /* + * XXX: Turn off quotas for files with a negative UID or GID. + * This prevents the creation of 100GB+ quota files. + */ + if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) + return (0); + ump = VFSTOUFS(vp->v_mount); + /* + * Set up the user quota based on file uid. + * EINVAL means that quotas are not enabled. + */ + if ((error = + dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && + error != EINVAL) + return (error); + /* + * Set up the group quota based on file gid. + * EINVAL means that quotas are not enabled. + */ + if ((error = + dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && + error != EINVAL) + return (error); + return (0); +} + +/* + * Update disk usage, and take corrective action. + */ +int +chkdq(struct inode *ip, ufs2_daddr_t change, struct ucred *cred, int flags) +{ + struct dquot *dq; + ufs2_daddr_t ncurblocks; + struct vnode *vp = ITOV(ip); + int i, error, warn, do_check; + + /* + * Disk quotas must be turned off for system files. Currently + * snapshot and quota files. 
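+ * (Charging those files would recurse, since growing the quota file
+ * would have to update the very quota it is being charged against.
+ * Quota records are indexed by id, so an id interpreted as a huge
+ * unsigned value would also force dqget() to extend the file
+ * enormously, which is what the negative-id check below avoids.)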
+ */ + if ((vp->v_vflag & VV_SYSTEM) != 0) + return (0); + /* + * XXX: Turn off quotas for files with a negative UID or GID. + * This prevents the creation of 100GB+ quota files. + */ + if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) + return (0); +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkdq1"); + ncurblocks = dq->dq_curblocks + change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_BLKS; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + } + return (0); + } + if ((flags & FORCE) == 0 && + priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0)) + do_check = 1; + else + do_check = 0; + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + warn = 0; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkdq2"); + if (do_check) { + error = chkdqchg(ip, change, cred, i, &warn); + if (error) { + /* + * Roll back user quota changes when + * group quota failed. + */ + while (i > 0) { + --i; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkdq3"); + ncurblocks = dq->dq_curblocks - change; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + dq->dq_flags &= ~DQ_BLKS; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + } + return (error); + } + } + /* Reset timer when crossing soft limit */ + if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && + dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_btime = time_second + ITOUMP(ip)->um_btime[i]; + dq->dq_curblocks += change; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + if (warn) + uprintf("\n%s: warning, %s disk quota exceeded\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[i]); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkdqchg(struct inode *ip, ufs2_daddr_t change, struct ucred *cred, + int type, int *warn) +{ + struct dquot *dq = ip->i_dquot[type]; + ufs2_daddr_t ncurblocks = dq->dq_curblocks + change; + + /* + * If user would exceed their hard limit, disallow space allocation. + */ + if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + dq->dq_flags |= DQ_BLKS; + DQI_UNLOCK(dq); + uprintf("\n%s: write failed, %s disk limit reached\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[type]); + return (EDQUOT); + } + DQI_UNLOCK(dq); + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow space + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { + if (dq->dq_curblocks < dq->dq_bsoftlimit) { + dq->dq_btime = time_second + ITOUMP(ip)->um_btime[type]; + if (ip->i_uid == cred->cr_uid) + *warn = 1; + return (0); + } + if (time_second > dq->dq_btime) { + if ((dq->dq_flags & DQ_BLKS) == 0 && + ip->i_uid == cred->cr_uid) { + dq->dq_flags |= DQ_BLKS; + DQI_UNLOCK(dq); + uprintf("\n%s: write failed, %s " + "disk quota exceeded for too long\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[type]); + return (EDQUOT); + } + DQI_UNLOCK(dq); + return (EDQUOT); + } + } + return (0); +} + +/* + * Check the inode limit, applying corrective action. 
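+ * The rules mirror chkdq() above: a hard limit always refuses the
+ * allocation; crossing the soft limit only starts a grace timer
+ * (um_itime, normally MAX_IQ_TIME), and once time_second passes
+ * dq_itime further allocations fail with EDQUOT as well. For example,
+ * with a soft limit of 1000 inodes and a hard limit of 1200, the
+ * allocation that reaches 1000 starts the clock, the one that would
+ * reach 1200 is refused outright, and after the grace period expires
+ * every allocation is refused until usage drops back under 1000.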
+ */ +int +chkiq(struct inode *ip, int change, struct ucred *cred, int flags) +{ + struct dquot *dq; + int i, error, warn, do_check; + +#ifdef DIAGNOSTIC + if ((flags & CHOWN) == 0) + chkdquot(ip); +#endif + if (change == 0) + return (0); + if (change < 0) { + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkiq1"); + if (dq->dq_curinodes >= -change) + dq->dq_curinodes += change; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + } + return (0); + } + if ((flags & FORCE) == 0 && + priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA, 0)) + do_check = 1; + else + do_check = 0; + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + warn = 0; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkiq2"); + if (do_check) { + error = chkiqchg(ip, change, cred, i, &warn); + if (error) { + /* + * Roll back user quota changes when + * group quota failed. + */ + while (i > 0) { + --i; + dq = ip->i_dquot[i]; + if (dq == NODQUOT) + continue; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "chkiq3"); + if (dq->dq_curinodes >= change) + dq->dq_curinodes -= change; + else + dq->dq_curinodes = 0; + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + } + return (error); + } + } + /* Reset timer when crossing soft limit */ + if (dq->dq_curinodes + change >= dq->dq_isoftlimit && + dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_itime = time_second + ITOUMP(ip)->um_itime[i]; + dq->dq_curinodes += change; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + if (warn) + uprintf("\n%s: warning, %s inode quota exceeded\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[i]); + } + return (0); +} + +/* + * Check for a valid change to a users allocation. + * Issue an error message if appropriate. + */ +static int +chkiqchg(struct inode *ip, int change, struct ucred *cred, int type, int *warn) +{ + struct dquot *dq = ip->i_dquot[type]; + ino_t ncurinodes = dq->dq_curinodes + change; + + /* + * If user would exceed their hard limit, disallow inode allocation. + */ + if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + dq->dq_flags |= DQ_INODS; + DQI_UNLOCK(dq); + uprintf("\n%s: write failed, %s inode limit reached\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[type]); + return (EDQUOT); + } + DQI_UNLOCK(dq); + return (EDQUOT); + } + /* + * If user is over their soft limit for too long, disallow inode + * allocation. Reset time limit as they cross their soft limit. + */ + if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { + if (dq->dq_curinodes < dq->dq_isoftlimit) { + dq->dq_itime = time_second + ITOUMP(ip)->um_itime[type]; + if (ip->i_uid == cred->cr_uid) + *warn = 1; + return (0); + } + if (time_second > dq->dq_itime) { + if ((dq->dq_flags & DQ_INODS) == 0 && + ip->i_uid == cred->cr_uid) { + dq->dq_flags |= DQ_INODS; + DQI_UNLOCK(dq); + uprintf("\n%s: write failed, %s " + "inode quota exceeded for too long\n", + ITOVFS(ip)->mnt_stat.f_mntonname, + quotatypes[type]); + return (EDQUOT); + } + DQI_UNLOCK(dq); + return (EDQUOT); + } + } + return (0); +} + +#ifdef DIAGNOSTIC +/* + * On filesystems with quotas enabled, it is an error for a file to change + * size and not to have a dquot structure associated with it. 
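+ * The check below exists only in DIAGNOSTIC kernels and panics, so a
+ * code path that forgot to call getinoquota() is caught immediately
+ * during development rather than silently escaping accounting.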
+ */ +static void +chkdquot(struct inode *ip) +{ + struct ufsmount *ump; + struct vnode *vp; + int i; + + ump = ITOUMP(ip); + vp = ITOV(ip); + + /* + * Disk quotas must be turned off for system files. Currently + * these are snapshots and quota files. + */ + if ((vp->v_vflag & VV_SYSTEM) != 0) + return; + /* + * XXX: Turn off quotas for files with a negative UID or GID. + * This prevents the creation of 100GB+ quota files. + */ + if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) + return; + + UFS_LOCK(ump); + for (i = 0; i < MAXQUOTAS; i++) { + if (ump->um_quotas[i] == NULLVP || + (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) + continue; + if (ip->i_dquot[i] == NODQUOT) { + UFS_UNLOCK(ump); + vn_printf(ITOV(ip), "chkdquot: missing dquot "); + panic("chkdquot: missing dquot"); + } + } + UFS_UNLOCK(ump); +} +#endif + +/* + * Code to process quotactl commands. + */ + +/* + * Q_QUOTAON - set up a quota file for a particular filesystem. + */ +int +quotaon(struct thread *td, struct mount *mp, int type, void *fname) +{ + struct ufsmount *ump; + struct vnode *vp, **vpp; + struct vnode *mvp; + struct dquot *dq; + int error, flags; + struct nameidata nd; + + error = priv_check(td, PRIV_UFS_QUOTAON); + if (error != 0) { + vfs_unbusy(mp); + return (error); + } + + if ((mp->mnt_flag & MNT_RDONLY) != 0) { + vfs_unbusy(mp); + return (EROFS); + } + + ump = VFSTOUFS(mp); + dq = NODQUOT; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td); + flags = FREAD | FWRITE; + vfs_ref(mp); + vfs_unbusy(mp); + error = vn_open(&nd, &flags, 0, NULL); + if (error != 0) { + vfs_rel(mp); + return (error); + } + NDFREE(&nd, NDF_ONLY_PNBUF); + vp = nd.ni_vp; + error = vfs_busy(mp, MBF_NOWAIT); + vfs_rel(mp); + if (error == 0) { + if (vp->v_type != VREG) { + error = EACCES; + vfs_unbusy(mp); + } + } + if (error != 0) { + VOP_UNLOCK(vp, 0); + (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + return (error); + } + + UFS_LOCK(ump); + if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { + UFS_UNLOCK(ump); + VOP_UNLOCK(vp, 0); + (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + vfs_unbusy(mp); + return (EALREADY); + } + ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING; + UFS_UNLOCK(ump); + if ((error = dqopen(vp, ump, type)) != 0) { + VOP_UNLOCK(vp, 0); + UFS_LOCK(ump); + ump->um_qflags[type] &= ~(QTF_OPENING|QTF_CLOSING); + UFS_UNLOCK(ump); + (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + vfs_unbusy(mp); + return (error); + } + VOP_UNLOCK(vp, 0); + MNT_ILOCK(mp); + mp->mnt_flag |= MNT_QUOTA; + MNT_IUNLOCK(mp); + + vpp = &ump->um_quotas[type]; + if (*vpp != vp) + quotaoff1(td, mp, type); + + /* + * When the directory vnode containing the quota file is + * inactivated, due to the shared lookup of the quota file + * vput()ing the dvp, the qsyncvp() call for the containing + * directory would try to acquire the quota lock exclusive. + * At the same time, lookup already locked the quota vnode + * shared. Mark the quota vnode lock as allowing recursion + * and automatically converting shared locks to exclusive. + * + * Also mark quota vnode as system. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vp->v_vflag |= VV_SYSTEM; + VN_LOCK_AREC(vp); + VN_LOCK_DSHARE(vp); + VOP_UNLOCK(vp, 0); + *vpp = vp; + /* + * Save the credential of the process that turned on quotas. + * Set up the time limits for this quota. 
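+ * By convention the id-0 record of the quota file carries the
+ * per-filesystem grace periods: the dqget() of id 0 below overrides
+ * the MAX_DQ_TIME/MAX_IQ_TIME defaults whenever that record holds a
+ * nonzero btime or itime.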
+ */ + ump->um_cred[type] = crhold(td->td_ucred); + ump->um_btime[type] = MAX_DQ_TIME; + ump->um_itime[type] = MAX_IQ_TIME; + if (dqget(NULLVP, 0, ump, type, &dq) == 0) { + if (dq->dq_btime > 0) + ump->um_btime[type] = dq->dq_btime; + if (dq->dq_itime > 0) + ump->um_itime[type] = dq->dq_itime; + dqrele(NULLVP, dq); + } + /* + * Allow the getdq from getinoquota below to read the quota + * from file. + */ + UFS_LOCK(ump); + ump->um_qflags[type] &= ~QTF_CLOSING; + UFS_UNLOCK(ump); + /* + * Search vnodes associated with this mount point, + * adding references to quota file being opened. + * NB: only need to add dquot's for inodes being modified. + */ +again: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto again; + } + if (vp->v_type == VNON || vp->v_writecount == 0) { + VOP_UNLOCK(vp, 0); + vrele(vp); + continue; + } + error = getinoquota(VTOI(vp)); + VOP_UNLOCK(vp, 0); + vrele(vp); + if (error) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + break; + } + } + + if (error) + quotaoff_inchange(td, mp, type); + UFS_LOCK(ump); + ump->um_qflags[type] &= ~QTF_OPENING; + KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0, + ("quotaon: leaking flags")); + UFS_UNLOCK(ump); + + vfs_unbusy(mp); + return (error); +} + +/* + * Main code to turn off disk quotas for a filesystem. Does not change + * flags. + */ +static int +quotaoff1(struct thread *td, struct mount *mp, int type) +{ + struct vnode *vp; + struct vnode *qvp, *mvp; + struct ufsmount *ump; + struct dquot *dq; + struct inode *ip; + struct ucred *cr; + int error; + + ump = VFSTOUFS(mp); + + UFS_LOCK(ump); + KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0, + ("quotaoff1: flags are invalid")); + if ((qvp = ump->um_quotas[type]) == NULLVP) { + UFS_UNLOCK(ump); + return (0); + } + cr = ump->um_cred[type]; + UFS_UNLOCK(ump); + + /* + * Search vnodes associated with this mount point, + * deleting any references to quota file being closed. + */ +again: + MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { + if (vp->v_type == VNON) { + VI_UNLOCK(vp); + continue; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); + goto again; + } + ip = VTOI(vp); + dq = ip->i_dquot[type]; + ip->i_dquot[type] = NODQUOT; + dqrele(vp, dq); + VOP_UNLOCK(vp, 0); + vrele(vp); + } + + error = dqflush(qvp); + if (error != 0) + return (error); + + /* + * Clear um_quotas before closing the quota vnode to prevent + * access to the closed vnode from dqget/dqsync + */ + UFS_LOCK(ump); + ump->um_quotas[type] = NULLVP; + ump->um_cred[type] = NOCRED; + UFS_UNLOCK(ump); + + vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY); + qvp->v_vflag &= ~VV_SYSTEM; + VOP_UNLOCK(qvp, 0); + error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td); + crfree(cr); + + return (error); +} + +/* + * Turns off quotas, assumes that ump->um_qflags are already checked + * and QTF_CLOSING is set to indicate operation in progress. Fixes + * ump->um_qflags and mp->mnt_flag after. + */ +int +quotaoff_inchange(struct thread *td, struct mount *mp, int type) +{ + struct ufsmount *ump; + int i; + int error; + + error = quotaoff1(td, mp, type); + + ump = VFSTOUFS(mp); + UFS_LOCK(ump); + ump->um_qflags[type] &= ~QTF_CLOSING; + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) { + MNT_ILOCK(mp); + mp->mnt_flag &= ~MNT_QUOTA; + MNT_IUNLOCK(mp); + } + UFS_UNLOCK(ump); + return (error); +} + +/* + * Q_QUOTAOFF - turn off disk quotas for a filesystem. 
+ */ +int +quotaoff(struct thread *td, struct mount *mp, int type) +{ + struct ufsmount *ump; + int error; + + error = priv_check(td, PRIV_UFS_QUOTAOFF); + if (error) + return (error); + + ump = VFSTOUFS(mp); + UFS_LOCK(ump); + if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { + UFS_UNLOCK(ump); + return (EALREADY); + } + ump->um_qflags[type] |= QTF_CLOSING; + UFS_UNLOCK(ump); + + return (quotaoff_inchange(td, mp, type)); +} + +/* + * Q_GETQUOTA - return current values in a dqblk structure. + */ +static int +_getquota(struct thread *td, struct mount *mp, u_long id, int type, + struct dqblk64 *dqb) +{ + struct dquot *dq; + int error; + + switch (type) { + case USRQUOTA: + if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) { + error = priv_check(td, PRIV_VFS_GETQUOTA); + if (error) + return (error); + } + break; + + case GRPQUOTA: + if (!groupmember(id, td->td_ucred) && + !unprivileged_get_quota) { + error = priv_check(td, PRIV_VFS_GETQUOTA); + if (error) + return (error); + } + break; + + default: + return (EINVAL); + } + + dq = NODQUOT; + error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq); + if (error) + return (error); + *dqb = dq->dq_dqb; + dqrele(NULLVP, dq); + return (error); +} + +/* + * Q_SETQUOTA - assign an entire dqblk structure. + */ +static int +_setquota(struct thread *td, struct mount *mp, u_long id, int type, + struct dqblk64 *dqb) +{ + struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump; + struct dqblk64 newlim; + int error; + + error = priv_check(td, PRIV_VFS_SETQUOTA); + if (error) + return (error); + + newlim = *dqb; + + ndq = NODQUOT; + ump = VFSTOUFS(mp); + + error = dqget(NULLVP, id, ump, type, &ndq); + if (error) + return (error); + dq = ndq; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "setqta"); + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + newlim.dqb_curblocks = dq->dq_curblocks; + newlim.dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + newlim.dqb_btime = dq->dq_btime; + newlim.dqb_itime = dq->dq_itime; + } + if (newlim.dqb_bsoftlimit && + dq->dq_curblocks >= newlim.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + newlim.dqb_btime = time_second + ump->um_btime[type]; + if (newlim.dqb_isoftlimit && + dq->dq_curinodes >= newlim.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + newlim.dqb_itime = time_second + ump->um_itime[type]; + dq->dq_dqb = newlim; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. 
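+ *
+ * Only the dqb_curblocks and dqb_curinodes fields supplied by the
+ * caller are consumed; the existing limits are left untouched (the
+ * grace times may be restarted, as noted in the code below).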
+ */ +static int +_setuse(struct thread *td, struct mount *mp, u_long id, int type, + struct dqblk64 *dqb) +{ + struct dquot *dq; + struct ufsmount *ump; + struct dquot *ndq; + struct dqblk64 usage; + int error; + + error = priv_check(td, PRIV_UFS_SETUSE); + if (error) + return (error); + + usage = *dqb; + + ump = VFSTOUFS(mp); + ndq = NODQUOT; + + error = dqget(NULLVP, id, ump, type, &ndq); + if (error) + return (error); + dq = ndq; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "setuse"); + /* + * Reset time limit if have a soft limit and were + * previously under it, but are now over it. + */ + if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && + usage.dqb_curblocks >= dq->dq_bsoftlimit) + dq->dq_btime = time_second + ump->um_btime[type]; + if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && + usage.dqb_curinodes >= dq->dq_isoftlimit) + dq->dq_itime = time_second + ump->um_itime[type]; + dq->dq_curblocks = usage.dqb_curblocks; + dq->dq_curinodes = usage.dqb_curinodes; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + dqrele(NULLVP, dq); + return (0); +} + +int +getquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk32 dqb32; + struct dqblk64 dqb64; + int error; + + error = _getquota(td, mp, id, type, &dqb64); + if (error) + return (error); + dqb64_dqb32(&dqb64, &dqb32); + error = copyout(&dqb32, addr, sizeof(dqb32)); + return (error); +} + +int +setquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk32 dqb32; + struct dqblk64 dqb64; + int error; + + error = copyin(addr, &dqb32, sizeof(dqb32)); + if (error) + return (error); + dqb32_dqb64(&dqb32, &dqb64); + error = _setquota(td, mp, id, type, &dqb64); + return (error); +} + +int +setuse32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk32 dqb32; + struct dqblk64 dqb64; + int error; + + error = copyin(addr, &dqb32, sizeof(dqb32)); + if (error) + return (error); + dqb32_dqb64(&dqb32, &dqb64); + error = _setuse(td, mp, id, type, &dqb64); + return (error); +} + +int +getquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk64 dqb64; + int error; + + error = _getquota(td, mp, id, type, &dqb64); + if (error) + return (error); + error = copyout(&dqb64, addr, sizeof(dqb64)); + return (error); +} + +int +setquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk64 dqb64; + int error; + + error = copyin(addr, &dqb64, sizeof(dqb64)); + if (error) + return (error); + error = _setquota(td, mp, id, type, &dqb64); + return (error); +} + +int +setuse(struct thread *td, struct mount *mp, u_long id, int type, void *addr) +{ + struct dqblk64 dqb64; + int error; + + error = copyin(addr, &dqb64, sizeof(dqb64)); + if (error) + return (error); + error = _setuse(td, mp, id, type, &dqb64); + return (error); +} + +/* + * Q_GETQUOTASIZE - get bit-size of quota file fields + */ +int +getquotasize(struct thread *td, struct mount *mp, u_long id, int type, + void *sizep) +{ + struct ufsmount *ump = VFSTOUFS(mp); + int bitsize; + + UFS_LOCK(ump); + if (ump->um_quotas[type] == NULLVP || + (ump->um_qflags[type] & QTF_CLOSING)) { + UFS_UNLOCK(ump); + return (EINVAL); + } + if ((ump->um_qflags[type] & QTF_64BIT) != 0) + bitsize = 64; + else + bitsize = 32; + UFS_UNLOCK(ump); + return (copyout(&bitsize, sizep, 
sizeof(int))); +} + +/* + * Q_SYNC - sync quota files to disk. + */ +int +qsync(struct mount *mp) +{ + struct ufsmount *ump = VFSTOUFS(mp); + struct thread *td = curthread; /* XXX */ + struct vnode *vp, *mvp; + struct dquot *dq; + int i, error; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + /* + * Search vnodes associated with this mount point, + * synchronizing any modified dquot structures. + */ +again: + MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) { + if (vp->v_type == VNON) { + VI_UNLOCK(vp); + continue; + } + error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); + if (error) { + if (error == ENOENT) { + MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp); + goto again; + } + continue; + } + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq != NODQUOT) + dqsync(vp, dq); + } + vput(vp); + } + return (0); +} + +/* + * Sync quota file for given vnode to disk. + */ +int +qsyncvp(struct vnode *vp) +{ + struct ufsmount *ump = VFSTOUFS(vp->v_mount); + struct dquot *dq; + int i; + + /* + * Check if the mount point has any quotas. + * If not, simply return. + */ + for (i = 0; i < MAXQUOTAS; i++) + if (ump->um_quotas[i] != NULLVP) + break; + if (i == MAXQUOTAS) + return (0); + /* + * Search quotas associated with this vnode + * synchronizing any modified dquot structures. + */ + for (i = 0; i < MAXQUOTAS; i++) { + dq = VTOI(vp)->i_dquot[i]; + if (dq != NODQUOT) + dqsync(vp, dq); + } + return (0); +} + +/* + * Code pertaining to management of the in-core dquot data structures. + */ +#define DQHASH(dqvp, id) \ + (&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash]) +static LIST_HEAD(dqhash, dquot) *dqhashtbl; +static u_long dqhash; + +/* + * Dquot free list. + */ +#define DQUOTINC 5 /* minimum free dquots desired */ +static TAILQ_HEAD(dqfreelist, dquot) dqfreelist; +static long numdquot, desireddquot = DQUOTINC; + +/* + * Lock to protect quota hash, dq free list and dq_cnt ref counters of + * _all_ dqs. + */ +struct mtx dqhlock; + +#define DQH_LOCK() mtx_lock(&dqhlock) +#define DQH_UNLOCK() mtx_unlock(&dqhlock) + +static struct dquot *dqhashfind(struct dqhash *dqh, u_long id, + struct vnode *dqvp); + +/* + * Initialize the quota system. + */ +void +dqinit(void) +{ + + mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF); + dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); + TAILQ_INIT(&dqfreelist); +} + +/* + * Shut down the quota system. + */ +void +dquninit(void) +{ + struct dquot *dq; + + hashdestroy(dqhashtbl, M_DQUOT, dqhash); + while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) { + TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); + mtx_destroy(&dq->dq_lock); + free(dq, M_DQUOT); + } + mtx_destroy(&dqhlock); +} + +static struct dquot * +dqhashfind(struct dqhash *dqh, u_long id, struct vnode *dqvp) +{ + struct dquot *dq; + + mtx_assert(&dqhlock, MA_OWNED); + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Cache hit with no references. Take + * the structure off the free list. + */ + if (dq->dq_cnt == 0) + TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); + DQREF(dq); + return (dq); + } + return (NODQUOT); +} + +/* + * Determine the quota file type. + * + * A 32-bit quota file is simply an array of struct dqblk32. + * + * A 64-bit quota file is a struct dqhdr64 followed by an array of struct + * dqblk64. 
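+ * As an illustration of the layout (this mirrors the offset
+ * arithmetic used by dqget() and dqsync() below), the record for
+ * id N in a 64-bit quota file starts at byte offset
+ *
+ *      sizeof(struct dqhdr64) + N * sizeof(struct dqblk64)
+ *
+ * while a 32-bit quota file has no header, so the record for id N
+ * simply starts at N * sizeof(struct dqblk32).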
The header contains various magic bits which allow us to be + * reasonably confident that it is indeeda 64-bit quota file and not just + * a 32-bit quota file that just happens to "look right". + * + */ +static int +dqopen(struct vnode *vp, struct ufsmount *ump, int type) +{ + struct dqhdr64 dqh; + struct iovec aiov; + struct uio auio; + int error; + + ASSERT_VOP_LOCKED(vp, "dqopen"); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = &dqh; + aiov.iov_len = sizeof(dqh); + auio.uio_resid = sizeof(dqh); + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = (struct thread *)0; + error = VOP_READ(vp, &auio, 0, ump->um_cred[type]); + + if (error != 0) + return (error); + if (auio.uio_resid > 0) { + /* assume 32 bits */ + return (0); + } + + UFS_LOCK(ump); + if (strcmp(dqh.dqh_magic, Q_DQHDR64_MAGIC) == 0 && + be32toh(dqh.dqh_version) == Q_DQHDR64_VERSION && + be32toh(dqh.dqh_hdrlen) == (uint32_t)sizeof(struct dqhdr64) && + be32toh(dqh.dqh_reclen) == (uint32_t)sizeof(struct dqblk64)) { + /* XXX: what if the magic matches, but the sizes are wrong? */ + ump->um_qflags[type] |= QTF_64BIT; + } else { + ump->um_qflags[type] &= ~QTF_64BIT; + } + UFS_UNLOCK(ump); + + return (0); +} + +/* + * Obtain a dquot structure for the specified identifier and quota file + * reading the information from the file if necessary. + */ +static int +dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, + struct dquot **dqp) +{ + uint8_t buf[sizeof(struct dqblk64)]; + off_t base, recsize; + struct dquot *dq, *dq1; + struct dqhash *dqh; + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int dqvplocked, error; + +#ifdef DEBUG_VFS_LOCKS + if (vp != NULLVP) + ASSERT_VOP_ELOCKED(vp, "dqget"); +#endif + + if (vp != NULLVP && *dqp != NODQUOT) { + return (0); + } + + /* XXX: Disallow negative id values to prevent the + * creation of 100GB+ quota data files. + */ + if ((int)id < 0) + return (EINVAL); + + UFS_LOCK(ump); + dqvp = ump->um_quotas[type]; + if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { + *dqp = NODQUOT; + UFS_UNLOCK(ump); + return (EINVAL); + } + vref(dqvp); + UFS_UNLOCK(ump); + error = 0; + dqvplocked = 0; + + /* + * Check the cache first. + */ + dqh = DQHASH(dqvp, id); + DQH_LOCK(); + dq = dqhashfind(dqh, id, dqvp); + if (dq != NULL) { + DQH_UNLOCK(); +hfound: DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "dqget"); + DQI_UNLOCK(dq); + if (dq->dq_ump == NULL) { + dqrele(vp, dq); + dq = NODQUOT; + error = EIO; + } + *dqp = dq; + if (dqvplocked) + vput(dqvp); + else + vrele(dqvp); + return (error); + } + + /* + * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there + * since new dq will appear on the hash chain DQ_LOCKed. + */ + if (vp != dqvp) { + DQH_UNLOCK(); + vn_lock(dqvp, LK_SHARED | LK_RETRY); + dqvplocked = 1; + DQH_LOCK(); + /* + * Recheck the cache after sleep for quota vnode lock. + */ + dq = dqhashfind(dqh, id, dqvp); + if (dq != NULL) { + DQH_UNLOCK(); + goto hfound; + } + } + + /* + * Not in cache, allocate a new one or take it from the + * free list. + */ + if (TAILQ_FIRST(&dqfreelist) == NODQUOT && + numdquot < MAXQUOTAS * desiredvnodes) + desireddquot += DQUOTINC; + if (numdquot < desireddquot) { + numdquot++; + DQH_UNLOCK(); + dq1 = malloc(sizeof *dq1, M_DQUOT, M_WAITOK | M_ZERO); + mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF); + DQH_LOCK(); + /* + * Recheck the cache after sleep for memory. 
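+ * Another thread may have entered the same (id, dqvp) dquot into the
+ * hash while we slept in malloc(); if so, the freshly allocated
+ * structure is released again and the cached one is used instead.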
+ */ + dq = dqhashfind(dqh, id, dqvp); + if (dq != NULL) { + numdquot--; + DQH_UNLOCK(); + mtx_destroy(&dq1->dq_lock); + free(dq1, M_DQUOT); + goto hfound; + } + dq = dq1; + } else { + if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { + DQH_UNLOCK(); + tablefull("dquot"); + *dqp = NODQUOT; + if (dqvplocked) + vput(dqvp); + else + vrele(dqvp); + return (EUSERS); + } + if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) + panic("dqget: free dquot isn't %p", dq); + TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); + if (dq->dq_ump != NULL) + LIST_REMOVE(dq, dq_hash); + } + + /* + * Dq is put into hash already locked to prevent parallel + * usage while it is being read from file. + */ + dq->dq_flags = DQ_LOCK; + dq->dq_id = id; + dq->dq_type = type; + dq->dq_ump = ump; + LIST_INSERT_HEAD(dqh, dq, dq_hash); + DQREF(dq); + DQH_UNLOCK(); + + /* + * Read the requested quota record from the quota file, performing + * any necessary conversions. + */ + if (ump->um_qflags[type] & QTF_64BIT) { + recsize = sizeof(struct dqblk64); + base = sizeof(struct dqhdr64); + } else { + recsize = sizeof(struct dqblk32); + base = 0; + } + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = buf; + aiov.iov_len = recsize; + auio.uio_resid = recsize; + auio.uio_offset = base + id * recsize; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = (struct thread *)0; + + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == recsize && error == 0) { + bzero(&dq->dq_dqb, sizeof(dq->dq_dqb)); + } else { + if (ump->um_qflags[type] & QTF_64BIT) + dqb64_dq((struct dqblk64 *)buf, dq); + else + dqb32_dq((struct dqblk32 *)buf, dq); + } + if (dqvplocked) + vput(dqvp); + else + vrele(dqvp); + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + DQH_LOCK(); + dq->dq_ump = NULL; + LIST_REMOVE(dq, dq_hash); + DQH_UNLOCK(); + DQI_LOCK(dq); + if (dq->dq_flags & DQ_WANT) + wakeup(dq); + dq->dq_flags = 0; + DQI_UNLOCK(dq); + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + DQI_LOCK(dq); + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) { + dq->dq_btime = time_second + ump->um_btime[type]; + if (dq->dq_bsoftlimit && + dq->dq_curblocks >= dq->dq_bsoftlimit) + dq->dq_flags |= DQ_MOD; + } + if (dq->dq_itime == 0) { + dq->dq_itime = time_second + ump->um_itime[type]; + if (dq->dq_isoftlimit && + dq->dq_curinodes >= dq->dq_isoftlimit) + dq->dq_flags |= DQ_MOD; + } + } + DQI_WAKEUP(dq); + DQI_UNLOCK(dq); + *dqp = dq; + return (0); +} + +#ifdef DIAGNOSTIC +/* + * Obtain a reference to a dquot. + */ +static void +dqref(struct dquot *dq) +{ + + dq->dq_cnt++; +} +#endif + +/* + * Release a reference to a dquot. + */ +void +dqrele(struct vnode *vp, struct dquot *dq) +{ + + if (dq == NODQUOT) + return; + DQH_LOCK(); + KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 1", dq)); + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + DQH_UNLOCK(); + return; + } + DQH_UNLOCK(); +sync: + (void) dqsync(vp, dq); + + DQH_LOCK(); + KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 2", dq)); + if (--dq->dq_cnt > 0) + { + DQH_UNLOCK(); + return; + } + + /* + * The dq may become dirty after it is synced but before it is + * put to the free list. 
Checking the DQ_MOD there without + * locking dq should be safe since no other references to the + * dq exist. + */ + if ((dq->dq_flags & DQ_MOD) != 0) { + dq->dq_cnt++; + DQH_UNLOCK(); + goto sync; + } + TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); + DQH_UNLOCK(); +} + +/* + * Update the disk quota in the quota file. + */ +static int +dqsync(struct vnode *vp, struct dquot *dq) +{ + uint8_t buf[sizeof(struct dqblk64)]; + off_t base, recsize; + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + struct mount *mp; + struct ufsmount *ump; + +#ifdef DEBUG_VFS_LOCKS + if (vp != NULL) + ASSERT_VOP_ELOCKED(vp, "dqsync"); +#endif + + mp = NULL; + error = 0; + if (dq == NODQUOT) + panic("dqsync: dquot"); + if ((ump = dq->dq_ump) == NULL) + return (0); + UFS_LOCK(ump); + if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP) { + if (vp == NULL) { + UFS_UNLOCK(ump); + return (0); + } else + panic("dqsync: file"); + } + vref(dqvp); + UFS_UNLOCK(ump); + + DQI_LOCK(dq); + if ((dq->dq_flags & DQ_MOD) == 0) { + DQI_UNLOCK(dq); + vrele(dqvp); + return (0); + } + DQI_UNLOCK(dq); + + (void) vn_start_secondary_write(dqvp, &mp, V_WAIT); + if (vp != dqvp) + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); + + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+2, "dqsync"); + if ((dq->dq_flags & DQ_MOD) == 0) + goto out; + dq->dq_flags |= DQ_LOCK; + DQI_UNLOCK(dq); + + /* + * Write the quota record to the quota file, performing any + * necessary conversions. See dqget() for additional details. + */ + if (ump->um_qflags[dq->dq_type] & QTF_64BIT) { + dq_dqb64(dq, (struct dqblk64 *)buf); + recsize = sizeof(struct dqblk64); + base = sizeof(struct dqhdr64); + } else { + dq_dqb32(dq, (struct dqblk32 *)buf); + recsize = sizeof(struct dqblk32); + base = 0; + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = buf; + aiov.iov_len = recsize; + auio.uio_resid = recsize; + auio.uio_offset = base + dq->dq_id * recsize; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = (struct thread *)0; + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + + DQI_LOCK(dq); + DQI_WAKEUP(dq); + dq->dq_flags &= ~DQ_MOD; +out: + DQI_UNLOCK(dq); + if (vp != dqvp) + vput(dqvp); + else + vrele(dqvp); + vn_finished_secondary_write(mp); + return (error); +} + +/* + * Flush all entries from the cache for a particular vnode. + */ +static int +dqflush(struct vnode *vp) +{ + struct dquot *dq, *nextdq; + struct dqhash *dqh; + int error; + + /* + * Move all dquot's that used to refer to this quota + * file off their hash chains (they will eventually + * fall off the head of the free list and be re-used). + */ + error = 0; + DQH_LOCK(); + for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { + for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { + nextdq = LIST_NEXT(dq, dq_hash); + if (dq->dq_ump->um_quotas[dq->dq_type] != vp) + continue; + if (dq->dq_cnt) + error = EBUSY; + else { + LIST_REMOVE(dq, dq_hash); + dq->dq_ump = NULL; + } + } + } + DQH_UNLOCK(); + return (error); +} + +/* + * The following three functions are provided for the adjustment of + * quotas by the soft updates code. + */ +#ifdef SOFTUPDATES +/* + * Acquire a reference to the quota structures associated with a vnode. + * Return count of number of quota structures found. 
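+ *
+ * An illustrative usage sketch (not lifted from any particular
+ * caller; "blocks_freed" is a made-up variable):
+ *
+ *      struct dquot *qr[MAXQUOTAS];
+ *
+ *      if (quotaref(vp, qr) > 0)
+ *              quotaadj(qr, VFSTOUFS(vp->v_mount), -blocks_freed);
+ *      quotarele(qr);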
+ */ +int +quotaref(vp, qrp) + struct vnode *vp; + struct dquot **qrp; +{ + struct inode *ip; + struct dquot *dq; + int i, found; + + for (i = 0; i < MAXQUOTAS; i++) + qrp[i] = NODQUOT; + /* + * Disk quotas must be turned off for system files. Currently + * snapshot and quota files. + */ + if ((vp->v_vflag & VV_SYSTEM) != 0) + return (0); + /* + * Iterate through and copy active quotas. + */ + found = 0; + ip = VTOI(vp); + mtx_lock(&dqhlock); + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = ip->i_dquot[i]) == NODQUOT) + continue; + DQREF(dq); + qrp[i] = dq; + found++; + } + mtx_unlock(&dqhlock); + return (found); +} + +/* + * Release a set of quota structures obtained from a vnode. + */ +void +quotarele(qrp) + struct dquot **qrp; +{ + struct dquot *dq; + int i; + + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = qrp[i]) == NODQUOT) + continue; + dqrele(NULL, dq); + } +} + +/* + * Adjust the number of blocks associated with a quota. + * Positive numbers when adding blocks; negative numbers when freeing blocks. + */ +void +quotaadj(qrp, ump, blkcount) + struct dquot **qrp; + struct ufsmount *ump; + int64_t blkcount; +{ + struct dquot *dq; + ufs2_daddr_t ncurblocks; + int i; + + if (blkcount == 0) + return; + for (i = 0; i < MAXQUOTAS; i++) { + if ((dq = qrp[i]) == NODQUOT) + continue; + DQI_LOCK(dq); + DQI_WAIT(dq, PINOD+1, "adjqta"); + ncurblocks = dq->dq_curblocks + blkcount; + if (ncurblocks >= 0) + dq->dq_curblocks = ncurblocks; + else + dq->dq_curblocks = 0; + if (blkcount < 0) + dq->dq_flags &= ~DQ_BLKS; + else if (dq->dq_curblocks + blkcount >= dq->dq_bsoftlimit && + dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_btime = time_second + ump->um_btime[i]; + dq->dq_flags |= DQ_MOD; + DQI_UNLOCK(dq); + } +} +#endif /* SOFTUPDATES */ + +/* + * 32-bit / 64-bit conversion functions. + * + * 32-bit quota records are stored in native byte order. Attention must + * be paid to overflow issues. + * + * 64-bit quota records are stored in network byte order. + */ + +#define CLIP32(u64) (u64 > UINT32_MAX ? UINT32_MAX : (uint32_t)u64) + +/* + * Convert 32-bit host-order structure to dquot. + */ +static void +dqb32_dq(const struct dqblk32 *dqb32, struct dquot *dq) +{ + + dq->dq_bhardlimit = dqb32->dqb_bhardlimit; + dq->dq_bsoftlimit = dqb32->dqb_bsoftlimit; + dq->dq_curblocks = dqb32->dqb_curblocks; + dq->dq_ihardlimit = dqb32->dqb_ihardlimit; + dq->dq_isoftlimit = dqb32->dqb_isoftlimit; + dq->dq_curinodes = dqb32->dqb_curinodes; + dq->dq_btime = dqb32->dqb_btime; + dq->dq_itime = dqb32->dqb_itime; +} + +/* + * Convert 64-bit network-order structure to dquot. + */ +static void +dqb64_dq(const struct dqblk64 *dqb64, struct dquot *dq) +{ + + dq->dq_bhardlimit = be64toh(dqb64->dqb_bhardlimit); + dq->dq_bsoftlimit = be64toh(dqb64->dqb_bsoftlimit); + dq->dq_curblocks = be64toh(dqb64->dqb_curblocks); + dq->dq_ihardlimit = be64toh(dqb64->dqb_ihardlimit); + dq->dq_isoftlimit = be64toh(dqb64->dqb_isoftlimit); + dq->dq_curinodes = be64toh(dqb64->dqb_curinodes); + dq->dq_btime = be64toh(dqb64->dqb_btime); + dq->dq_itime = be64toh(dqb64->dqb_itime); +} + +/* + * Convert dquot to 32-bit host-order structure. 
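+ * Values too large for 32 bits are clamped to UINT32_MAX by CLIP32()
+ * rather than being truncated.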
+ */ +static void +dq_dqb32(const struct dquot *dq, struct dqblk32 *dqb32) +{ + + dqb32->dqb_bhardlimit = CLIP32(dq->dq_bhardlimit); + dqb32->dqb_bsoftlimit = CLIP32(dq->dq_bsoftlimit); + dqb32->dqb_curblocks = CLIP32(dq->dq_curblocks); + dqb32->dqb_ihardlimit = CLIP32(dq->dq_ihardlimit); + dqb32->dqb_isoftlimit = CLIP32(dq->dq_isoftlimit); + dqb32->dqb_curinodes = CLIP32(dq->dq_curinodes); + dqb32->dqb_btime = CLIP32(dq->dq_btime); + dqb32->dqb_itime = CLIP32(dq->dq_itime); +} + +/* + * Convert dquot to 64-bit network-order structure. + */ +static void +dq_dqb64(const struct dquot *dq, struct dqblk64 *dqb64) +{ + + dqb64->dqb_bhardlimit = htobe64(dq->dq_bhardlimit); + dqb64->dqb_bsoftlimit = htobe64(dq->dq_bsoftlimit); + dqb64->dqb_curblocks = htobe64(dq->dq_curblocks); + dqb64->dqb_ihardlimit = htobe64(dq->dq_ihardlimit); + dqb64->dqb_isoftlimit = htobe64(dq->dq_isoftlimit); + dqb64->dqb_curinodes = htobe64(dq->dq_curinodes); + dqb64->dqb_btime = htobe64(dq->dq_btime); + dqb64->dqb_itime = htobe64(dq->dq_itime); +} + +/* + * Convert 64-bit host-order structure to 32-bit host-order structure. + */ +static void +dqb64_dqb32(const struct dqblk64 *dqb64, struct dqblk32 *dqb32) +{ + + dqb32->dqb_bhardlimit = CLIP32(dqb64->dqb_bhardlimit); + dqb32->dqb_bsoftlimit = CLIP32(dqb64->dqb_bsoftlimit); + dqb32->dqb_curblocks = CLIP32(dqb64->dqb_curblocks); + dqb32->dqb_ihardlimit = CLIP32(dqb64->dqb_ihardlimit); + dqb32->dqb_isoftlimit = CLIP32(dqb64->dqb_isoftlimit); + dqb32->dqb_curinodes = CLIP32(dqb64->dqb_curinodes); + dqb32->dqb_btime = CLIP32(dqb64->dqb_btime); + dqb32->dqb_itime = CLIP32(dqb64->dqb_itime); +} + +/* + * Convert 32-bit host-order structure to 64-bit host-order structure. + */ +static void +dqb32_dqb64(const struct dqblk32 *dqb32, struct dqblk64 *dqb64) +{ + + dqb64->dqb_bhardlimit = dqb32->dqb_bhardlimit; + dqb64->dqb_bsoftlimit = dqb32->dqb_bsoftlimit; + dqb64->dqb_curblocks = dqb32->dqb_curblocks; + dqb64->dqb_ihardlimit = dqb32->dqb_ihardlimit; + dqb64->dqb_isoftlimit = dqb32->dqb_isoftlimit; + dqb64->dqb_curinodes = dqb32->dqb_curinodes; + dqb64->dqb_btime = dqb32->dqb_btime; + dqb64->dqb_itime = dqb32->dqb_itime; +} diff --git a/Dump/ufs/ufs/ufs_vfsops.c b/Dump/ufs/ufs/ufs_vfsops.c new file mode 100644 index 0000000..461cd73 --- /dev/null +++ b/Dump/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,244 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_vfsops.c 331722 2018-03-29 02:50:57Z eadler $"); + +#include "opt_quota.h" +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#include +#endif + +MALLOC_DEFINE(M_UFSMNT, "ufs_mount", "UFS mount structure"); + +/* + * Return the root of a filesystem. + */ +int +ufs_root(mp, flags, vpp) + struct mount *mp; + int flags; + struct vnode **vpp; +{ + struct vnode *nvp; + int error; + + error = VFS_VGET(mp, (ino_t)ROOTINO, flags, &nvp); + if (error) + return (error); + *vpp = nvp; + return (0); +} + +/* + * Do operations associated with quotas + */ +int +ufs_quotactl(mp, cmds, id, arg) + struct mount *mp; + int cmds; + uid_t id; + void *arg; +{ +#ifndef QUOTA + if ((cmds >> SUBCMDSHIFT) == Q_QUOTAON) + vfs_unbusy(mp); + + return (EOPNOTSUPP); +#else + struct thread *td; + int cmd, type, error; + + td = curthread; + cmd = cmds >> SUBCMDSHIFT; + type = cmds & SUBCMDMASK; + if (id == -1) { + switch (type) { + + case USRQUOTA: + id = td->td_ucred->cr_ruid; + break; + + case GRPQUOTA: + id = td->td_ucred->cr_rgid; + break; + + default: + if (cmd == Q_QUOTAON) + vfs_unbusy(mp); + return (EINVAL); + } + } + if ((u_int)type >= MAXQUOTAS) { + if (cmd == Q_QUOTAON) + vfs_unbusy(mp); + return (EINVAL); + } + + switch (cmd) { + case Q_QUOTAON: + error = quotaon(td, mp, type, arg); + break; + + case Q_QUOTAOFF: + error = quotaoff(td, mp, type); + break; + + case Q_SETQUOTA32: + error = setquota32(td, mp, id, type, arg); + break; + + case Q_SETUSE32: + error = setuse32(td, mp, id, type, arg); + break; + + case Q_GETQUOTA32: + error = getquota32(td, mp, id, type, arg); + break; + + case Q_SETQUOTA: + error = setquota(td, mp, id, type, arg); + break; + + case Q_SETUSE: + error = setuse(td, mp, id, type, arg); + break; + + case Q_GETQUOTA: + error = getquota(td, mp, id, type, arg); + break; + + case Q_GETQUOTASIZE: + error = getquotasize(td, mp, id, type, arg); + break; + + case Q_SYNC: + error = qsync(mp); + break; + + default: + error = EINVAL; + break; + } + return (error); +#endif +} + +/* + * Initial UFS filesystems, done only once. + */ +int +ufs_init(vfsp) + struct vfsconf *vfsp; +{ + +#ifdef QUOTA + dqinit(); +#endif +#ifdef UFS_DIRHASH + ufsdirhash_init(); +#endif + return (0); +} + +/* + * Uninitialise UFS filesystems, done before module unload. 
+ */ +int +ufs_uninit(vfsp) + struct vfsconf *vfsp; +{ + +#ifdef QUOTA + dquninit(); +#endif +#ifdef UFS_DIRHASH + ufsdirhash_uninit(); +#endif + return (0); +} + +/* + * This is the generic part of fhtovp called after the underlying + * filesystem has validated the file handle. + * + * Call the VFS_CHECKEXP beforehand to verify access. + */ +int +ufs_fhtovp(mp, ufhp, flags, vpp) + struct mount *mp; + struct ufid *ufhp; + int flags; + struct vnode **vpp; +{ + struct inode *ip; + struct vnode *nvp; + int error; + + error = VFS_VGET(mp, ufhp->ufid_ino, flags, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + ip = VTOI(nvp); + if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || + ip->i_effnlink <= 0) { + vput(nvp); + *vpp = NULLVP; + return (ESTALE); + } + *vpp = nvp; + vnode_create_vobject(*vpp, DIP(ip, i_size), curthread); + return (0); +} diff --git a/Dump/ufs/ufs/ufs_vnops.c b/Dump/ufs/ufs/ufs_vnops.c new file mode 100644 index 0000000..66662a6 --- /dev/null +++ b/Dump/ufs/ufs/ufs_vnops.c @@ -0,0 +1,2805 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 + */ + +#include +__FBSDID("$FreeBSD: releng/11.2/sys/ufs/ufs/ufs_vnops.c 332749 2018-04-19 02:47:21Z pfg $"); + +#include "opt_quota.h" +#include "opt_suiddir.h" +#include "opt_ufs.h" +#include "opt_ffs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include /* XXX */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#ifdef UFS_GJOURNAL +#include +FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS"); +#endif + +#ifdef QUOTA +FEATURE(ufs_quota, "UFS disk quotas support"); +FEATURE(ufs_quota64, "64bit UFS disk quotas support"); +#endif + +#ifdef SUIDDIR +FEATURE(suiddir, + "Give all new files in directory the same ownership as the directory"); +#endif + + +#include + +static vop_accessx_t ufs_accessx; +static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); +static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); +static vop_close_t ufs_close; +static vop_create_t ufs_create; +static vop_getattr_t ufs_getattr; +static vop_ioctl_t ufs_ioctl; +static vop_link_t ufs_link; +static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *); +static vop_markatime_t ufs_markatime; +static vop_mkdir_t ufs_mkdir; +static vop_mknod_t ufs_mknod; +static vop_open_t ufs_open; +static vop_pathconf_t ufs_pathconf; +static vop_print_t ufs_print; +static vop_readlink_t ufs_readlink; +static vop_remove_t ufs_remove; +static vop_rename_t ufs_rename; +static vop_rmdir_t ufs_rmdir; +static vop_setattr_t ufs_setattr; +static vop_strategy_t ufs_strategy; +static vop_symlink_t ufs_symlink; +static vop_whiteout_t ufs_whiteout; +static vop_close_t ufsfifo_close; +static vop_kqfilter_t ufsfifo_kqfilter; + +SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem"); + +/* + * A virgin directory (no blushing please). + */ +static struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." +}; +static struct odirtemplate omastertemplate = { + 0, 12, 1, ".", + 0, DIRBLKSIZ - 12, 2, ".." 
+}; + +static void +ufs_itimes_locked(struct vnode *vp) +{ + struct inode *ip; + struct timespec ts; + + ASSERT_VI_LOCKED(vp, __func__); + + ip = VTOI(vp); + if (UFS_RDONLY(ip)) + goto out; + if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) + return; + + if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) + ip->i_flag |= IN_LAZYMOD; + else if (((vp->v_mount->mnt_kern_flag & + (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) || + (ip->i_flag & (IN_CHANGE | IN_UPDATE))) + ip->i_flag |= IN_MODIFIED; + else if (ip->i_flag & IN_ACCESS) + ip->i_flag |= IN_LAZYACCESS; + vfs_timestamp(&ts); + if (ip->i_flag & IN_ACCESS) { + DIP_SET(ip, i_atime, ts.tv_sec); + DIP_SET(ip, i_atimensec, ts.tv_nsec); + } + if (ip->i_flag & IN_UPDATE) { + DIP_SET(ip, i_mtime, ts.tv_sec); + DIP_SET(ip, i_mtimensec, ts.tv_nsec); + } + if (ip->i_flag & IN_CHANGE) { + DIP_SET(ip, i_ctime, ts.tv_sec); + DIP_SET(ip, i_ctimensec, ts.tv_nsec); + DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1); + } + + out: + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); +} + +void +ufs_itimes(struct vnode *vp) +{ + + VI_LOCK(vp); + ufs_itimes_locked(vp); + VI_UNLOCK(vp); +} + +/* + * Create a regular file + */ +static int +ufs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int error; + + error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create"); + if (error != 0) + return (error); + if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +static int +ufs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + ino_t ino; + int error; + + error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod"); + if (error) + return (error); + ip = VTOI(*vpp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev number. + */ + DIP_SET(ip, i_rdev, vap->va_rdev); + } + /* + * Remove inode, then reload it through VFS_VGET so it is + * checked to see if it is an alias of an existing entry in + * the inode cache. XXX I don't believe this is necessary now. + */ + (*vpp)->v_type = VNON; + ino = ip->i_number; /* Save this before vgone() invalidates ip. */ + vgone(*vpp); + vput(*vpp); + error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); + if (error) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + */ +/* ARGSUSED */ +static int +ufs_open(struct vop_open_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct inode *ip; + + if (vp->v_type == VCHR || vp->v_type == VBLK) + return (EOPNOTSUPP); + + ip = VTOI(vp); + /* + * Files marked append-only must be opened for appending. + */ + if ((ip->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. 
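+ * The times are only pushed here while other references to the vnode
+ * remain (v_usecount > 1); the final close leaves this to the normal
+ * inactivation path.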
+ */ +/* ARGSUSED */ +static int +ufs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int usecount; + + VI_LOCK(vp); + usecount = vp->v_usecount; + if (usecount > 1) + ufs_itimes_locked(vp); + VI_UNLOCK(vp); + return (0); +} + +static int +ufs_accessx(ap) + struct vop_accessx_args /* { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + accmode_t accmode = ap->a_accmode; + int error; +#ifdef QUOTA + int relocked; +#endif +#ifdef UFS_ACL + struct acl *acl; + acl_type_t type; +#endif + + /* + * Disallow write attempts on read-only filesystems; + * unless the file is a socket, fifo, or a block or + * character device resident on the filesystem. + */ + if (accmode & VMODIFY_PERMS) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); +#ifdef QUOTA + /* + * Inode is accounted in the quotas only if struct + * dquot is attached to it. VOP_ACCESS() is called + * from vn_open_cred() and provides a convenient + * point to call getinoquota(). + */ + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { + + /* + * Upgrade vnode lock, since getinoquota() + * requires exclusive lock to modify inode. + */ + relocked = 1; + vhold(vp); + vn_lock(vp, LK_UPGRADE | LK_RETRY); + VI_LOCK(vp); + if (vp->v_iflag & VI_DOOMED) { + vdropl(vp); + error = ENOENT; + goto relock; + } + vdropl(vp); + } else + relocked = 0; + error = getinoquota(ip); +relock: + if (relocked) + vn_lock(vp, LK_DOWNGRADE | LK_RETRY); + if (error != 0) + return (error); +#endif + break; + default: + break; + } + } + + /* + * If immutable bit set, nobody gets to write it. "& ~VADMIN_PERMS" + * permits the owner of the file to remove the IMMUTABLE flag. + */ + if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) && + (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) + return (EPERM); + +#ifdef UFS_ACL + if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) { + if (vp->v_mount->mnt_flag & MNT_NFS4ACLS) + type = ACL_TYPE_NFS4; + else + type = ACL_TYPE_ACCESS; + + acl = acl_alloc(M_WAITOK); + if (type == ACL_TYPE_NFS4) + error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td); + else + error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td); + switch (error) { + case 0: + if (type == ACL_TYPE_NFS4) { + error = vaccess_acl_nfs4(vp->v_type, ip->i_uid, + ip->i_gid, acl, accmode, ap->a_cred, NULL); + } else { + error = vfs_unixify_accmode(&accmode); + if (error == 0) + error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, + ip->i_gid, acl, accmode, ap->a_cred, NULL); + } + break; + default: + if (error != EOPNOTSUPP) + printf( +"ufs_accessx(): Error retrieving ACL on object (%d).\n", + error); + /* + * XXX: Fall back until debugged. Should + * eventually possibly log an error, and return + * EPERM for safety. 
+ */ + error = vfs_unixify_accmode(&accmode); + if (error == 0) + error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, + ip->i_gid, accmode, ap->a_cred, NULL); + } + acl_free(acl); + + return (error); + } +#endif /* !UFS_ACL */ + error = vfs_unixify_accmode(&accmode); + if (error == 0) + error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, + accmode, ap->a_cred, NULL); + return (error); +} + +/* ARGSUSED */ +static int +ufs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + + VI_LOCK(vp); + ufs_itimes_locked(vp); + if (I_IS_UFS1(ip)) { + vap->va_atime.tv_sec = ip->i_din1->di_atime; + vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; + } else { + vap->va_atime.tv_sec = ip->i_din2->di_atime; + vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; + } + VI_UNLOCK(vp); + /* + * Copy from inode table + */ + vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev); + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_effnlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + if (I_IS_UFS1(ip)) { + vap->va_rdev = ip->i_din1->di_rdev; + vap->va_size = ip->i_din1->di_size; + vap->va_mtime.tv_sec = ip->i_din1->di_mtime; + vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; + vap->va_ctime.tv_sec = ip->i_din1->di_ctime; + vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; + vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); + vap->va_filerev = ip->i_din1->di_modrev; + } else { + vap->va_rdev = ip->i_din2->di_rdev; + vap->va_size = ip->i_din2->di_size; + vap->va_mtime.tv_sec = ip->i_din2->di_mtime; + vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; + vap->va_ctime.tv_sec = ip->i_din2->di_ctime; + vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; + vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; + vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; + vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); + vap->va_filerev = ip->i_din2->di_modrev; + } + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; + vap->va_type = IFTOVT(ip->i_mode); + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +static int +ufs_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + } */ *ap; +{ + struct vattr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ucred *cred = ap->a_cred; + struct thread *td = curthread; + int error; + + /* + * Check for unsettable attributes. + */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + if (vap->va_flags != VNOVAL) { + if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | + SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE | + UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | + UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | + UF_SPARSE | UF_SYSTEM)) != 0) + return (EOPNOTSUPP); + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + /* + * Callers may only modify the file flags on objects they + * have VADMIN rights for. 
+ */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) + return (error); + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. + * Privileged jail processes behave like privileged non-jail + * processes if the security.jail.chflags_allowed sysctl is + * is non-zero; otherwise, they behave like unprivileged + * processes. + */ + if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { + if (ip->i_flags & + (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { + error = securelevel_gt(cred, 0); + if (error) + return (error); + } + /* The snapshot flag cannot be toggled. */ + if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT) + return (EPERM); + } else { + if (ip->i_flags & + (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || + ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE)) + return (EPERM); + } + ip->i_flags = vap->va_flags; + DIP_SET(ip, i_flags, vap->va_flags); + ip->i_flag |= IN_CHANGE; + error = UFS_UPDATE(vp, 0); + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (error); + } + /* + * If immutable or append, no one can change any of its attributes + * except the ones already handled (in some cases, file flags + * including the immutability flags themselves for the superuser). + */ + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, + td)) != 0) + return (error); + } + if (vap->va_size != VNOVAL) { + /* + * XXX most of the following special cases should be in + * callers instead of in N filesystems. The VDIR check + * mostly already is. + */ + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + /* + * Truncation should have an effect in these cases. + * Disallow it if the filesystem is read-only or + * the file is being snapshotted. + */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return (EPERM); + break; + default: + /* + * According to POSIX, the result is unspecified + * for file types other than regular files, + * directories and shared memory objects. We + * don't support shared memory objects in the file + * system, and have dubious support for truncating + * symlinks. Just ignore the request in other cases. + */ + return (0); + } + if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL | + ((vap->va_vaflags & VA_SYNC) != 0 ? 
IO_SYNC : 0), + cred)) != 0) + return (error); + } + if (vap->va_atime.tv_sec != VNOVAL || + vap->va_mtime.tv_sec != VNOVAL || + vap->va_birthtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return (EPERM); + error = vn_utimes_perm(vp, vap, cred, td); + if (error != 0) + return (error); + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + if (vap->va_atime.tv_sec != VNOVAL) { + ip->i_flag &= ~IN_ACCESS; + DIP_SET(ip, i_atime, vap->va_atime.tv_sec); + DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec); + } + if (vap->va_mtime.tv_sec != VNOVAL) { + ip->i_flag &= ~IN_UPDATE; + DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec); + DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec); + } + if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) { + ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; + ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; + } + error = UFS_UPDATE(vp, 0); + if (error) + return (error); + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & + (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) + return (EPERM); + error = ufs_chmod(vp, (int)vap->va_mode, cred, td); + } + return (error); +} + +#ifdef UFS_ACL +static int +ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode, + int file_owner_id, struct ucred *cred, struct thread *td) +{ + int error; + struct acl *aclp; + + aclp = acl_alloc(M_WAITOK); + error = ufs_getacl_nfs4_internal(vp, aclp, td); + /* + * We don't have to handle EOPNOTSUPP here, as the filesystem claims + * it supports ACLs. + */ + if (error) + goto out; + + acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id); + error = ufs_setacl_nfs4_internal(vp, aclp, td); + +out: + acl_free(aclp); + return (error); +} +#endif /* UFS_ACL */ + +/* + * Mark this file's access time for update for vfs_mark_atime(). This + * is called from execve() and mmap(). + */ +static int +ufs_markatime(ap) + struct vop_markatime_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + VI_LOCK(vp); + ip->i_flag |= IN_ACCESS; + VI_UNLOCK(vp); + /* + * XXXKIB No UFS_UPDATE(ap->a_vp, 0) there. + */ + return (0); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ufs_chmod(vp, mode, cred, td) + struct vnode *vp; + int mode; + struct ucred *cred; + struct thread *td; +{ + struct inode *ip = VTOI(vp); + int error; + + /* + * To modify the permissions on a file, must possess VADMIN + * for that file. + */ + if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td))) + return (error); + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the + * process is not a member of. Both of these are allowed in + * jail(8). + */ + if (vp->v_type != VDIR && (mode & S_ISTXT)) { + if (priv_check_cred(cred, PRIV_VFS_STICKYFILE, 0)) + return (EFTYPE); + } + if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { + error = priv_check_cred(cred, PRIV_VFS_SETGID, 0); + if (error) + return (error); + } + + /* + * Deny setting setuid if we are not the file owner. 
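+ * (A caller with PRIV_VFS_ADMIN privilege is still permitted to, per
+ * the priv_check_cred() call below.)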
+ */ + if ((mode & ISUID) && ip->i_uid != cred->cr_uid) { + error = priv_check_cred(cred, PRIV_VFS_ADMIN, 0); + if (error) + return (error); + } + + ip->i_mode &= ~ALLPERMS; + ip->i_mode |= (mode & ALLPERMS); + DIP_SET(ip, i_mode, ip->i_mode); + ip->i_flag |= IN_CHANGE; +#ifdef UFS_ACL + if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0) + error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td); +#endif + if (error == 0 && (ip->i_flag & IN_CHANGE) != 0) + error = UFS_UPDATE(vp, 0); + + return (error); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. + */ +static int +ufs_chown(vp, uid, gid, cred, td) + struct vnode *vp; + uid_t uid; + gid_t gid; + struct ucred *cred; + struct thread *td; +{ + struct inode *ip = VTOI(vp); + uid_t ouid; + gid_t ogid; + int error = 0; +#ifdef QUOTA + int i; + ufs2_daddr_t change; +#endif + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + /* + * To modify the ownership of a file, must possess VADMIN for that + * file. + */ + if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td))) + return (error); + /* + * To change the owner of a file, or change the group of a file to a + * group of which we are not a member, the caller must have + * privilege. + */ + if (((uid != ip->i_uid && uid != cred->cr_uid) || + (gid != ip->i_gid && !groupmember(gid, cred))) && + (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) + return (error); + ogid = ip->i_gid; + ouid = ip->i_uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) != 0) + return (error); + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + change = DIP(ip, i_blocks); + (void) chkdq(ip, -change, cred, CHOWN); + (void) chkiq(ip, -1, cred, CHOWN); + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +#endif + ip->i_gid = gid; + DIP_SET(ip, i_gid, gid); + ip->i_uid = uid; + DIP_SET(ip, i_uid, uid); +#ifdef QUOTA + if ((error = getinoquota(ip)) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { + if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, CHOWN|FORCE); + } + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } + ip->i_gid = ogid; + DIP_SET(ip, i_gid, ogid); + ip->i_uid = ouid; + DIP_SET(ip, i_uid, ouid); + if (getinoquota(ip) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + (void) chkdq(ip, change, cred, FORCE|CHOWN); + (void) chkiq(ip, 1, cred, FORCE|CHOWN); + (void) getinoquota(ip); + } + return (error); +good: + if (getinoquota(ip)) + panic("ufs_chown: lost quota"); +#endif /* QUOTA */ + ip->i_flag |= IN_CHANGE; + if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) { + if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID, 0)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP_SET(ip, i_mode, ip->i_mode); + } + } + error = UFS_UPDATE(vp, 0); + return (error); +} + +static int +ufs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct 
componentname *a_cnp; + } */ *ap; +{ + struct inode *ip; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + int error; + struct thread *td; + + td = curthread; + ip = VTOI(vp); + if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (VTOI(dvp)->i_flags & APPEND)) { + error = EPERM; + goto out; + } +#ifdef UFS_GJOURNAL + ufs_gjournal_orphan(vp); +#endif + error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); + if (ip->i_nlink <= 0) + vp->v_vflag |= VV_NOSYNC; + if ((ip->i_flags & SF_SNAPSHOT) != 0) { + /* + * Avoid deadlock where another thread is trying to + * update the inodeblock for dvp and is waiting on + * snaplk. Temporary unlock the vnode lock for the + * unlinked file and sync the directory. This should + * allow vput() of the directory to not block later on + * while holding the snapshot vnode locked, assuming + * that the directory hasn't been unlinked too. + */ + VOP_UNLOCK(vp, 0); + (void) VOP_FSYNC(dvp, MNT_WAIT, td); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + } +out: + return (error); +} + +static void +print_bad_link_count(const char *funcname, struct vnode *dvp) +{ + struct inode *dip; + + dip = VTOI(dvp); + uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n", + funcname, dip->i_effnlink, (intmax_t)dip->i_number, + dvp->v_mount->mnt_stat.f_mntonname); +} + +/* + * link vnode call + */ +static int +ufs_link(ap) + struct vop_link_args /* { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vnode *tdvp = ap->a_tdvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + struct direct newdir; + int error; + +#ifdef INVARIANTS + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_link: no name"); +#endif + if (VTOI(tdvp)->i_effnlink < 2) { + print_bad_link_count("ufs_link", tdvp); + error = EINVAL; + goto out; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + /* + * The file may have been removed after namei droped the original + * lock. 
+ */ + if (ip->i_effnlink == 0) { + error = ENOENT; + goto out; + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + ip->i_effnlink++; + ip->i_nlink++; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vp)) + softdep_setup_link(VTOI(tdvp), ip); + error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); + if (!error) { + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0); + } + + if (error) { + ip->i_effnlink--; + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vp)) + softdep_revert_link(VTOI(tdvp), ip); + } +out: + return (error); +} + +/* + * whiteout vnode call + */ +static int +ufs_whiteout(ap) + struct vop_whiteout_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; + int a_flags; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct direct newdir; + int error = 0; + + switch (ap->a_flags) { + case LOOKUP: + /* 4.4 format directories support whiteout operations */ + if (dvp->v_mount->mnt_maxsymlinklen > 0) + return (0); + return (EOPNOTSUPP); + + case CREATE: + /* create a new directory whiteout */ +#ifdef INVARIANTS + if ((cnp->cn_flags & SAVENAME) == 0) + panic("ufs_whiteout: missing name"); + if (dvp->v_mount->mnt_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + newdir.d_ino = WINO; + newdir.d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); + newdir.d_type = DT_WHT; + error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0); + break; + + case DELETE: + /* remove an existing directory whiteout */ +#ifdef INVARIANTS + if (dvp->v_mount->mnt_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + cnp->cn_flags &= ~DOWHITEOUT; + error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); + break; + default: + panic("ufs_whiteout: unknown op"); + } + return (error); +} + +static volatile int rename_restarts; +SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD, + __DEVOLATILE(int *, &rename_restarts), 0, + "Times rename had to restart due to lock contention"); + +/* + * Rename system call. + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensure the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." entry in the + * directory. 
+ */ +static int +ufs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + struct vnode *nvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct thread *td = fcnp->cn_thread; + struct inode *fip, *tip, *tdp, *fdp; + struct direct newdir; + off_t endoff; + int doingdirectory, newparent; + int error = 0; + struct mount *mp; + ino_t ino; + +#ifdef INVARIANTS + if ((tcnp->cn_flags & HASBUF) == 0 || + (fcnp->cn_flags & HASBUF) == 0) + panic("ufs_rename: no name"); +#endif + endoff = 0; + mp = tdvp->v_mount; + VOP_UNLOCK(tdvp, 0); + if (tvp && tvp != tdvp) + VOP_UNLOCK(tvp, 0); + /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; + mp = NULL; + goto releout; + } +relock: + /* + * We need to acquire 2 to 4 locks depending on whether tvp is NULL + * and fdvp and tdvp are the same directory. Subsequently we need + * to double-check all paths and in the directory rename case we + * need to verify that we are not creating a directory loop. To + * handle this we acquire all but fdvp using non-blocking + * acquisitions. If we fail to acquire any lock in the path we will + * drop all held locks, acquire the new lock in a blocking fashion, + * and then release it and restart the rename. This acquire/release + * step ensures that we do not spin on a lock waiting for release. + */ + error = vn_lock(fdvp, LK_EXCLUSIVE); + if (error) + goto releout; + if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { + VOP_UNLOCK(fdvp, 0); + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto releout; + VOP_UNLOCK(tdvp, 0); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + /* + * Re-resolve fvp to be certain it still exists and fetch the + * correct vnode. + */ + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + goto releout; + } + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + VOP_UNLOCK(nvp, 0); + vrele(fvp); + fvp = nvp; + atomic_add_int(&rename_restarts, 1); + goto relock; + } + vrele(fvp); + fvp = nvp; + /* + * Re-resolve tvp and acquire the vnode lock if present. + */ + error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino); + if (error != 0 && error != EJUSTRETURN) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + goto releout; + } + /* + * If tvp disappeared we just carry on. + */ + if (error == EJUSTRETURN && tvp != NULL) { + vrele(tvp); + tvp = NULL; + } + /* + * Get the tvp ino if the lookup succeeded. We may have to restart + * if the non-blocking acquire fails. 
+ */ + if (error == 0) { + nvp = NULL; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp); + if (tvp) + vrele(tvp); + tvp = nvp; + if (error) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(tdvp, 0); + VOP_UNLOCK(fvp, 0); + if (error != EBUSY) + goto releout; + error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp); + if (error != 0) + goto releout; + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + } + fdp = VTOI(fdvp); + fip = VTOI(fvp); + tdp = VTOI(tdvp); + tip = NULL; + if (tvp) + tip = VTOI(tvp); + if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto unlockout; + } + /* + * Renaming a file to itself has no effect. The upper layers should + * not call us in that case. However, things could change after + * we drop the locks above. + */ + if (fvp == tvp) { + error = 0; + goto unlockout; + } + doingdirectory = 0; + newparent = 0; + ino = fip->i_number; + if (fip->i_nlink >= LINK_MAX) { + error = EMLINK; + goto unlockout; + } + if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) + || (fdp->i_flags & APPEND)) { + error = EPERM; + goto unlockout; + } + if ((fip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + fdp == fip || + (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } + if (fdp->i_number != tdp->i_number) + newparent = tdp->i_number; + doingdirectory = 1; + } + if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) || + (tvp != NULL && tvp->v_type == VDIR && + tvp->v_mountedhere != NULL)) { + error = EXDEV; + goto unlockout; + } + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". + */ + if (doingdirectory && newparent) { + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); + if (error) + goto unlockout; + error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred, + &ino); + /* + * We encountered a lock that we have to wait for. Unlock + * everything else and VGET before restarting. + */ + if (ino) { + VOP_UNLOCK(fdvp, 0); + VOP_UNLOCK(fvp, 0); + VOP_UNLOCK(tdvp, 0); + if (tvp) + VOP_UNLOCK(tvp, 0); + error = VFS_VGET(mp, ino, LK_SHARED, &nvp); + if (error == 0) + vput(nvp); + atomic_add_int(&rename_restarts, 1); + goto relock; + } + if (error) + goto unlockout; + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost to startdir"); + } + if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 || + tdp->i_effnlink == 0) + panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp); + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + fip->i_effnlink++; + fip->i_nlink++; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_setup_link(tdp, fip); + error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | DOINGASYNC(fvp))); + if (error) + goto bad; + + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. 
+ */ + if (tip == NULL) { + if (ITODEV(tdp) != ITODEV(fip)) + panic("ufs_rename: EXDEV"); + if (doingdirectory && newparent) { + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't adjust the link count. The + * actual link modification is completed when + * .. is rewritten below. + */ + if ((nlink_t)tdp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + } + ufs_makedirentry(fip, tcnp, &newdir); + error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1); + if (error) + goto bad; + /* Setup tdvp for directory compaction if needed. */ + if (tdp->i_count && tdp->i_endoff && + tdp->i_endoff < tdp->i_size) + endoff = tdp->i_endoff; + } else { + if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip)) + panic("ufs_rename: EXDEV"); + /* + * Short circuit rename(foo, foo). + */ + if (tip->i_number == fip->i_number) + panic("ufs_rename: same file"); + /* + * If the parent directory is "sticky", then the caller + * must possess VADMIN for the parent directory, or the + * destination of the rename. This implements append-only + * directories. + */ + if ((tdp->i_mode & S_ISTXT) && + VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && + VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((tip->i_mode & IFMT) == IFDIR) { + if ((tip->i_effnlink > 2) || + !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + if (doingdirectory) { + if (!newparent) { + tdp->i_effnlink--; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + tip->i_effnlink--; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(tip); + } + error = ufs_dirrewrite(tdp, tip, fip->i_number, + IFTODT(fip->i_mode), + (doingdirectory && newparent) ? newparent : doingdirectory); + if (error) { + if (doingdirectory) { + if (!newparent) { + tdp->i_effnlink++; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(tdp); + } + tip->i_effnlink++; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(tip); + } + } + if (doingdirectory && !DOINGSOFTDEP(tvp)) { + /* + * The only stuff left in the directory is "." + * and "..". The "." reference is inconsequential + * since we are quashing it. We have removed the "." + * reference and the reference in the parent directory, + * but there may be other hard links. The soft + * dependency code will arrange to do these operations + * after the parent directory entry has been deleted on + * disk, so when running with that code we avoid doing + * them now. + */ + if (!newparent) { + tdp->i_nlink--; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + } + tip->i_nlink--; + DIP_SET(tip, i_nlink, tip->i_nlink); + tip->i_flag |= IN_CHANGE; + } + } + + /* + * 3) Unlink the source. We have to resolve the path again to + * fixup the directory offset and count for ufs_dirremove. 
+ */ + if (fdvp == tdvp) { + error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino); + if (error) + panic("ufs_rename: from entry went away!"); + if (ino != fip->i_number) + panic("ufs_rename: ino mismatch %ju != %ju\n", + (uintmax_t)ino, (uintmax_t)fip->i_number); + } + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + /* + * If tip exists we simply use its link, otherwise we must + * add a new one. + */ + if (tip == NULL) { + tdp->i_effnlink++; + tdp->i_nlink++; + DIP_SET(tdp, i_nlink, tdp->i_nlink); + tdp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_setup_dotdot_link(tdp, fip); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + /* Don't go to bad here as the new link exists. */ + if (error) + goto unlockout; + } else if (DOINGSUJ(tdvp)) + /* Journal must account for each new link. */ + softdep_setup_dotdot_link(tdp, fip); + fip->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0); + /* + * The kern_renameat() looks up the fvp using the DELETE flag, which + * causes the removal of the name cache entry for fvp. + * As the relookup of the fvp is done in two steps: + * ufs_lookup_ino() and then VFS_VGET(), another thread might do a + * normal lookup of the from name just before the VFS_VGET() call, + * causing the cache entry to be re-instantiated. + * + * The same issue also applies to tvp if it exists as + * otherwise we may have a stale name cache entry for the new + * name that references the old i-node if it has other links + * or open file descriptors. + */ + cache_purge(fvp); + if (tvp) + cache_purge(tvp); + cache_purge_negative(tdvp); + +unlockout: + vput(fdvp); + vput(fvp); + if (tvp) + vput(tvp); + /* + * If compaction or fsync was requested do it now that other locks + * are no longer needed. + */ + if (error == 0 && endoff != 0) { + error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL | IO_SYNC, + tcnp->cn_cred); + if (error != 0) + vn_printf(tdvp, + "ufs_rename: failed to truncate, error %d\n", + error); +#ifdef UFS_DIRHASH + else if (tdp->i_dirhash != NULL) + ufsdirhash_dirtrunc(tdp, endoff); +#endif + /* + * Even if the directory compaction failed, rename was + * succesful. Do not propagate a UFS_TRUNCATE() error + * to the caller. + */ + error = 0; + } + if (error == 0 && tdp->i_flag & IN_NEEDSYNC) + error = VOP_FSYNC(tdvp, MNT_WAIT, td); + vput(tdvp); + return (error); + +bad: + fip->i_effnlink--; + fip->i_nlink--; + DIP_SET(fip, i_nlink, fip->i_nlink); + fip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_revert_link(tdp, fip); + goto unlockout; + +releout: + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp) + vrele(tvp); + + return (error); +} + +#ifdef UFS_ACL +static int +ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp, + mode_t dmode, struct ucred *cred, struct thread *td) +{ + int error; + struct inode *ip = VTOI(tvp); + struct acl *dacl, *acl; + + acl = acl_alloc(M_WAITOK); + dacl = acl_alloc(M_WAITOK); + + /* + * Retrieve default ACL from parent, if any. + */ + error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); + switch (error) { + case 0: + /* + * Retrieved a default ACL, so merge mode and ACL if + * necessary. If the ACL is empty, fall through to + * the "not defined or available" case. 
+ */ + if (acl->acl_cnt != 0) { + dmode = acl_posix1e_newfilemode(dmode, acl); + ip->i_mode = dmode; + DIP_SET(ip, i_mode, dmode); + *dacl = *acl; + ufs_sync_acl_from_inode(ip, acl); + break; + } + /* FALLTHROUGH */ + + case EOPNOTSUPP: + /* + * Just use the mode as-is. + */ + ip->i_mode = dmode; + DIP_SET(ip, i_mode, dmode); + error = 0; + goto out; + + default: + goto out; + } + + /* + * XXX: If we abort now, will Soft Updates notify the extattr + * code that the EAs for the file need to be released? + */ + error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); + if (error == 0) + error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td); + switch (error) { + case 0: + break; + + case EOPNOTSUPP: + /* + * XXX: This should not happen, as EOPNOTSUPP above + * was supposed to free acl. + */ + printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); + /* + panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); + */ + break; + + default: + goto out; + } + +out: + acl_free(acl); + acl_free(dacl); + + return (error); +} + +static int +ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp, + mode_t mode, struct ucred *cred, struct thread *td) +{ + int error; + struct inode *ip = VTOI(tvp); + struct acl *acl; + + acl = acl_alloc(M_WAITOK); + + /* + * Retrieve default ACL for parent, if any. + */ + error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td); + switch (error) { + case 0: + /* + * Retrieved a default ACL, so merge mode and ACL if + * necessary. + */ + if (acl->acl_cnt != 0) { + /* + * Two possible ways for default ACL to not + * be present. First, the EA can be + * undefined, or second, the default ACL can + * be blank. If it's blank, fall through to + * the it's not defined case. + */ + mode = acl_posix1e_newfilemode(mode, acl); + ip->i_mode = mode; + DIP_SET(ip, i_mode, mode); + ufs_sync_acl_from_inode(ip, acl); + break; + } + /* FALLTHROUGH */ + + case EOPNOTSUPP: + /* + * Just use the mode as-is. + */ + ip->i_mode = mode; + DIP_SET(ip, i_mode, mode); + error = 0; + goto out; + + default: + goto out; + } + + /* + * XXX: If we abort now, will Soft Updates notify the extattr + * code that the EAs for the file need to be released? + */ + error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td); + switch (error) { + case 0: + break; + + case EOPNOTSUPP: + /* + * XXX: This should not happen, as EOPNOTSUPP above was + * supposed to free acl. 
+ */ + printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " + "but no VOP_SETACL()\n"); + /* panic("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() " + "but no VOP_SETACL()"); */ + break; + + default: + goto out; + } + +out: + acl_free(acl); + + return (error); +} + +static int +ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp, + mode_t child_mode, struct ucred *cred, struct thread *td) +{ + int error; + struct acl *parent_aclp, *child_aclp; + + parent_aclp = acl_alloc(M_WAITOK); + child_aclp = acl_alloc(M_WAITOK | M_ZERO); + + error = ufs_getacl_nfs4_internal(dvp, parent_aclp, td); + if (error) + goto out; + acl_nfs4_compute_inherited_acl(parent_aclp, child_aclp, + child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR); + error = ufs_setacl_nfs4_internal(tvp, child_aclp, td); + if (error) + goto out; +out: + acl_free(parent_aclp); + acl_free(child_aclp); + + return (error); +} +#endif + +/* + * Mkdir system call + */ +static int +ufs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + struct vnode *tvp; + struct buf *bp; + struct dirtemplate dirtemplate, *dtp; + struct direct newdir; + int error, dmode; + long blkoff; + +#ifdef INVARIANTS + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_mkdir: no name"); +#endif + dp = VTOI(dvp); + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & 0777; + dmode |= IFDIR; + /* + * Must simulate part of ufs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. + */ + if (dp->i_effnlink < 2) { + print_bad_link_count("ufs_mkdir", dvp); + error = EINVAL; + goto out; + } + error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); + if (error) + goto out; + ip = VTOI(tvp); + ip->i_gid = dp->i_gid; + DIP_SET(ip, i_gid, dp->i_gid); +#ifdef SUIDDIR + { +#ifdef QUOTA + struct ucred ucred, *ucp; + gid_t ucred_group; + ucp = cnp->cn_cred; +#endif + /* + * If we are hacking owners here, (only do this where told to) + * and we are not giving it TO root, (would subvert quotas) + * then go ahead and give it to the other user. + * The new directory also inherits the SUID bit. + * If user's UID and dir UID are the same, + * 'give it away' so that the SUID is still forced on. + */ + if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && + (dp->i_mode & ISUID) && dp->i_uid) { + dmode |= ISUID; + ip->i_uid = dp->i_uid; + DIP_SET(ip, i_uid, dp->i_uid); +#ifdef QUOTA + if (dp->i_uid != cnp->cn_cred->cr_uid) { + /* + * Make sure the correct user gets charged + * for the space. + * Make a dummy credential for the victim. + * XXX This seems to never be accessed out of + * our context so a stack variable is ok. 
+ */ + refcount_init(&ucred.cr_ref, 1); + ucred.cr_uid = ip->i_uid; + ucred.cr_ngroups = 1; + ucred.cr_groups = &ucred_group; + ucred.cr_groups[0] = dp->i_gid; + ucp = &ucred; + } +#endif + } else { + ip->i_uid = cnp->cn_cred->cr_uid; + DIP_SET(ip, i_uid, ip->i_uid); + } +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ucp, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(dp, ip); + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + return (error); + } +#endif + } +#else /* !SUIDDIR */ + ip->i_uid = cnp->cn_cred->cr_uid; + DIP_SET(ip, i_uid, ip->i_uid); +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(dp, ip); + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + return (error); + } +#endif +#endif /* !SUIDDIR */ + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = dmode; + DIP_SET(ip, i_mode, dmode); + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_effnlink = 2; + ip->i_nlink = 2; + DIP_SET(ip, i_nlink, 2); + + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_SET(ip, i_flags, ip->i_flags); + } + + /* + * Bump link count in parent directory to reflect work done below. + * Should be done before reference is created so cleanup is + * possible if we crash. + */ + dp->i_effnlink++; + dp->i_nlink++; + DIP_SET(dp, i_nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(dvp)) + softdep_setup_mkdir(dp, ip); + error = UFS_UPDATE(dvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + if (error) + goto bad; +#ifdef MAC + if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { + error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, + dvp, tvp, cnp); + if (error) + goto bad; + } +#endif +#ifdef UFS_ACL + if (dvp->v_mount->mnt_flag & MNT_ACLS) { + error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode, + cnp->cn_cred, cnp->cn_thread); + if (error) + goto bad; + } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { + error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode, + cnp->cn_cred, cnp->cn_thread); + if (error) + goto bad; + } +#endif /* !UFS_ACL */ + + /* + * Initialize directory with "." and ".." from static template. + */ + if (dvp->v_mount->mnt_maxsymlinklen > 0) + dtp = &mastertemplate; + else + dtp = (struct dirtemplate *)&omastertemplate; + dirtemplate = *dtp; + dirtemplate.dot_ino = ip->i_number; + dirtemplate.dotdot_ino = dp->i_number; + vnode_pager_setsize(tvp, DIRBLKSIZ); + if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, + BA_CLRBUF, &bp)) != 0) + goto bad; + ip->i_size = DIRBLKSIZ; + DIP_SET(ip, i_size, DIRBLKSIZ); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); + if (DOINGSOFTDEP(tvp)) { + /* + * Ensure that the entire newly allocated block is a + * valid directory so that future growth within the + * block does not have to ensure that the block is + * written before the inode. + */ + blkoff = DIRBLKSIZ; + while (blkoff < bp->b_bcount) { + ((struct direct *) + (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; + blkoff += DIRBLKSIZ; + } + } + if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | + DOINGASYNC(tvp)))) != 0) { + (void)bwrite(bp); + goto bad; + } + /* + * Directory set up, now install its entry in the parent directory. + * + * If we are not doing soft dependencies, then we must write out the + * buffer containing the new directory body before entering the new + * name in the parent. 
If we are doing soft dependencies, then the + * buffer containing the new directory body will be passed to and + * released in the soft dependency code after the code has attached + * an appropriate ordering dependency to the buffer which ensures that + * the buffer is written before the new name is written in the parent. + */ + if (DOINGASYNC(dvp)) + bdwrite(bp); + else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp)))) + goto bad; + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0); + +bad: + if (error == 0) { + *ap->a_vpp = tvp; + } else { + dp->i_effnlink--; + dp->i_nlink--; + DIP_SET(dp, i_nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + /* + * No need to do an explicit VOP_TRUNCATE here, vrele will + * do this for us because we set the link count to 0. + */ + ip->i_effnlink = 0; + ip->i_nlink = 0; + DIP_SET(ip, i_nlink, 0); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tvp)) + softdep_revert_mkdir(dp, ip); + + vput(tvp); + } +out: + return (error); +} + +/* + * Rmdir system call. + */ +static int +ufs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + int error; + + ip = VTOI(vp); + dp = VTOI(dvp); + + /* + * Do not remove a directory that is in the process of being renamed. + * Verify the directory is empty (and valid). Rmdir ".." will not be + * valid since ".." will contain a reference to the current directory + * and thus be non-empty. Do not allow the removal of mounted on + * directories (this can happen when an NFS exported filesystem + * tries to remove a locally mounted on directory). + */ + error = 0; + if (dp->i_effnlink <= 2) { + if (dp->i_effnlink == 2) + print_bad_link_count("ufs_rmdir", dvp); + error = EINVAL; + goto out; + } + if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_flags & APPEND) + || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + if (vp->v_mountedhere != 0) { + error = EINVAL; + goto out; + } +#ifdef UFS_GJOURNAL + ufs_gjournal_orphan(vp); +#endif + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + dp->i_effnlink--; + ip->i_effnlink--; + if (DOINGSOFTDEP(vp)) + softdep_setup_rmdir(dp, ip); + error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); + if (error) { + dp->i_effnlink++; + ip->i_effnlink++; + if (DOINGSOFTDEP(vp)) + softdep_revert_rmdir(dp, ip); + goto out; + } + cache_purge(dvp); + /* + * The only stuff left in the directory is "." and "..". The "." + * reference is inconsequential since we are quashing it. The soft + * dependency code will arrange to do these operations after + * the parent directory entry has been deleted on disk, so + * when running with that code we avoid doing them now. + */ + if (!DOINGSOFTDEP(vp)) { + dp->i_nlink--; + DIP_SET(dp, i_nlink, dp->i_nlink); + dp->i_flag |= IN_CHANGE; + error = UFS_UPDATE(dvp, 0); + ip->i_nlink--; + DIP_SET(ip, i_nlink, ip->i_nlink); + ip->i_flag |= IN_CHANGE; + } + cache_purge(vp); +#ifdef UFS_DIRHASH + /* Kill any active hash; i_effnlink == 0, so it will not come back. 
*/ + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif +out: + return (error); +} + +/* + * symlink -- make a symbolic link + */ +static int +ufs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + struct vnode *vp, **vpp = ap->a_vpp; + struct inode *ip; + int len, error; + + error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, + vpp, ap->a_cnp, "ufs_symlink"); + if (error) + return (error); + vp = *vpp; + len = strlen(ap->a_target); + if (len < vp->v_mount->mnt_maxsymlinklen) { + ip = VTOI(vp); + bcopy(ap->a_target, SHORTLINK(ip), len); + ip->i_size = len; + DIP_SET(ip, i_size, len); + ip->i_flag |= IN_CHANGE | IN_UPDATE; + error = UFS_UPDATE(vp, 0); + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, + ap->a_cnp->cn_cred, NOCRED, NULL, NULL); + if (error) + vput(vp); + return (error); +} + +/* + * Vnode op for reading directories. + */ +int +ufs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + u_long **a_cookies; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct buf *bp; + struct inode *ip; + struct direct *dp, *edp; + u_long *cookies; + struct dirent dstdp; + off_t offset, startoffset; + size_t readcnt, skipcnt; + ssize_t startresid; + int ncookies; + int error; + + if (uio->uio_offset < 0) + return (EINVAL); + ip = VTOI(vp); + if (ip->i_effnlink == 0) + return (0); + if (ap->a_ncookies != NULL) { + if (uio->uio_resid < 0) + ncookies = 0; + else + ncookies = uio->uio_resid; + if (uio->uio_offset >= ip->i_size) + ncookies = 0; + else if (ip->i_size - uio->uio_offset < ncookies) + ncookies = ip->i_size - uio->uio_offset; + ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1; + cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK); + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } else { + ncookies = 0; + cookies = NULL; + } + offset = startoffset = uio->uio_offset; + startresid = uio->uio_resid; + error = 0; + while (error == 0 && uio->uio_resid > 0 && + uio->uio_offset < ip->i_size) { + error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp); + if (error) + break; + if (bp->b_offset + bp->b_bcount > ip->i_size) + readcnt = ip->i_size - bp->b_offset; + else + readcnt = bp->b_bcount; + skipcnt = (size_t)(uio->uio_offset - bp->b_offset) & + ~(size_t)(DIRBLKSIZ - 1); + offset = bp->b_offset + skipcnt; + dp = (struct direct *)&bp->b_data[skipcnt]; + edp = (struct direct *)&bp->b_data[readcnt]; + while (error == 0 && uio->uio_resid > 0 && dp < edp) { + if (dp->d_reclen <= offsetof(struct direct, d_name) || + (caddr_t)dp + dp->d_reclen > (caddr_t)edp) { + error = EIO; + break; + } +#if BYTE_ORDER == LITTLE_ENDIAN + /* Old filesystem format. 
*/ + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + dstdp.d_namlen = dp->d_type; + dstdp.d_type = dp->d_namlen; + } else +#endif + { + dstdp.d_namlen = dp->d_namlen; + dstdp.d_type = dp->d_type; + } + if (offsetof(struct direct, d_name) + dstdp.d_namlen > + dp->d_reclen) { + error = EIO; + break; + } + if (offset < startoffset || dp->d_ino == 0) + goto nextentry; + dstdp.d_fileno = dp->d_ino; + dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp); + bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen); + dstdp.d_name[dstdp.d_namlen] = '\0'; + if (dstdp.d_reclen > uio->uio_resid) { + if (uio->uio_resid == startresid) + error = EINVAL; + else + error = EJUSTRETURN; + break; + } + /* Advance dp. */ + error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio); + if (error) + break; + if (cookies != NULL) { + KASSERT(ncookies > 0, + ("ufs_readdir: cookies buffer too small")); + *cookies = offset + dp->d_reclen; + cookies++; + ncookies--; + } +nextentry: + offset += dp->d_reclen; + dp = (struct direct *)((caddr_t)dp + dp->d_reclen); + } + bqrelse(bp); + uio->uio_offset = offset; + } + /* We need to correct uio_offset. */ + uio->uio_offset = offset; + if (error == EJUSTRETURN) + error = 0; + if (ap->a_ncookies != NULL) { + if (error == 0) { + ap->a_ncookies -= ncookies; + } else { + free(*ap->a_cookies, M_TEMP); + *ap->a_ncookies = 0; + *ap->a_cookies = NULL; + } + } + if (error == 0 && ap->a_eofflag) + *ap->a_eofflag = ip->i_size <= uio->uio_offset; + return (error); +} + +/* + * Return target name of a symbolic link + */ +static int +ufs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + doff_t isize; + + isize = ip->i_size; + if ((isize < vp->v_mount->mnt_maxsymlinklen) || + DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */ + return (uiomove(SHORTLINK(ip), isize, ap->a_uio)); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. + * + * In order to be able to swap to a file, the ufs_bmaparray() operation may not + * deadlock on memory. See ufs_bmap() for details. + */ +static int +ufs_strategy(ap) + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + struct vnode *vp = ap->a_vp; + ufs2_daddr_t blkno; + int error; + + if (bp->b_blkno == bp->b_lblkno) { + error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); + bp->b_blkno = blkno; + if (error) { + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); + return (0); + } + if ((long)bp->b_blkno == -1) + vfs_bio_clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + bufdone(bp); + return (0); + } + bp->b_iooffset = dbtob(bp->b_blkno); + BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp); + return (0); +} + +/* + * Print out the contents of an inode. + */ +static int +ufs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + printf("\tino %lu, on dev %s", (u_long)ip->i_number, + devtoname(ITODEV(ip))); + if (vp->v_type == VFIFO) + fifo_printinfo(vp); + printf("\n"); + return (0); +} + +/* + * Close wrapper for fifos. + * + * Update the times on the inode then do device close. 
+ */ +static int +ufsfifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + int usecount; + + VI_LOCK(vp); + usecount = vp->v_usecount; + if (usecount > 1) + ufs_itimes_locked(vp); + VI_UNLOCK(vp); + return (fifo_specops.vop_close(ap)); +} + +/* + * Kqfilter wrapper for fifos. + * + * Fall through to ufs kqfilter routines if needed + */ +static int +ufsfifo_kqfilter(ap) + struct vop_kqfilter_args *ap; +{ + int error; + + error = fifo_specops.vop_kqfilter(ap); + if (error) + error = vfs_kqfilter(ap); + return (error); +} + +/* + * Return POSIX pathconf information applicable to ufs filesystems. + */ +static int +ufs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + int error; + + error = 0; + switch (ap->a_name) { + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + break; + case _PC_PIPE_BUF: + if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) + *ap->a_retval = PIPE_BUF; + else + error = EINVAL; + break; + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + break; + case _PC_NO_TRUNC: + *ap->a_retval = 1; + break; + case _PC_ACL_EXTENDED: +#ifdef UFS_ACL + if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) + *ap->a_retval = 1; + else + *ap->a_retval = 0; +#else + *ap->a_retval = 0; +#endif + break; + + case _PC_ACL_NFS4: +#ifdef UFS_ACL + if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS) + *ap->a_retval = 1; + else + *ap->a_retval = 0; +#else + *ap->a_retval = 0; +#endif + break; + + case _PC_ACL_PATH_MAX: +#ifdef UFS_ACL + if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) + *ap->a_retval = ACL_MAX_ENTRIES; + else + *ap->a_retval = 3; +#else + *ap->a_retval = 3; +#endif + break; + case _PC_MAC_PRESENT: +#ifdef MAC + if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) + *ap->a_retval = 1; + else + *ap->a_retval = 0; +#else + *ap->a_retval = 0; +#endif + break; + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; + break; + case _PC_PRIO_IO: + *ap->a_retval = 0; + break; + case _PC_SYNC_IO: + *ap->a_retval = 0; + break; + case _PC_ALLOC_SIZE_MIN: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; + break; + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + break; + case _PC_REC_INCR_XFER_SIZE: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; + break; + case _PC_REC_MAX_XFER_SIZE: + *ap->a_retval = -1; /* means ``unlimited'' */ + break; + case _PC_REC_MIN_XFER_SIZE: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; + break; + case _PC_REC_XFER_ALIGN: + *ap->a_retval = PAGE_SIZE; + break; + case _PC_SYMLINK_MAX: + *ap->a_retval = MAXPATHLEN; + break; + + default: + error = vop_stdpathconf(ap); + break; + } + return (error); +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. + */ +int +ufs_vinit(mntp, fifoops, vpp) + struct mount *mntp; + struct vop_vector *fifoops; + struct vnode **vpp; +{ + struct inode *ip; + struct vnode *vp; + + vp = *vpp; + ip = VTOI(vp); + vp->v_type = IFTOVT(ip->i_mode); + if (vp->v_type == VFIFO) + vp->v_op = fifoops; + ASSERT_VOP_LOCKED(vp, "ufs_vinit"); + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + *vpp = vp; + return (0); +} + +/* + * Allocate a new inode. + * Vnode dvp must be locked. 
+ */ +static int +ufs_makeinode(mode, dvp, vpp, cnp, callfunc) + int mode; + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; + const char *callfunc; +{ + struct inode *ip, *pdir; + struct direct newdir; + struct vnode *tvp; + int error; + + pdir = VTOI(dvp); +#ifdef INVARIANTS + if ((cnp->cn_flags & HASBUF) == 0) + panic("%s: no name", callfunc); +#endif + *vpp = NULL; + if ((mode & IFMT) == 0) + mode |= IFREG; + + if (pdir->i_effnlink < 2) { + print_bad_link_count(callfunc, dvp); + return (EINVAL); + } + error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); + if (error) + return (error); + ip = VTOI(tvp); + ip->i_gid = pdir->i_gid; + DIP_SET(ip, i_gid, pdir->i_gid); +#ifdef SUIDDIR + { +#ifdef QUOTA + struct ucred ucred, *ucp; + gid_t ucred_group; + ucp = cnp->cn_cred; +#endif + /* + * If we are not the owner of the directory, + * and we are hacking owners here, (only do this where told to) + * and we are not giving it TO root, (would subvert quotas) + * then go ahead and give it to the other user. + * Note that this drops off the execute bits for security. + */ + if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && + (pdir->i_mode & ISUID) && + (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { + ip->i_uid = pdir->i_uid; + DIP_SET(ip, i_uid, ip->i_uid); + mode &= ~07111; +#ifdef QUOTA + /* + * Make sure the correct user gets charged + * for the space. + * Quickly knock up a dummy credential for the victim. + * XXX This seems to never be accessed out of our + * context so a stack variable is ok. + */ + refcount_init(&ucred.cr_ref, 1); + ucred.cr_uid = ip->i_uid; + ucred.cr_ngroups = 1; + ucred.cr_groups = &ucred_group; + ucred.cr_groups[0] = pdir->i_gid; + ucp = &ucred; +#endif + } else { + ip->i_uid = cnp->cn_cred->cr_uid; + DIP_SET(ip, i_uid, ip->i_uid); + } + +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ucp, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(pdir, ip); + UFS_VFREE(tvp, ip->i_number, mode); + vput(tvp); + return (error); + } +#endif + } +#else /* !SUIDDIR */ + ip->i_uid = cnp->cn_cred->cr_uid; + DIP_SET(ip, i_uid, ip->i_uid); +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + if (DOINGSOFTDEP(tvp)) + softdep_revert_link(pdir, ip); + UFS_VFREE(tvp, ip->i_number, mode); + vput(tvp); + return (error); + } +#endif +#endif /* !SUIDDIR */ + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + ip->i_mode = mode; + DIP_SET(ip, i_mode, mode); + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ + ip->i_effnlink = 1; + ip->i_nlink = 1; + DIP_SET(ip, i_nlink, 1); + if (DOINGSOFTDEP(tvp)) + softdep_setup_create(VTOI(dvp), ip); + if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && + priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID, 0)) { + ip->i_mode &= ~ISGID; + DIP_SET(ip, i_mode, ip->i_mode); + } + + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP_SET(ip, i_flags, ip->i_flags); + } + + /* + * Make sure inode goes to disk before directory entry. 
+ */ + error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp))); + if (error) + goto bad; +#ifdef MAC + if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { + error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount, + dvp, tvp, cnp); + if (error) + goto bad; + } +#endif +#ifdef UFS_ACL + if (dvp->v_mount->mnt_flag & MNT_ACLS) { + error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode, + cnp->cn_cred, cnp->cn_thread); + if (error) + goto bad; + } else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) { + error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode, + cnp->cn_cred, cnp->cn_thread); + if (error) + goto bad; + } +#endif /* !UFS_ACL */ + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0); + if (error) + goto bad; + *vpp = tvp; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. + */ + ip->i_effnlink = 0; + ip->i_nlink = 0; + DIP_SET(ip, i_nlink, 0); + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tvp)) + softdep_revert_create(VTOI(dvp), ip); + vput(tvp); + return (error); +} + +static int +ufs_ioctl(struct vop_ioctl_args *ap) +{ + + switch (ap->a_command) { + case FIOSEEKDATA: + case FIOSEEKHOLE: + return (vn_bmap_seekhole(ap->a_vp, ap->a_command, + (off_t *)ap->a_data, ap->a_cred)); + default: + return (ENOTTY); + } +} + +/* Global vfs data structures for ufs. */ +struct vop_vector ufs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_fsync = VOP_PANIC, + .vop_read = VOP_PANIC, + .vop_reallocblks = VOP_PANIC, + .vop_write = VOP_PANIC, + .vop_accessx = ufs_accessx, + .vop_bmap = ufs_bmap, + .vop_cachedlookup = ufs_lookup, + .vop_close = ufs_close, + .vop_create = ufs_create, + .vop_getattr = ufs_getattr, + .vop_inactive = ufs_inactive, + .vop_ioctl = ufs_ioctl, + .vop_link = ufs_link, + .vop_lookup = vfs_cache_lookup, + .vop_markatime = ufs_markatime, + .vop_mkdir = ufs_mkdir, + .vop_mknod = ufs_mknod, + .vop_open = ufs_open, + .vop_pathconf = ufs_pathconf, + .vop_poll = vop_stdpoll, + .vop_print = ufs_print, + .vop_readdir = ufs_readdir, + .vop_readlink = ufs_readlink, + .vop_reclaim = ufs_reclaim, + .vop_remove = ufs_remove, + .vop_rename = ufs_rename, + .vop_rmdir = ufs_rmdir, + .vop_setattr = ufs_setattr, +#ifdef MAC + .vop_setlabel = vop_stdsetlabel_ea, +#endif + .vop_strategy = ufs_strategy, + .vop_symlink = ufs_symlink, + .vop_whiteout = ufs_whiteout, +#ifdef UFS_EXTATTR + .vop_getextattr = ufs_getextattr, + .vop_deleteextattr = ufs_deleteextattr, + .vop_setextattr = ufs_setextattr, +#endif +#ifdef UFS_ACL + .vop_getacl = ufs_getacl, + .vop_setacl = ufs_setacl, + .vop_aclcheck = ufs_aclcheck, +#endif +}; + +struct vop_vector ufs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = VOP_PANIC, + .vop_accessx = ufs_accessx, + .vop_close = ufsfifo_close, + .vop_getattr = ufs_getattr, + .vop_inactive = ufs_inactive, + .vop_kqfilter = ufsfifo_kqfilter, + .vop_markatime = ufs_markatime, + .vop_pathconf = ufs_pathconf, + .vop_print = ufs_print, + .vop_read = VOP_PANIC, + .vop_reclaim = ufs_reclaim, + .vop_setattr = ufs_setattr, +#ifdef MAC + .vop_setlabel = vop_stdsetlabel_ea, +#endif + .vop_write = VOP_PANIC, +#ifdef UFS_EXTATTR + .vop_getextattr = ufs_getextattr, + .vop_deleteextattr = ufs_deleteextattr, + .vop_setextattr = ufs_setextattr, +#endif +#ifdef UFS_ACL + .vop_getacl = ufs_getacl, + .vop_setacl = ufs_setacl, + .vop_aclcheck = ufs_aclcheck, +#endif +}; diff --git a/Dump/ufs/ufs/ufsmount.h b/Dump/ufs/ufs/ufsmount.h new file mode 100644 index 
0000000..88ecf09 --- /dev/null +++ b/Dump/ufs/ufs/ufsmount.h @@ -0,0 +1,144 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufsmount.h 8.6 (Berkeley) 3/30/95 + * $FreeBSD: releng/11.2/sys/ufs/ufs/ufsmount.h 331722 2018-03-29 02:50:57Z eadler $ + */ + +#ifndef _UFS_UFS_UFSMOUNT_H_ +#define _UFS_UFS_UFSMOUNT_H_ + +/* + * Arguments to mount UFS-based filesystems + */ +struct ufs_args { + char *fspec; /* block special device to mount */ + struct oexport_args export; /* network export information */ +}; + +#ifdef _KERNEL + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_UFSMNT); +#endif + +struct buf; +struct inode; +struct nameidata; +struct taskqueue; +struct timeval; +struct ucred; +struct uio; +struct vnode; +struct ufs_extattr_per_mount; +struct jblocks; +struct inodedep; + +TAILQ_HEAD(inodedeplst, inodedep); +LIST_HEAD(bmsafemaphd, bmsafemap); + +/* This structure describes the UFS specific mount structure data. 
*/ +struct ufsmount { + struct mount *um_mountp; /* filesystem vfs structure */ + struct cdev *um_dev; /* device mounted */ + struct g_consumer *um_cp; + struct bufobj *um_bo; /* Buffer cache object */ + struct vnode *um_devvp; /* block device mounted vnode */ + u_long um_fstype; /* type of filesystem */ + struct fs *um_fs; /* pointer to superblock */ + struct ufs_extattr_per_mount um_extattr; /* extended attrs */ + u_long um_nindir; /* indirect ptrs per block */ + u_long um_bptrtodb; /* indir ptr to disk block */ + u_long um_seqinc; /* inc between seq blocks */ + struct mtx um_lock; /* Protects ufsmount & fs */ + pid_t um_fsckpid; /* PID permitted fsck sysctls */ + struct mount_softdeps *um_softdep; /* softdep mgmt structure */ + struct vnode *um_quotas[MAXQUOTAS]; /* pointer to quota files */ + struct ucred *um_cred[MAXQUOTAS]; /* quota file access cred */ + time_t um_btime[MAXQUOTAS]; /* block quota time limit */ + time_t um_itime[MAXQUOTAS]; /* inode quota time limit */ + char um_qflags[MAXQUOTAS]; /* quota specific flags */ + int64_t um_savedmaxfilesize; /* XXX - limit maxfilesize */ + int um_candelete; /* devvp supports TRIM */ + int um_writesuspended; /* suspension in progress */ + u_int um_trim_inflight; + struct taskqueue *um_trim_tq; + int (*um_balloc)(struct vnode *, off_t, int, struct ucred *, + int, struct buf **); + int (*um_blkatoff)(struct vnode *, off_t, char **, struct buf **); + int (*um_truncate)(struct vnode *, off_t, int, struct ucred *); + int (*um_update)(struct vnode *, int); + int (*um_valloc)(struct vnode *, int, struct ucred *, + struct vnode **); + int (*um_vfree)(struct vnode *, ino_t, int); + void (*um_ifree)(struct ufsmount *, struct inode *); + int (*um_rdonly)(struct inode *); + void (*um_snapgone)(struct inode *); +}; + +#define UFS_BALLOC(aa, bb, cc, dd, ee, ff) VFSTOUFS((aa)->v_mount)->um_balloc(aa, bb, cc, dd, ee, ff) +#define UFS_BLKATOFF(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd) +#define UFS_TRUNCATE(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd) +#define UFS_UPDATE(aa, bb) VFSTOUFS((aa)->v_mount)->um_update(aa, bb) +#define UFS_VALLOC(aa, bb, cc, dd) VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd) +#define UFS_VFREE(aa, bb, cc) VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc) +#define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb)) +#define UFS_RDONLY(aa) (ITOUMP(aa)->um_rdonly(aa)) +#define UFS_SNAPGONE(aa) (ITOUMP(aa)->um_snapgone(aa)) + +#define UFS_LOCK(aa) mtx_lock(&(aa)->um_lock) +#define UFS_UNLOCK(aa) mtx_unlock(&(aa)->um_lock) +#define UFS_MTX(aa) (&(aa)->um_lock) + +/* + * Filesystem types + */ +#define UFS1 1 +#define UFS2 2 + +/* + * Flags describing the state of quotas. + */ +#define QTF_OPENING 0x01 /* Q_QUOTAON in progress */ +#define QTF_CLOSING 0x02 /* Q_QUOTAOFF in progress */ +#define QTF_64BIT 0x04 /* 64-bit quota file */ + +/* Convert mount ptr to ufsmount ptr. */ +#define VFSTOUFS(mp) ((struct ufsmount *)((mp)->mnt_data)) +#define UFSTOVFS(ump) (ump)->um_mountp + +/* + * Macros to access filesystem parameters in the ufsmount structure. + * Used by ufs_bmap. 
+ */ +#define MNINDIR(ump) ((ump)->um_nindir) +#define blkptrtodb(ump, b) ((b) << (ump)->um_bptrtodb) +#define is_sequential(ump, a, b) ((b) == (a) + ump->um_seqinc) +#endif /* _KERNEL */ + +#endif diff --git a/sys/include/sys/_types.h b/sys/include/sys/_types.h index c998ff6..2cfb128 100644 --- a/sys/include/sys/_types.h +++ b/sys/include/sys/_types.h @@ -29,6 +29,7 @@ #ifndef _SYS__TYPES_H_ #define _SYS__TYPES_H_ + typedef char __int8_t; typedef unsigned char __uint8_t; typedef short __int16_t; @@ -38,6 +39,10 @@ typedef long long __int64_t; typedef unsigned long long __uint64_t; +typedef __int64_t __rlim_t; /* resource limit - intentionally */ + /* signed, because of legacy code */ + /* that uses -1 for RLIM_INFINITY */ + typedef unsigned long __clock_t; typedef __uint32_t __ino_t; typedef __int32_t __ssize_t;/* stat types */ typedef __uint32_t __dev_t;/* device number */ diff --git a/sys/include/sys/descrip.h b/sys/include/sys/descrip.h index bb4b9a9..a012794 100644 --- a/sys/include/sys/descrip.h +++ b/sys/include/sys/descrip.h @@ -84,10 +84,21 @@ #define O_NDELAY O_NONBLOCK /* compat */ #define FPOSIXSHM O_NOFOLLOW +#define O_DIRECTORY 0x00020000 /* Fail if not directory */ +#define O_EXEC 0x00040000 /* Open for execute only */ + #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT) +/* #define FFLAGS(oflags) ((oflags) + 1) #define OFLAGS(fflags) ((fflags) - 1) +*/ + +/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ +#define FFLAGS(oflags) ((oflags) & O_EXEC ? (oflags) : (oflags) + 1) +#define OFLAGS(fflags) ((fflags) & O_EXEC ? (fflags) : (fflags) - 1) + +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT|FEXEC) struct fileOps; struct file; @@ -180,6 +191,8 @@ int ioctl(struct thread *, struct ioctl_args *); int getfd(struct thread *td, struct file **fp, int fd); +int_kern_openat(struct thread *,int, char *,int int); + #endif /*** diff --git a/sys/include/sys/resource.h b/sys/include/sys/resource.h index b831da5..d8dc18f 100644 --- a/sys/include/sys/resource.h +++ b/sys/include/sys/resource.h @@ -29,7 +29,13 @@ #ifndef _SYS_RESOURCE_H_ #define _SYS_RESOURCE_H_ -#include +#include + +#ifndef _RLIM_T_DECLARED +typedef __rlim_t rlim_t; +#define _RLIM_T_DECLARED +#endif + /* * Resource limits diff --git a/sys/kernel/gen_calls.c b/sys/kernel/gen_calls.c index 537409a..098025d 100644 --- a/sys/kernel/gen_calls.c +++ b/sys/kernel/gen_calls.c @@ -376,96 +376,68 @@ int sys_getrlimit(struct thread *thr, struct sys_getrlimit_args *args) { int error = 0; + struct rlimit *rlim = 0x0; + switch (args->which) { case 0: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 1: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 2: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = 
thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 3: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 4: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 5: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 6: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 7: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 8: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 9: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 10: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 11: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 12: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - 
args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 13: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; case 14: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - args->rlp->rlim_cur = thr->rlim[args->which]->rlim_cur; - args->rlp->rlim_max = thr->rlim[args->which]->rlim_max; + args->rlp->rlim_cur = thr->rlim[args->which].rlim_cur; + args->rlp->rlim_max = thr->rlim[args->which].rlim_max; break; default: error = -1; @@ -480,94 +452,64 @@ switch (args->which) { case 0: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 1: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 2: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 3: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 4: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 5: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 6: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) kmalloc(sizeof(struct rlimit)); - thr->rlim[args->which]->rlim_cur = args->rlp->rlim_cur; - thr->rlim[args->which]->rlim_max = args->rlp->rlim_max; + thr->rlim[args->which].rlim_cur = args->rlp->rlim_cur; + thr->rlim[args->which].rlim_max = args->rlp->rlim_max; break; case 7: - if (thr->rlim[args->which] == 0) - thr->rlim[args->which] = (struct rlimit *) 
diff --git a/sys/kernel/vfs_calls.c b/sys/kernel/vfs_calls.c
index 1fe71b4..acc1b86 100644
--- a/sys/kernel/vfs_calls.c
+++ b/sys/kernel/vfs_calls.c
@@ -36,36 +36,13 @@
 #include 
 
 int
 sys_open(struct thread *td, struct sys_open_args *args) {
-	int error = 0x0;
-	int fd = 0x0;
-	struct file *nfp = 0x0;
-	error = falloc(td, &nfp, &fd);
+	return(kern_openat(td, AT_FDCWD, args->path, args->flags, args->mode));
-	if (error) {
-		td->td_retval[0] = -1;
-		return (error);
-	}
-
-
-	nfp->fd = fopen(args->path, "rb");
-
-	if (nfp->fd == 0x0) {
-		fdestroy(td, nfp, fd);
-
-		td->td_retval[0] = -1;
-		error = -1;
-	}
-	else {
-		td->td_retval[0] = fd;
-	}
-
-	//kprintf("sO: 0x%X:%s:", args->mode, args->path, td->td_retval[0]);
-
-	return (error);
 }
 
 int
 sys_openat(struct thread *td, struct sys_openat_args *args) {
+	int error = 0x0;
 	int fd = 0x0;
 	struct file *nfp = 0x0;
@@ -413,3 +390,52 @@
 	thr->td_retval[0] = 2;
 	return (-1);
 }
+
+/* dirfd is the directory descriptor (AT_FDCWD from sys_open); it is not yet used here. */
+int kern_openat(struct thread *thr, int dirfd, char *path, int flags, int mode) {
+	int error = 0x0;
+	int fd = 0x0;
+	struct file *nfp = 0x0;
+
+	/*
+	 * Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
+	 * may be specified.
+	 */
+	if (flags & O_EXEC) {
+		if (flags & O_ACCMODE)
+			return (EINVAL);
+	}
+	else if ((flags & O_ACCMODE) == O_ACCMODE) {
+		return (EINVAL);
+	}
+	else {
+		flags = FFLAGS(flags);
+	}
+
+	error = falloc(thr, &nfp, &fd);
+
+	if (error) {
+		thr->td_retval[0] = -1;
+		return (error);
+	}
+
+	nfp->f_flag = flags & FMASK;
+
+	nfp->fd = fopen(path, "rb");
+
+	if (nfp->fd == 0x0) {
+		fdestroy(thr, nfp, fd);
+
+		thr->td_retval[0] = -1;
+		error = -1;
+	}
+	else {
+		thr->td_retval[0] = fd;
+	}
+
+	//kprintf("sO: 0x%X:%s:", mode, path, thr->td_retval[0]);
+
+	return (error);
+
+}
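/*
 * Editor's sketch, not part of the diff above: once kern_openat() exists,
 * sys_openat() could delegate to it the same way sys_open() now does,
 * passing its own directory descriptor instead of AT_FDCWD. The
 * sys_openat_args field names (fd, path, flags, mode) are assumed by
 * analogy with sys_open_args and are not confirmed by this dump.
 */
int
sys_openat(struct thread *td, struct sys_openat_args *args)
{
	return (kern_openat(td, args->fd, args->path, args->flags, args->mode));
}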