diff --git a/src/sys/ufs/ufs/README.acls b/src/sys/ufs/ufs/README.acls new file mode 100644 index 0000000..ef752c7 --- /dev/null +++ b/src/sys/ufs/ufs/README.acls @@ -0,0 +1,79 @@ +$FreeBSD: src/sys/ufs/ufs/README.acls,v 1.6 2002/10/19 16:09:16 rwatson Exp $ + + UFS Access Control Lists Copyright + +The UFS Access Control Lists implementation is copyright Robert Watson, +and is made available under a Berkeley-style license. + + About UFS Access Control Lists (ACLs) + +Access control lists allow the association of fine-grained discretionary +access control information with files and directories, extending the +base UNIX permission model in a (mostly) compatible way. This +implementation largely follows the POSIX.1e model, and relies on the +availability of extended attributes to store extended components of +the ACL, while maintaining the base permission information in the inode. + + Using UFS Access Control Lists (ACLs) + +Support for UFS access control lists may be enabled by adding: + + options UFS_ACL + +to your kernel configuration. As ACLs rely on the availability of extended +attributes, your file systems must have support for extended attributes. +For UFS2, this is supported natively, so no further configuration is +necessary. For UFS1, you must also enable the optional extended attributes +support documented in README.extattr. A summary of the instructions +and ACL-specific information follows. + +To enable support for ACLs on a file system, the 'acls' mount flag +must be set for the file system. This may be set using the tunefs +'-a' flag: + + tunefs -a enable /dev/md0a + +Or by using the mount-time flag: + + mount -o acls /dev/md0a /mnt + +The flag may also be set in /etc/fstab. Note that mounting a file +system previously configured for ACLs without ACL-support will result +in incorrect application of discretionary protections. 
Likewise, +mounting an ACL-enabled file system without kernel support for ACLs +will result in incorrect application of discretionary protections. If +the kernel is not configured for ACL support, a warning will be +printed by the kernel at mount-time. For reliability purposes, it +is recommended that the superblock flag be used instead of the +mount-time flag, as this will avoid re-mount issues with the root file +system. For reliability and performance reasons, the use of ACLs on +UFS1 is discouraged; UFS2 extended attributes provide a more reliable +storage mechanism for ACLs. + +Currently, support for ACLs on UFS1 requires the use of UFS1 EAs, which may +be enabled by adding: + + options UFS_EXTATTR + +to your kernel configuration file and rebuilding. Because of filesystem +mount atomicity requirements, it is also recommended that: + + options UFS_EXTATTR_AUTOSTART + +be added to the kernel so as to support the atomic enabling of the +required extended attributes with the filesystem mount operation. To +enable ACLs, two extended attributes must be available in the +EXTATTR_NAMESPACE_SYSTEM namespace: "posix1e.acl_access", which holds +the access ACL, and "posix1e.acl_default" which holds the default ACL +for directories. If you're using UFS1 Extended Attributes, the following +commands may be used to create the necessary EA backing files for +ACLs in the filesystem root of each filesystem. In these examples, +the root filesystem is used; see README.extattr for more details. + + mkdir -p /.attribute/system + cd /.attribute/system + extattrctl initattr -p / 388 posix1e.acl_access + extattrctl initattr -p / 388 posix1e.acl_default + +On the next mount of the root filesystem, the attributes will be +automatically started, and ACLs will be enabled.
diff --git a/src/sys/ufs/ufs/README.extattr b/src/sys/ufs/ufs/README.extattr new file mode 100644 index 0000000..a6e07d0 --- /dev/null +++ b/src/sys/ufs/ufs/README.extattr @@ -0,0 +1,91 @@ +$FreeBSD: src/sys/ufs/ufs/README.extattr,v 1.5 2002/10/18 21:11:36 rwatson Exp $ + + UFS Extended Attributes Copyright + +The UFS Extended Attributes implementation is copyright Robert Watson, and +is made available under a Berkeley-style license. + + About UFS Extended Attributes + +Extended attributes allow the association of additional arbitrary +meta-data with files and directories. Extended attributes are defined in +the form name=value, where name is a nul-terminated string in the style +of a filename, and value is a binary blob of zero or more bytes. The UFS +extended attribute service layers support for extended attributes onto a +backing file, in the style of the quota implementation, meaning that it +requires no underlying format changes in the filesystem. This design +choice exchanges simplicity, usability and easy deployment for +performance. When defined, extended attribute names exist in a series of +disjoint namespaces: currently, two namespaces are defined: +EXTATTR_NAMESPACE_SYSTEM and EXTATTR_NAMESPACE_USER. The primary +distinction lies in the protection model: USER EAs are protected using the +normal inode protections, whereas SYSTEM EAs require privilege to access +or modify. + + Using UFS Extended Attributes + +Support for UFS extended attributes is natively available in UFS2, and +requires no special configuration. For reliability, administrative, +and performance reasons, if you plan to use extended attributes, it +is recommended that you use UFS2 in preference to UFS1. + +Support for UFS extended attributes may be enabled for UFS1 by adding: + + options UFS_EXTATTR + +to your kernel configuration file.
This allows UFS-based filesystems to +support extended attributes, but requires manual administration of EAs +using the extattrctl tool, including the starting of EA support for each +filesystem, and the enabling of individual attributes for the file +system. The extattrctl utility may be used to initialize backing files +before first use, to start and stop EA service on a filesystem, and to +enable and disable named attributes. The command lines for extattrctl +take the following forms: + + extattrctl start [path] + extattrctl stop [path] + extattrctl initattr [-f] [-p path] [attrsize] [attrfile] + extattrctl enable [path] [attrnamespace] [attrname] [attrfile] + extattrctl disable [path] [attrnamespace] [attrname] + +In each case, [path] is used to indicate the mounted filesystem on which +to perform the operation. [attrnamespace] refers to the namespace in +which the attribute is being manipulated, and may be "system" or "user". +The [attrname] is the attribute name to use for the operation. The +[attrfile] argument specifies the attribute backing file to use. When +using the "initattr" function to initialize a backing file, the maximum +size of attribute data must be defined in bytes using the [attrsize] +field. Optionally, the [-p path] argument may be used to indicate to +extattrctl that it should pre-allocate space for EA data, rather than +creating a sparse backing file. This prevents attribute operations from +failing in low disk-space conditions (which can be important when EAs are +used for security purposes), but pre-allocation will consume space +proportional to the product of the defined maximum attribute size and +number of attributes on the specified filesystem. + +Manual configuration increases administrative overhead, but also +introduces the possibility of race conditions during filesystem mount, if +EAs are used to support other features, as starting the EAs manually is +not atomic with the mount operation. 
To address this problem, an +additional kernel option may be defined to auto-start EAs on a UFS file +system based on special directories at mount-time: + + options UFS_EXTATTR_AUTOSTART + +If this option is defined, UFS will search for a ".attribute" +sub-directory of the filesystem root during the mount operation. If it +is found, EA support will be started for the filesystem. UFS will then +search for "system" and "user" sub-directories of the ".attribute" +directory for any potential backing files, and enable an EA for each valid +backing file with the name of the backing file as the attribute name. +For example, by creating the following tree, the two EAs, +posix1e.acl_access and posix1e.acl_default will be enabled in the system +namespace of the root filesystem, reserving space for attribute data: + + mkdir -p /.attribute/system + cd /.attribute/system + extattrctl initattr -p / 388 posix1e.acl_access + extattrctl initattr -p / 388 posix1e.acl_default + +On the next mount of the root filesystem, the attributes will be +automatically started. diff --git a/src/sys/ufs/ufs/acl.h b/src/sys/ufs/ufs/acl.h new file mode 100644 index 0000000..6c3cc99 --- /dev/null +++ b/src/sys/ufs/ufs/acl.h @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/ufs/ufs/acl.h,v 1.5 2003/08/04 03:29:13 rwatson Exp $ + */ +/* + * Developed by the TrustedBSD Project. + * Support for POSIX.1e access control lists. + */ + +#ifndef _UFS_UFS_ACL_H_ +#define _UFS_UFS_ACL_H_ + +#ifdef _KERNEL + +void ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl); +void ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip); + +int ufs_getacl(struct vop_getacl_args *); +int ufs_setacl(struct vop_setacl_args *); +int ufs_aclcheck(struct vop_aclcheck_args *); + +#endif /* _KERNEL */ + +#endif /* !_UFS_UFS_ACL_H_ */ diff --git a/src/sys/ufs/ufs/dinode.h b/src/sys/ufs/ufs/dinode.h new file mode 100644 index 0000000..56da597 --- /dev/null +++ b/src/sys/ufs/ufs/dinode.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2002 Networks Associates Technology, Inc. + * All rights reserved. + * + * This software was developed for the FreeBSD Project by Marshall + * Kirk McKusick and Network Associates Laboratories, the Security + * Research Division of Network Associates, Inc.
under DARPA/SPAWAR + * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS + * research program + * + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the authors may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)dinode.h 8.3 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/ufs/ufs/dinode.h,v 1.11 2002/07/16 22:36:00 mckusick Exp $ + */ + +#ifndef _UFS_UFS_DINODE_H_ +#define _UFS_UFS_DINODE_H_ + +/* + * The root inode is the root of the filesystem. Inode 0 can't be used for + * normal purposes and historically bad blocks were linked to inode 1, thus + * the root inode is 2. (Inode 1 is no longer used for this purpose, however + * numerous dump tapes make this assumption, so we are stuck with it). + */ +#define ROOTINO ((ino_t)2) + +/* + * The Whiteout inode# is a dummy non-zero inode number which will + * never be allocated to a real file. It is used as a place holder + * in the directory entry which has been tagged as a DT_W entry. + * See the comments about ROOTINO above. + */ +#define WINO ((ino_t)1) + +/* + * The size of physical and logical block numbers and time fields in UFS. + */ +typedef int32_t ufs1_daddr_t; +typedef int64_t ufs2_daddr_t; +typedef int64_t ufs_lbn_t; +typedef int64_t ufs_time_t; + +/* File permissions. */ +#define IEXEC 0000100 /* Executable. */ +#define IWRITE 0000200 /* Writeable. */ +#define IREAD 0000400 /* Readable. */ +#define ISVTX 0001000 /* Sticky bit. */ +#define ISGID 0002000 /* Set-gid. */ +#define ISUID 0004000 /* Set-uid. */ + +/* File types. */ +#define IFMT 0170000 /* Mask of file type. */ +#define IFIFO 0010000 /* Named pipe (fifo). */ +#define IFCHR 0020000 /* Character device. */ +#define IFDIR 0040000 /* Directory file. */ +#define IFBLK 0060000 /* Block device. */ +#define IFREG 0100000 /* Regular file. */ +#define IFLNK 0120000 /* Symbolic link. */ +#define IFSOCK 0140000 /* UNIX domain socket. */ +#define IFWHT 0160000 /* Whiteout. */ + +/* + * A dinode contains all the meta-data associated with a UFS2 file. + * This structure defines the on-disk format of a dinode. Since + * this structure describes an on-disk structure, all its fields + * are defined by types with precise widths. 
+ */ + +#define NXADDR 2 /* External addresses in inode. */ +#define NDADDR 12 /* Direct addresses in inode. */ +#define NIADDR 3 /* Indirect addresses in inode. */ + +struct ufs2_dinode { + u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ + int16_t di_nlink; /* 2: File link count. */ + u_int32_t di_uid; /* 4: File owner. */ + u_int32_t di_gid; /* 8: File group. */ + u_int32_t di_blksize; /* 12: Inode blocksize. */ + u_int64_t di_size; /* 16: File byte count. */ + u_int64_t di_blocks; /* 24: Bytes actually held. */ + ufs_time_t di_atime; /* 32: Last access time. */ + ufs_time_t di_mtime; /* 40: Last modified time. */ + ufs_time_t di_ctime; /* 48: Last inode change time. */ + ufs_time_t di_birthtime; /* 56: Inode creation time. */ + int32_t di_mtimensec; /* 64: Last modified time. */ + int32_t di_atimensec; /* 68: Last access time. */ + int32_t di_ctimensec; /* 72: Last inode change time. */ + int32_t di_birthnsec; /* 76: Inode creation time. */ + int32_t di_gen; /* 80: Generation number. */ + u_int32_t di_kernflags; /* 84: Kernel flags. */ + u_int32_t di_flags; /* 88: Status flags (chflags). */ + int32_t di_extsize; /* 92: External attributes size. */ + ufs2_daddr_t di_extb[NXADDR];/* 96: External attributes block. */ + ufs2_daddr_t di_db[NDADDR]; /* 112: Direct disk blocks. */ + ufs2_daddr_t di_ib[NIADDR]; /* 208: Indirect disk blocks. */ + int64_t di_spare[3]; /* 232: Reserved; currently unused */ +}; + +/* + * The di_db fields may be overlaid with other information for + * file types that do not have associated disk storage. Block + * and character devices overlay the first data block with their + * dev_t value. Short symbolic links place their path in the + * di_db area. + */ +#define di_rdev di_db[0] + +/* + * A UFS1 dinode contains all the meta-data associated with a UFS1 file. + * This structure defines the on-disk format of a UFS1 dinode.
Since + * this structure describes an on-disk structure, all its fields + * are defined by types with precise widths. + */ +struct ufs1_dinode { + u_int16_t di_mode; /* 0: IFMT, permissions; see below. */ + int16_t di_nlink; /* 2: File link count. */ + union { + u_int16_t oldids[2]; /* 4: Ffs: old user and group ids. */ + } di_u; + u_int64_t di_size; /* 8: File byte count. */ + int32_t di_atime; /* 16: Last access time. */ + int32_t di_atimensec; /* 20: Last access time. */ + int32_t di_mtime; /* 24: Last modified time. */ + int32_t di_mtimensec; /* 28: Last modified time. */ + int32_t di_ctime; /* 32: Last inode change time. */ + int32_t di_ctimensec; /* 36: Last inode change time. */ + ufs1_daddr_t di_db[NDADDR]; /* 40: Direct disk blocks. */ + ufs1_daddr_t di_ib[NIADDR]; /* 88: Indirect disk blocks. */ + u_int32_t di_flags; /* 100: Status flags (chflags). */ + int32_t di_blocks; /* 104: Blocks actually held. */ + int32_t di_gen; /* 108: Generation number. */ + u_int32_t di_uid; /* 112: File owner. */ + u_int32_t di_gid; /* 116: File group. */ + int32_t di_spare[2]; /* 120: Reserved; currently unused */ +}; +#define di_ogid di_u.oldids[1] +#define di_ouid di_u.oldids[0] + +#endif /* _UFS_UFS_DINODE_H_ */ diff --git a/src/sys/ufs/ufs/dir.h b/src/sys/ufs/ufs/dir.h new file mode 100644 index 0000000..50e4c33 --- /dev/null +++ b/src/sys/ufs/ufs/dir.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)dir.h 8.2 (Berkeley) 1/21/94 + * $FreeBSD: src/sys/ufs/ufs/dir.h,v 1.9 1999/08/28 00:52:27 peter Exp $ + */ + +#ifndef _UFS_UFS_DIR_H_ +#define _UFS_UFS_DIR_H_ + +/* + * Theoretically, directories can be more than 2Gb in length, however, in + * practice this seems unlikely. So, we define the type doff_t as a 32-bit + * quantity to keep down the cost of doing lookup on a 32-bit machine. 
+ */ +#define doff_t int32_t +#define MAXDIRSIZE (0x7fffffff) + +/* + * A directory consists of some number of blocks of DIRBLKSIZ + * bytes, where DIRBLKSIZ is chosen such that it can be transferred + * to disk in a single atomic operation (e.g. 512 bytes on most machines). + * + * Each DIRBLKSIZ byte block contains some number of directory entry + * structures, which are of variable length. Each directory entry has + * a struct direct at the front of it, containing its inode number, + * the length of the entry, and the length of the name contained in + * the entry. These are followed by the name padded to a 4 byte boundary + * with null bytes. All names are guaranteed null terminated. + * The maximum length of a name in a directory is MAXNAMLEN. + * + * The macro DIRSIZ(fmt, dp) gives the amount of space required to represent + * a directory entry. Free space in a directory is represented by + * entries which have dp->d_reclen > DIRSIZ(fmt, dp). All DIRBLKSIZ bytes + * in a directory block are claimed by the directory entries. This + * usually results in the last entry in a directory having a large + * dp->d_reclen. When entries are deleted from a directory, the + * space is returned to the previous entry in the same directory + * block by increasing its dp->d_reclen. If the first entry of + * a directory block is free, then its dp->d_ino is set to 0. + * Entries other than the first in a directory do not normally have + * dp->d_ino set to 0. 
+ */ +#define DIRBLKSIZ DEV_BSIZE +#define MAXNAMLEN 255 + +struct direct { + u_int32_t d_ino; /* inode number of entry */ + u_int16_t d_reclen; /* length of this record */ + u_int8_t d_type; /* file type, see below */ + u_int8_t d_namlen; /* length of string in d_name */ + char d_name[MAXNAMLEN + 1];/* name with length <= MAXNAMLEN */ +}; + +/* + * File types + */ +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 +#define DT_WHT 14 + +/* + * Convert between stat structure types and directory types. + */ +#define IFTODT(mode) (((mode) & 0170000) >> 12) +#define DTTOIF(dirtype) ((dirtype) << 12) + +/* + * The DIRSIZ macro gives the minimum record length which will hold + * the directory entry. This requires the amount of space in struct direct + * without the d_name field, plus enough space for the name with a terminating + * null byte (dp->d_namlen+1), rounded up to a 4 byte boundary. + * + * + */ +#define DIRECTSIZ(namlen) \ + (((int)&((struct direct *)0)->d_name + \ + ((namlen)+1)*sizeof(((struct direct *)0)->d_name[0]) + 3) & ~3) +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define DIRSIZ(oldfmt, dp) \ + ((oldfmt) ? DIRECTSIZ((dp)->d_type) : DIRECTSIZ((dp)->d_namlen)) +#else +#define DIRSIZ(oldfmt, dp) \ + DIRECTSIZ((dp)->d_namlen) +#endif +#define OLDDIRFMT 1 +#define NEWDIRFMT 0 + +/* + * Template for manipulating directories. Should use struct direct's, + * but the name field is MAXNAMLEN + 1 bytes, and this just won't do. + */ +struct dirtemplate { + u_int32_t dot_ino; + int16_t dot_reclen; + u_int8_t dot_type; + u_int8_t dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_int32_t dotdot_ino; + int16_t dotdot_reclen; + u_int8_t dotdot_type; + u_int8_t dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; + +/* + * This is the old format of directories, sans type element.
+ */ +struct odirtemplate { + u_int32_t dot_ino; + int16_t dot_reclen; + u_int16_t dot_namlen; + char dot_name[4]; /* must be multiple of 4 */ + u_int32_t dotdot_ino; + int16_t dotdot_reclen; + u_int16_t dotdot_namlen; + char dotdot_name[4]; /* ditto */ +}; +#endif /* !_UFS_UFS_DIR_H_ */ diff --git a/src/sys/ufs/ufs/dirhash.h b/src/sys/ufs/ufs/dirhash.h new file mode 100644 index 0000000..77177f1 --- /dev/null +++ b/src/sys/ufs/ufs/dirhash.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2001 Ian Dowse. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD: src/sys/ufs/ufs/dirhash.h,v 1.4 2003/01/01 18:48:59 schweikh Exp $ + */ + +#ifndef _UFS_UFS_DIRHASH_H_ +#define _UFS_UFS_DIRHASH_H_ + +/* + * For fast operations on large directories, we maintain a hash + * that maps the file name to the offset of the directory entry within + * the directory file. + * + * The hashing uses a dumb spillover to the next free slot on + * collisions, so we must keep the utilisation low to avoid + * long linear searches. Deleted entries that are not the last + * in a chain must be marked DIRHASH_DEL. + * + * We also maintain information about free space in each block + * to speed up creations. + */ +#define DIRHASH_EMPTY (-1) /* entry unused */ +#define DIRHASH_DEL (-2) /* deleted entry; may be part of chain */ + +#define DIRALIGN 4 +#define DH_NFSTATS (DIRECTSIZ(MAXNAMLEN + 1) / DIRALIGN) + /* max DIRALIGN words in a directory entry */ + +/* + * Dirhash uses a score mechanism to achieve a hybrid between a + * least-recently-used and a least-often-used algorithm for entry + * recycling. The score is incremented when a directory is used, and + * decremented when the directory is a candidate for recycling. When + * the score reaches zero, the hash is recycled. Hashes are linked + * together on a TAILQ list, and hashes with higher scores filter + * towards the tail (most recently used) end of the list. + * + * New hash entries are given an initial score of DH_SCOREINIT and are + * placed at the most-recently-used end of the list. This helps a lot + * in the worst-case scenario where every directory access is + * to a directory that is not hashed (i.e. the working set of hash + * candidates is much larger than the configured memory limit). In this + * case it limits the number of hash builds to 1/DH_SCOREINIT of the + * number of accesses. + */ +#define DH_SCOREINIT 8 /* initial dh_score when dirhash built */ +#define DH_SCOREMAX 64 /* max dh_score value */ + +/* + * The main hash table has 2 levels.
It is an array of pointers to + * blocks of DH_NBLKOFF offsets. + */ +#define DH_BLKOFFSHIFT 8 +#define DH_NBLKOFF (1 << DH_BLKOFFSHIFT) +#define DH_BLKOFFMASK (DH_NBLKOFF - 1) + +#define DH_ENTRY(dh, slot) \ + ((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK]) + +struct dirhash { + struct mtx dh_mtx; /* protects all fields except dh_list */ + + doff_t **dh_hash; /* the hash array (2-level) */ + int dh_narrays; /* number of entries in dh_hash */ + int dh_hlen; /* total slots in the 2-level hash array */ + int dh_hused; /* entries in use */ + + /* Free space statistics. XXX assumes DIRBLKSIZ is 512. */ + u_int8_t *dh_blkfree; /* free DIRALIGN words in each dir block */ + int dh_nblk; /* size of dh_blkfree array */ + int dh_dirblks; /* number of DIRBLKSIZ blocks in dir */ + int dh_firstfree[DH_NFSTATS + 1]; /* first blk with N words free */ + + int dh_seqopt; /* sequential access optimisation enabled */ + doff_t dh_seqoff; /* sequential access optimisation offset */ + + int dh_score; /* access count for this dirhash */ + + int dh_onlist; /* true if on the ufsdirhash_list chain */ + + /* Protected by ufsdirhash_mtx. */ + TAILQ_ENTRY(dirhash) dh_list; /* chain of all dirhashes */ +}; + + +/* + * Dirhash functions. 
+ */ +void ufsdirhash_init(void); +void ufsdirhash_uninit(void); +int ufsdirhash_build(struct inode *); +doff_t ufsdirhash_findfree(struct inode *, int, int *); +doff_t ufsdirhash_enduseful(struct inode *); +int ufsdirhash_lookup(struct inode *, char *, int, doff_t *, struct buf **, + doff_t *); +void ufsdirhash_newblk(struct inode *, doff_t); +void ufsdirhash_add(struct inode *, struct direct *, doff_t); +void ufsdirhash_remove(struct inode *, struct direct *, doff_t); +void ufsdirhash_move(struct inode *, struct direct *, doff_t, doff_t); +void ufsdirhash_dirtrunc(struct inode *, doff_t); +void ufsdirhash_free(struct inode *); + +void ufsdirhash_checkblock(struct inode *, char *, doff_t); + +#endif /* !_UFS_UFS_DIRHASH_H_ */ diff --git a/src/sys/ufs/ufs/extattr.h b/src/sys/ufs/ufs/extattr.h new file mode 100644 index 0000000..49809b6 --- /dev/null +++ b/src/sys/ufs/ufs/extattr.h @@ -0,0 +1,111 @@ +/*- + * Copyright (c) 1999-2001 Robert N. M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/ufs/ufs/extattr.h,v 1.18 2003/07/28 18:53:28 rwatson Exp $ + */ +/* + * Developed by the TrustedBSD Project. + * Support for extended filesystem attributes. + */ + +#ifndef _UFS_UFS_EXTATTR_H_ +#define _UFS_UFS_EXTATTR_H_ + +#define UFS_EXTATTR_MAGIC 0x00b5d5ec +#define UFS_EXTATTR_VERSION 0x00000003 +#define UFS_EXTATTR_FSROOTSUBDIR ".attribute" +#define UFS_EXTATTR_SUBDIR_SYSTEM "system" +#define UFS_EXTATTR_SUBDIR_USER "user" +#define UFS_EXTATTR_MAXEXTATTRNAME 65 /* including null */ + +#define UFS_EXTATTR_ATTR_FLAG_INUSE 0x00000001 /* attr has been set */ +#define UFS_EXTATTR_PERM_KERNEL 0x00000000 +#define UFS_EXTATTR_PERM_ROOT 0x00000001 +#define UFS_EXTATTR_PERM_OWNER 0x00000002 +#define UFS_EXTATTR_PERM_ANYONE 0x00000003 + +#define UFS_EXTATTR_UEPM_INITIALIZED 0x00000001 +#define UFS_EXTATTR_UEPM_STARTED 0x00000002 + +#define UFS_EXTATTR_CMD_START 0x00000001 +#define UFS_EXTATTR_CMD_STOP 0x00000002 +#define UFS_EXTATTR_CMD_ENABLE 0x00000003 +#define UFS_EXTATTR_CMD_DISABLE 0x00000004 + +struct ufs_extattr_fileheader { + u_int uef_magic; /* magic number for sanity checking */ + u_int uef_version; /* version of attribute file */ + u_int uef_size; /* size of attributes, w/o header */ +}; + +struct ufs_extattr_header { + u_int ueh_flags; /* flags for attribute */ + u_int ueh_len; /* local defined length; <= uef_size */ + u_int32_t ueh_i_gen; /* generation number for sanity */ + /* data follows the header */ +}; 
+ +#ifdef _KERNEL + +#ifdef MALLOC_DECLARE +MALLOC_DECLARE(M_EXTATTR); +#endif + +struct vnode; +LIST_HEAD(ufs_extattr_list_head, ufs_extattr_list_entry); +struct ufs_extattr_list_entry { + LIST_ENTRY(ufs_extattr_list_entry) uele_entries; + struct ufs_extattr_fileheader uele_fileheader; + int uele_attrnamespace; + char uele_attrname[UFS_EXTATTR_MAXEXTATTRNAME]; + struct vnode *uele_backing_vnode; +}; + +struct lock; +struct ucred; +struct ufs_extattr_per_mount { + struct lock uepm_lock; + struct ufs_extattr_list_head uepm_list; + struct ucred *uepm_ucred; + int uepm_flags; +}; + +void ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm); +void ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm); +int ufs_extattr_start(struct mount *mp, struct thread *td); +int ufs_extattr_autostart(struct mount *mp, struct thread *td); +int ufs_extattr_stop(struct mount *mp, struct thread *td); +int ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename, + int attrnamespace, const char *attrname, struct thread *td); +int ufs_getextattr(struct vop_getextattr_args *ap); +int ufs_deleteextattr(struct vop_deleteextattr_args *ap); +int ufs_setextattr(struct vop_setextattr_args *ap); +void ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td); + +#endif /* !_KERNEL */ + +#endif /* !_UFS_UFS_EXTATTR_H_ */ diff --git a/src/sys/ufs/ufs/inode.h b/src/sys/ufs/ufs/inode.h new file mode 100644 index 0000000..f0f8a74 --- /dev/null +++ b/src/sys/ufs/ufs/inode.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 1982, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)inode.h 8.9 (Berkeley) 5/14/95 + * $FreeBSD: src/sys/ufs/ufs/inode.h,v 1.44 2003/08/15 20:03:19 phk Exp $ + */ + +#ifndef _UFS_UFS_INODE_H_ +#define _UFS_UFS_INODE_H_ + +#include +#include +#include + +/* + * This must agree with the definition in . 
+ */ +#define doff_t int32_t + +/* + * The inode is used to describe each active (or recently active) file in the + * UFS filesystem. It is composed of two types of information. The first part + * is the information that is needed only while the file is active (such as + * the identity of the file and linkage to speed its lookup). The second part + * is the permanent meta-data associated with the file which is read in + * from the permanent dinode from long term storage when the file becomes + * active, and is put back when the file is no longer being used. + */ +struct inode { + LIST_ENTRY(inode) i_hash;/* Hash chain. */ + TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */ + struct vnode *i_vnode;/* Vnode associated with this inode. */ + struct ufsmount *i_ump;/* Ufsmount point associated with this inode. */ + u_int32_t i_flag; /* flags, see below */ + struct cdev *i_dev; /* Device associated with the inode. */ + ino_t i_number; /* The identity of the inode. */ + int i_effnlink; /* i_nlink when I/O completes */ + + struct fs *i_fs; /* Associated filesystem superblock. */ + struct dquot *i_dquot[MAXQUOTAS]; /* Dquot structures. */ + u_quad_t i_modrev; /* Revision level for NFS lease. */ + struct lockf *i_lockf;/* Head of byte-level lock list. */ + /* + * Side effects; used during directory lookup. + */ + int32_t i_count; /* Size of free slot in directory. */ + doff_t i_endoff; /* End of useful stuff in directory. */ + doff_t i_diroff; /* Offset in dir, where we found last entry. */ + doff_t i_offset; /* Offset of free space in directory. */ + ino_t i_ino; /* Inode number of found directory. */ + u_int32_t i_reclen; /* Size of found directory entry. */ + + union { + struct dirhash *dirhash; /* Hashing for large directories. */ + daddr_t *snapblklist; /* Collect expunged snapshot blocks. */ + } i_un; + + /* + * Data for extended attribute modification. 
+ */ + u_char *i_ea_area; /* Pointer to malloced copy of EA area */ + unsigned i_ea_len; /* Length of i_ea_area */ + int i_ea_error; /* First errno in transaction */ + + /* + * Copies from the on-disk dinode itself. + */ + u_int16_t i_mode; /* IFMT, permissions; see below. */ + int16_t i_nlink; /* File link count. */ + u_int64_t i_size; /* File byte count. */ + u_int32_t i_flags; /* Status flags (chflags). */ + int64_t i_gen; /* Generation number. */ + u_int32_t i_uid; /* File owner. */ + u_int32_t i_gid; /* File group. */ + /* + * The real copy of the on-disk inode. + */ + union { + struct ufs1_dinode *din1; /* UFS1 on-disk dinode. */ + struct ufs2_dinode *din2; /* UFS2 on-disk dinode. */ + } dinode_u; +}; +/* + * These flags are kept in i_flag. + */ +#define IN_ACCESS 0x0001 /* Access time update request. */ +#define IN_CHANGE 0x0002 /* Inode change time update request. */ +#define IN_UPDATE 0x0004 /* Modification time update request. */ +#define IN_MODIFIED 0x0008 /* Inode has been modified. */ +#define IN_RENAME 0x0010 /* Inode is being renamed. */ +#define IN_HASHED 0x0020 /* Inode is on hash list */ +#define IN_LAZYMOD 0x0040 /* Modified, but don't write yet. */ +#define IN_SPACECOUNTED 0x0080 /* Blocks to be freed in free count. */ + +#define i_devvp i_ump->um_devvp +#define i_dirhash i_un.dirhash +#define i_snapblklist i_un.snapblklist +#define i_din1 dinode_u.din1 +#define i_din2 dinode_u.din2 + +#ifdef _KERNEL +/* + * The DIP macro is used to access fields in the dinode that are + * not cached in the inode itself. + */ +#define DIP(ip, field) \ + (((ip)->i_ump->um_fstype == UFS1) ? \ + (ip)->i_din1->d##field : (ip)->i_din2->d##field) + +#define MAXSYMLINKLEN(ip) \ + ((ip)->i_ump->um_fstype == UFS1) ? \ + ((NDADDR + NIADDR) * sizeof(ufs1_daddr_t)) : \ + ((NDADDR + NIADDR) * sizeof(ufs2_daddr_t)) +#define SHORTLINK(ip) \ + (((ip)->i_ump->um_fstype == UFS1) ? 
\ + (caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db) + +/* + * Structure used to pass around logical block paths generated by + * ufs_getlbns and used by truncate and bmap code. + */ +struct indir { + ufs2_daddr_t in_lbn; /* Logical block number. */ + int in_off; /* Offset in buffer. */ + int in_exists; /* Flag if the block exists. */ +}; + +/* Convert between inode pointers and vnode pointers. */ +#define VTOI(vp) ((struct inode *)(vp)->v_data) +#define ITOV(ip) ((ip)->i_vnode) + +/* Determine if soft dependencies are being done */ +#define DOINGSOFTDEP(vp) ((vp)->v_mount->mnt_flag & MNT_SOFTDEP) +#define DOINGASYNC(vp) ((vp)->v_mount->mnt_flag & MNT_ASYNC) + +/* This overlays the fid structure (see mount.h). */ +struct ufid { + u_int16_t ufid_len; /* Length of structure. */ + u_int16_t ufid_pad; /* Force 32-bit alignment. */ + ino_t ufid_ino; /* File number (ino). */ + int32_t ufid_gen; /* Generation number. */ +}; +#endif /* _KERNEL */ + +#endif /* !_UFS_UFS_INODE_H_ */ diff --git a/src/sys/ufs/ufs/quota.h b/src/sys/ufs/ufs/quota.h new file mode 100644 index 0000000..bf4876e --- /dev/null +++ b/src/sys/ufs/ufs/quota.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)quota.h 8.3 (Berkeley) 8/19/94 + * $FreeBSD: src/sys/ufs/ufs/quota.h,v 1.25 2003/06/15 06:36:19 rwatson Exp $ + */ + +#ifndef _UFS_UFS_QUOTA_H_ +#define _UFS_UFS_QUOTA_H_ + +/* + * Definitions for disk quotas imposed on the average user + * (big brother finally hits UNIX). + * + * The following constants define the amount of time given a user before the + * soft limits are treated as hard limits (usually resulting in an allocation + * failure). The timer is started when the user crosses their soft limit, it + * is reset when they go below their soft limit. 
+ */ +#define MAX_IQ_TIME (7*24*60*60) /* seconds in 1 week */ +#define MAX_DQ_TIME (7*24*60*60) /* seconds in 1 week */ + +/* + * The following constants define the usage of the quota file array in the + * ufsmount structure and dquot array in the inode structure. The semantics + * of the elements of these arrays are defined in the routine getinoquota; + * the remainder of the quota code treats them generically and need not be + * inspected when changing the size of the array. + */ +#define MAXQUOTAS 2 +#define USRQUOTA 0 /* element used for user quotas */ +#define GRPQUOTA 1 /* element used for group quotas */ + +/* + * Definitions for the default names of the quotas files. + */ +#define INITQFNAMES { \ + "user", /* USRQUOTA */ \ + "group", /* GRPQUOTA */ \ + "undefined", \ +} +#define QUOTAFILENAME "quota" +#define QUOTAGROUP "operator" + +/* + * Command definitions for the 'quotactl' system call. The commands are + * broken into a main command defined below and a subcommand that is used + * to convey the type of quota that is being manipulated (see above). + */ +#define SUBCMDMASK 0x00ff +#define SUBCMDSHIFT 8 +#define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK)) + +#define Q_QUOTAON 0x0100 /* enable quotas */ +#define Q_QUOTAOFF 0x0200 /* disable quotas */ +#define Q_GETQUOTA 0x0300 /* get limits and usage */ +#define Q_SETQUOTA 0x0400 /* set limits and usage */ +#define Q_SETUSE 0x0500 /* set usage */ +#define Q_SYNC 0x0600 /* sync disk copy of a filesystems quotas */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. The setquota system call establishes + * the vnode for each quota file (a pointer is retained in the ufsmount + * structure). 
+ */ +struct dqblk { + u_int32_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ + u_int32_t dqb_bsoftlimit; /* preferred limit on disk blks */ + u_int32_t dqb_curblocks; /* current block count */ + u_int32_t dqb_ihardlimit; /* maximum # allocated inodes + 1 */ + u_int32_t dqb_isoftlimit; /* preferred inode limit */ + u_int32_t dqb_curinodes; /* current # allocated inodes */ + int32_t dqb_btime; /* time limit for excessive disk use */ + int32_t dqb_itime; /* time limit for excessive files */ +}; + +#ifdef _KERNEL + +#include + +/* + * The following structure records disk usage for a user or group on a + * filesystem. There is one allocated for each quota that exists on any + * filesystem for the current user or group. A cache is kept of recently + * used entries. + */ +struct dquot { + LIST_ENTRY(dquot) dq_hash; /* hash list */ + TAILQ_ENTRY(dquot) dq_freelist; /* free list */ + u_int16_t dq_flags; /* flags, see below */ + u_int16_t dq_type; /* quota type of this dquot */ + u_int32_t dq_cnt; /* count of active references */ + u_int32_t dq_id; /* identifier this applies to */ + struct ufsmount *dq_ump; /* filesystem that this is taken from */ + struct dqblk dq_dqb; /* actual usage & quotas */ +}; +/* + * Flag values. + */ +#define DQ_LOCK 0x01 /* this quota locked (no MODS) */ +#define DQ_WANT 0x02 /* wakeup on unlock */ +#define DQ_MOD 0x04 /* this quota modified since read */ +#define DQ_FAKE 0x08 /* no limits here, just usage */ +#define DQ_BLKS 0x10 /* has been warned about blk limit */ +#define DQ_INODS 0x20 /* has been warned about inode limit */ +/* + * Shorthand notation. 
+ */ +#define dq_bhardlimit dq_dqb.dqb_bhardlimit +#define dq_bsoftlimit dq_dqb.dqb_bsoftlimit +#define dq_curblocks dq_dqb.dqb_curblocks +#define dq_ihardlimit dq_dqb.dqb_ihardlimit +#define dq_isoftlimit dq_dqb.dqb_isoftlimit +#define dq_curinodes dq_dqb.dqb_curinodes +#define dq_btime dq_dqb.dqb_btime +#define dq_itime dq_dqb.dqb_itime + +/* + * If the system has never checked for a quota for this file, then it is + * set to NODQUOT. Once a write attempt is made the inode pointer is set + * to reference a dquot structure. + */ +#define NODQUOT NULL + +/* + * Flags to chkdq() and chkiq() + */ +#define FORCE 0x01 /* force usage changes independent of limits */ +#define CHOWN 0x02 /* (advisory) change initiated by chown */ + +/* + * Macros to avoid subroutine calls to trivial functions. + */ +#ifdef DIAGNOSTIC +#define DQREF(dq) dqref(dq) +#else +#define DQREF(dq) (dq)->dq_cnt++ +#endif + +struct inode; +struct mount; +struct thread; +struct ucred; +struct vnode; + +int chkdq(struct inode *, int64_t, struct ucred *, int); +int chkiq(struct inode *, ino_t, struct ucred *, int); +void dqinit(void); +void dqrele(struct vnode *, struct dquot *); +void dquninit(void); +int getinoquota(struct inode *); +int getquota(struct thread *, struct mount *, u_long, int, caddr_t); +int qsync(struct mount *mp); +int quotaoff(struct thread *td, struct mount *, int); +int quotaon(struct thread *td, struct mount *, int, caddr_t); +int setquota(struct thread *, struct mount *, u_long, int, caddr_t); +int setuse(struct thread *, struct mount *, u_long, int, caddr_t); +vfs_quotactl_t ufs_quotactl; + +#else /* !_KERNEL */ + +#include + +__BEGIN_DECLS +int quotactl(const char *, int, int, void *); +__END_DECLS + +#endif /* _KERNEL */ + +#endif /* !_UFS_UFS_QUOTA_H_ */ diff --git a/src/sys/ufs/ufs/ufs_acl.c b/src/sys/ufs/ufs/ufs_acl.c new file mode 100644 index 0000000..299c7be --- /dev/null +++ b/src/sys/ufs/ufs/ufs_acl.c @@ -0,0 +1,441 @@ +/*- + * Copyright (c) 1999-2001, 2003 Robert N. 
M. Watson + * All rights reserved. + * + * This software was developed by Robert Watson for the TrustedBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Support for POSIX.1e access control lists: UFS-specific support functions. + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_acl.c,v 1.18 2003/08/04 03:29:13 rwatson Exp $"); + +#include "opt_ufs.h" +#include "opt_quota.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef UFS_ACL + +/* + * Synchronize an ACL and an inode by copying over appropriate inode fields + * to the passed ACL. 
Assumes an ACL that would satisfy acl_posix1e_check(), + * and may panic if not. + */ +void +ufs_sync_acl_from_inode(struct inode *ip, struct acl *acl) +{ + struct acl_entry *acl_mask, *acl_group_obj; + int i; + + /* + * Update ACL_USER_OBJ, ACL_OTHER, but simply identify ACL_MASK + * and ACL_GROUP_OBJ for use after we know whether ACL_MASK is + * present. + */ + acl_mask = NULL; + acl_group_obj = NULL; + for (i = 0; i < acl->acl_cnt; i++) { + switch (acl->acl_entry[i].ae_tag) { + case ACL_USER_OBJ: + acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( + ACL_USER_OBJ, ip->i_mode); + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_GROUP_OBJ: + acl_group_obj = &acl->acl_entry[i]; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_OTHER: + acl->acl_entry[i].ae_perm = acl_posix1e_mode_to_perm( + ACL_OTHER, ip->i_mode); + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_MASK: + acl_mask = &acl->acl_entry[i]; + acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + break; + + default: + panic("ufs_sync_acl_from_inode(): bad ae_tag"); + } + } + + if (acl_group_obj == NULL) + panic("ufs_sync_acl_from_inode(): no ACL_GROUP_OBJ"); + + if (acl_mask == NULL) { + /* + * There is no ACL_MASK, so update ACL_GROUP_OBJ. + */ + acl_group_obj->ae_perm = acl_posix1e_mode_to_perm( + ACL_GROUP_OBJ, ip->i_mode); + } else { + /* + * Update the ACL_MASK entry instead of ACL_GROUP_OBJ. + */ + acl_mask->ae_perm = acl_posix1e_mode_to_perm(ACL_GROUP_OBJ, + ip->i_mode); + } +} + +/* + * Calculate what the inode mode should look like based on an authoritative + * ACL for the inode. Replace only the fields in the inode that the ACL + * can represent. + */ +void +ufs_sync_inode_from_acl(struct acl *acl, struct inode *ip) +{ + + ip->i_mode &= ACL_PRESERVE_MASK; + ip->i_mode |= acl_posix1e_acl_to_mode(acl); + DIP(ip, i_mode) = ip->i_mode; +} + +/* + * Retrieve the ACL on a file. 
+ * + * As part of the ACL is stored in the inode, and the rest in an EA, + * assemble both into a final ACL product. Right now this is not done + * very efficiently. + */ +int +ufs_getacl(ap) + struct vop_getacl_args /* { + struct vnode *vp; + struct acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_vp); + int error, len; + + /* + * XXX: If ufs_getacl() should work on file systems not supporting + * ACLs, remove this check. + */ + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EOPNOTSUPP); + + /* + * Attempt to retrieve the ACL based on the ACL type. + */ + bzero(ap->a_aclp, sizeof(*ap->a_aclp)); + len = sizeof(*ap->a_aclp); + switch(ap->a_type) { + case ACL_TYPE_ACCESS: + /* + * ACL_TYPE_ACCESS ACLs may or may not be stored in the + * EA, as they are in fact a combination of the inode + * ownership/permissions and the EA contents. If the + * EA is present, merge the two in a temporary ACL + * storage, otherwise just return the inode contents. + */ + error = vn_extattr_get(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, + POSIX1E_ACL_ACCESS_EXTATTR_NAME, &len, (char *) ap->a_aclp, + ap->a_td); + switch (error) { + /* XXX: If ufs_getacl() should work on filesystems without + * the EA configured, add case EOPNOTSUPP here. */ + case ENOATTR: + /* + * Legitimately no ACL set on object, purely + * emulate it through the inode. These fields will + * be updated when the ACL is synchronized with + * the inode later. 
+ */ + ap->a_aclp->acl_cnt = 3; + ap->a_aclp->acl_entry[0].ae_tag = ACL_USER_OBJ; + ap->a_aclp->acl_entry[0].ae_id = ACL_UNDEFINED_ID; + ap->a_aclp->acl_entry[0].ae_perm = ACL_PERM_NONE; + ap->a_aclp->acl_entry[1].ae_tag = ACL_GROUP_OBJ; + ap->a_aclp->acl_entry[1].ae_id = ACL_UNDEFINED_ID; + ap->a_aclp->acl_entry[1].ae_perm = ACL_PERM_NONE; + ap->a_aclp->acl_entry[2].ae_tag = ACL_OTHER; + ap->a_aclp->acl_entry[2].ae_id = ACL_UNDEFINED_ID; + ap->a_aclp->acl_entry[2].ae_perm = ACL_PERM_NONE; + ufs_sync_acl_from_inode(ip, ap->a_aclp); + error = 0; + break; + + case 0: + if (len != sizeof(*ap->a_aclp)) { + /* + * A short (or long) read, meaning that for + * some reason the ACL is corrupted. Return + * EPERM since the object DAC protections + * are unsafe. + */ + printf("ufs_getacl(): Loaded invalid ACL (" + "%d bytes)\n", len); + return (EPERM); + } + ufs_sync_acl_from_inode(ip, ap->a_aclp); + break; + + default: + break; + } + break; + + case ACL_TYPE_DEFAULT: + if (ap->a_vp->v_type != VDIR) { + error = EINVAL; + break; + } + error = vn_extattr_get(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, &len, + (char *) ap->a_aclp, ap->a_td); + /* + * Unlike ACL_TYPE_ACCESS, there is no relationship between + * the inode contents and the ACL, and it is therefore + * possible for the request for the ACL to fail since the + * ACL is undefined. In this situation, return success + * and an empty ACL, as required by POSIX.1e. + */ + switch (error) { + /* XXX: If ufs_getacl() should work on filesystems without + * the EA configured, add case EOPNOTSUPP here. */ + case ENOATTR: + bzero(ap->a_aclp, sizeof(*ap->a_aclp)); + ap->a_aclp->acl_cnt = 0; + error = 0; + break; + + case 0: + if (len != sizeof(*ap->a_aclp)) { + /* + * A short (or long) read, meaning that for + * some reason the ACL is corrupted. Return + * EPERM since the object default DAC + * protections are unsafe. 
+ */ + printf("ufs_getacl(): Loaded invalid ACL (" + "%d bytes)\n", len); + return (EPERM); + } + break; + + default: + break; + } + break; + + default: + error = EINVAL; + } + + return (error); +} + +/* + * Set the ACL on a file. + * + * As part of the ACL is stored in the inode, and the rest in an EA, + * this is necessarily non-atomic, and has complex authorization. + * As ufs_setacl() includes elements of ufs_chown() and ufs_chmod(), + * a fair number of different access checks may be required to go ahead + * with the operation at all. + */ +int +ufs_setacl(ap) + struct vop_setacl_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct proc *p; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_vp); + int error; + + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EOPNOTSUPP); + + /* + * If this is a set operation rather than a delete operation, + * invoke VOP_ACLCHECK() on the passed ACL to determine if it is + * valid for the target. This will include a check on ap->a_type. + */ + if (ap->a_aclp != NULL) { + /* + * Set operation. + */ + error = VOP_ACLCHECK(ap->a_vp, ap->a_type, ap->a_aclp, + ap->a_cred, ap->a_td); + if (error != 0) + return (error); + } else { + /* + * Delete operation. + * POSIX.1e allows only deletion of the default ACL on a + * directory (ACL_TYPE_DEFAULT). + */ + if (ap->a_type != ACL_TYPE_DEFAULT) + return (EINVAL); + if (ap->a_vp->v_type != VDIR) + return (ENOTDIR); + } + + if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + + /* + * Authorize the ACL operation. + */ + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + + /* + * Must hold VADMIN (be file owner) or have appropriate privilege. 
+ */ + if ((error = VOP_ACCESS(ap->a_vp, VADMIN, ap->a_cred, ap->a_td))) + return (error); + + switch(ap->a_type) { + case ACL_TYPE_ACCESS: + error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_ACCESS_EXTATTR_NAMESPACE, + POSIX1E_ACL_ACCESS_EXTATTR_NAME, sizeof(*ap->a_aclp), + (char *) ap->a_aclp, ap->a_td); + break; + + case ACL_TYPE_DEFAULT: + if (ap->a_aclp == NULL) { + error = vn_extattr_rm(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, ap->a_td); + /* + * Attempting to delete a non-present default ACL + * will return success for portability purposes. + * (TRIX) + * + * XXX: Note that since we can't distinguish + * "that EA is not supported" from "that EA is not + * defined", the success case here overlaps the + * the ENOATTR->EOPNOTSUPP case below. + */ + if (error == ENOATTR) + error = 0; + } else + error = vn_extattr_set(ap->a_vp, IO_NODELOCKED, + POSIX1E_ACL_DEFAULT_EXTATTR_NAMESPACE, + POSIX1E_ACL_DEFAULT_EXTATTR_NAME, + sizeof(*ap->a_aclp), (char *) ap->a_aclp, ap->a_td); + break; + + default: + error = EINVAL; + } + /* + * Map lack of attribute definition in UFS_EXTATTR into lack of + * support for ACLs on the filesystem. + */ + if (error == ENOATTR) + return (EOPNOTSUPP); + if (error != 0) + return (error); + + if (ap->a_type == ACL_TYPE_ACCESS) { + /* + * Now that the EA is successfully updated, update the + * inode and mark it as changed. + */ + ufs_sync_inode_from_acl(ap->a_aclp, ip); + ip->i_flag |= IN_CHANGE; + } + + VN_KNOTE(ap->a_vp, NOTE_ATTRIB); + return (0); +} + +/* + * Check the validity of an ACL for a file. + */ +int +ufs_aclcheck(ap) + struct vop_aclcheck_args /* { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; + } */ *ap; +{ + + if ((ap->a_vp->v_mount->mnt_flag & MNT_ACLS) == 0) + return (EOPNOTSUPP); + + /* + * Verify we understand this type of ACL, and that it applies + * to this kind of object. 
+ * Rely on the acl_posix1e_check() routine to verify the contents. + */ + switch(ap->a_type) { + case ACL_TYPE_ACCESS: + break; + + case ACL_TYPE_DEFAULT: + if (ap->a_vp->v_type != VDIR) + return (EINVAL); + break; + + default: + return (EINVAL); + } + return (acl_posix1e_check(ap->a_aclp)); +} + +#endif /* !UFS_ACL */ diff --git a/src/sys/ufs/ufs/ufs_bmap.c b/src/sys/ufs/ufs/ufs_bmap.c new file mode 100644 index 0000000..a5e6287 --- /dev/null +++ b/src/sys/ufs/ufs/ufs_bmap.c @@ -0,0 +1,385 @@ +/* + * Copyright (c) 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_bmap.c 8.7 (Berkeley) 3/21/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_bmap.c,v 1.59 2003/10/18 14:10:27 phk Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Bmap converts a the logical block number of a file to its physical block + * number on the disk. The conversion is done by using the logical block + * number to index into the array of block pointers described by the dinode. + */ +int +ufs_bmap(ap) + struct vop_bmap_args /* { + struct vnode *a_vp; + daddr_t a_bn; + struct vnode **a_vpp; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; + } */ *ap; +{ + ufs2_daddr_t blkno; + int error; + + /* + * Check for underlying vnode requests and ensure that logical + * to physical mapping is requested. + */ + if (ap->a_vpp != NULL) + *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; + if (ap->a_bnp == NULL) + return (0); + + error = ufs_bmaparray(ap->a_vp, ap->a_bn, &blkno, NULL, + ap->a_runp, ap->a_runb); + *ap->a_bnp = blkno; + return (error); +} + +/* + * Indirect blocks are now on the vnode for the file. They are given negative + * logical block numbers. 
Indirect blocks are addressed by the negative
+ * address of the first data block to which they point. Double indirect blocks
+ * are addressed by one less than the address of the first indirect block to
+ * which they point. Triple indirect blocks are addressed by one less than
+ * the address of the first double indirect block to which they point.
+ *
+ * ufs_bmaparray does the bmap conversion, and if requested returns the
+ * array of logical blocks which must be traversed to get to a block.
+ * Each entry contains the offset into that block that gets you to the
+ * next block and the disk address of the block (if it is assigned).
+ *
+ * vp/bn identify the file and logical block; the device block number is
+ * stored in *bnp, with -1 standing for "no block assigned".  nbp is only
+ * used when bn names an extended-attribute block (-NXADDR <= bn < 0): it
+ * is flagged BX_ALTDATA and must not be NULL in that case.  If runp (runb)
+ * is non-NULL it receives the number of sequential blocks found after
+ * (before) bn, capped at mnt_iosize_max / f_iosize - 1.
+ *
+ * Returns 0 on success, or the error from ufs_getlbns() or from reading an
+ * indirect block; *bnp is not written on those error paths.
+ */
+
+int
+ufs_bmaparray(vp, bn, bnp, nbp, runp, runb)
+	struct vnode *vp;
+	ufs2_daddr_t bn;
+	ufs2_daddr_t *bnp;
+	struct buf *nbp;
+	int *runp;
+	int *runb;
+{
+	struct inode *ip;
+	struct buf *bp;
+	struct ufsmount *ump;
+	struct mount *mp;
+	struct vnode *devvp;
+	struct indir a[NIADDR+1], *ap;
+	ufs2_daddr_t daddr;
+	ufs_lbn_t metalbn;
+	int error, num, maxrun = 0;
+	int *nump;
+
+	ap = NULL;
+	ip = VTOI(vp);
+	mp = vp->v_mount;
+	ump = VFSTOUFS(mp);
+	devvp = ump->um_devvp;
+
+	if (runp) {
+		maxrun = mp->mnt_iosize_max / mp->mnt_stat.f_iosize - 1;
+		*runp = 0;
+	}
+
+	if (runb) {
+		*runb = 0;
+	}
+
+
+	ap = a;
+	/* ufs_getlbns() reports the number of indirection levels via nump. */
+	nump = &num;
+	error = ufs_getlbns(vp, bn, ap, nump);
+	if (error)
+		return (error);
+
+	num = *nump;
+	if (num == 0) {
+		/* No indirection: a direct block or an extended-attribute block. */
+		if (bn >= 0 && bn < NDADDR) {
+			*bnp = blkptrtodb(ump, DIP(ip, i_db[bn]));
+		} else if (bn < 0 && bn >= -NXADDR) {
+			*bnp = blkptrtodb(ump, ip->i_din2->di_extb[-1 - bn]);
+			if (*bnp == 0)
+				*bnp = -1;
+			if (nbp == NULL)
+				panic("ufs_bmaparray: mapping ext data");
+			nbp->b_xflags |= BX_ALTDATA;
+			return (0);
+		} else {
+			panic("ufs_bmaparray: blkno out of range");
+		}
+		/*
+		 * Since this is FFS independent code, we are out of
+		 * scope for the definitions of BLK_NOCOPY and
+		 * BLK_SNAP, but we do know that they will fall in
+		 * the range 1..um_seqinc, so we use that test and
+		 * return a request for a zeroed out buffer if attempts
+		 * are made to read a BLK_NOCOPY or BLK_SNAP block.
+		 */
+		if ((ip->i_flags & SF_SNAPSHOT) && DIP(ip, i_db[bn]) > 0 &&
+		    DIP(ip, i_db[bn]) < ump->um_seqinc) {
+			*bnp = -1;
+		} else if (*bnp == 0) {
+			if (ip->i_flags & SF_SNAPSHOT)
+				*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+			else
+				*bnp = -1;
+		} else if (runp) {
+			ufs2_daddr_t bnb = bn;
+			/* Count sequential direct blocks after bn ... */
+			for (++bn; bn < NDADDR && *runp < maxrun &&
+			    is_sequential(ump, DIP(ip, i_db[bn - 1]),
+			    DIP(ip, i_db[bn]));
+			    ++bn, ++*runp);
+			bn = bnb;
+			/* ... and, if requested, before it. */
+			if (runb && (bn > 0)) {
+				for (--bn; (bn >= 0) && (*runb < maxrun) &&
+					is_sequential(ump, DIP(ip, i_db[bn]),
+						DIP(ip, i_db[bn+1]));
+						--bn, ++*runb);
+			}
+		}
+		return (0);
+	}
+
+
+	/* Get disk address out of indirect block array */
+	daddr = DIP(ip, i_ib[ap->in_off]);
+
+	for (bp = NULL, ++ap; --num; ++ap) {
+		/*
+		 * Exit the loop if there is no disk address assigned yet and
+		 * the indirect block isn't in the cache, or if we were
+		 * looking for an indirect block and we've found it.
+		 */
+
+		metalbn = ap->in_lbn;
+		if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn)
+			break;
+		/*
+		 * If we get here, we've either got the block in the cache
+		 * or we have a disk address for it, go fetch it.
+		 */
+		if (bp)
+			bqrelse(bp);
+
+		ap->in_exists = 1;
+		bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0, 0);
+		if ((bp->b_flags & B_CACHE) == 0) {
+#ifdef DIAGNOSTIC
+			if (!daddr)
+				panic("ufs_bmaparray: indirect block not in cache");
+#endif
+			bp->b_blkno = blkptrtodb(ump, daddr);
+			bp->b_iocmd = BIO_READ;
+			bp->b_flags &= ~B_INVAL;
+			bp->b_ioflags &= ~BIO_ERROR;
+			vfs_busy_pages(bp, 0);
+			bp->b_iooffset = dbtob(bp->b_blkno);
+			VOP_STRATEGY(bp->b_vp, bp);
+			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
+			error = bufwait(bp);
+			if (error) {
+				brelse(bp);
+				return (error);
+			}
+		}
+
+		/* Block pointers have a different width on UFS1 and UFS2. */
+		if (ip->i_ump->um_fstype == UFS1) {
+			daddr = ((ufs1_daddr_t *)bp->b_data)[ap->in_off];
+			if (num == 1 && daddr && runp) {
+				for (bn = ap->in_off + 1;
+				    bn < MNINDIR(ump) && *runp < maxrun &&
+				    is_sequential(ump,
+				    ((ufs1_daddr_t *)bp->b_data)[bn - 1],
+				    ((ufs1_daddr_t *)bp->b_data)[bn]);
+				    ++bn, ++*runp);
+				bn = ap->in_off;
+				if (runb && bn) {
+					for (--bn; bn >= 0 && *runb < maxrun &&
+					    is_sequential(ump,
+					    ((ufs1_daddr_t *)bp->b_data)[bn],
+					    ((ufs1_daddr_t *)bp->b_data)[bn+1]);
+					    --bn, ++*runb);
+				}
+			}
+			continue;
+		}
+		daddr = ((ufs2_daddr_t *)bp->b_data)[ap->in_off];
+		if (num == 1 && daddr && runp) {
+			for (bn = ap->in_off + 1;
+			    bn < MNINDIR(ump) && *runp < maxrun &&
+			    is_sequential(ump,
+			    ((ufs2_daddr_t *)bp->b_data)[bn - 1],
+			    ((ufs2_daddr_t *)bp->b_data)[bn]);
+			    ++bn, ++*runp);
+			bn = ap->in_off;
+			if (runb && bn) {
+				for (--bn; bn >= 0 && *runb < maxrun &&
+				    is_sequential(ump,
+				    ((ufs2_daddr_t *)bp->b_data)[bn],
+				    ((ufs2_daddr_t *)bp->b_data)[bn + 1]);
+				    --bn, ++*runb);
+			}
+		}
+	}
+	if (bp)
+		bqrelse(bp);
+
+	/*
+	 * Since this is FFS independent code, we are out of scope for the
+	 * definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
+	 * will fall in the range 1..um_seqinc, so we use that test and
+	 * return a request for a zeroed out buffer if attempts are made
+	 * to read a BLK_NOCOPY or BLK_SNAP block.
+	 */
+	if ((ip->i_flags & SF_SNAPSHOT) && daddr > 0 && daddr < ump->um_seqinc){
+		*bnp = -1;
+		return (0);
+	}
+	*bnp = blkptrtodb(ump, daddr);
+	if (*bnp == 0) {
+		if (ip->i_flags & SF_SNAPSHOT)
+			*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
+		else
+			*bnp = -1;
+	}
+	return (0);
+}
+
+/*
+ * Create an array of logical block number/offset pairs which represent the
+ * path of indirect blocks required to access a data block.  The first "pair"
+ * contains the logical block number of the appropriate single, double or
+ * triple indirect block and the offset into the inode indirect block array.
+ * Note, the logical block number of the inode single/double/triple indirect
+ * block appears twice in the array, once with the offset into the i_ib and
+ * once with the offset into the page itself.
+ *
+ * The number of entries written to ap is stored in *nump when nump is
+ * non-NULL; it is 0 for a direct block.  Returns 0 on success, or EFBIG
+ * when bn lies beyond what NIADDR levels of indirection can address.
+ */
+int
+ufs_getlbns(vp, bn, ap, nump)
+	struct vnode *vp;
+	ufs2_daddr_t bn;
+	struct indir *ap;
+	int *nump;
+{
+	ufs2_daddr_t blockcnt;
+	ufs_lbn_t metalbn, realbn;
+	struct ufsmount *ump;
+	int i, numlevels, off;
+
+	ump = VFSTOUFS(vp->v_mount);
+	if (nump)
+		*nump = 0;
+	numlevels = 0;
+	realbn = bn;
+	if (bn < 0)
+		bn = -bn;
+
+	/* The first NDADDR blocks are direct blocks. */
+	if (bn < NDADDR)
+		return (0);
+
+	/*
+	 * Determine the number of levels of indirection.  After this loop
+	 * is done, blockcnt indicates the number of data blocks possible
+	 * at the previous level of indirection, and NIADDR - i is the number
+	 * of levels of indirection needed to locate the requested block.
+	 */
+	for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) {
+		if (i == 0)
+			return (EFBIG);
+		blockcnt *= MNINDIR(ump);
+		if (bn < blockcnt)
+			break;
+	}
+
+	/* Calculate the address of the first meta-block. */
+	if (realbn >= 0)
+		metalbn = -(realbn - bn + NIADDR - i);
+	else
+		metalbn = -(-realbn - bn + NIADDR - i);
+
+	/*
+	 * At each iteration, off is the offset into the bap array which is
+	 * an array of disk addresses at the current level of indirection.
+	 * The logical block number and the offset in that block are stored
+	 * into the argument array.
+	 */
+	ap->in_lbn = metalbn;
+	ap->in_off = off = NIADDR - i;
+	ap->in_exists = 0;
+	ap++;
+	for (++numlevels; i <= NIADDR; i++) {
+		/* If searching for a meta-data block, quit when found. */
+		if (metalbn == realbn)
+			break;
+
+		blockcnt /= MNINDIR(ump);
+		off = (bn / blockcnt) % MNINDIR(ump);
+
+		++numlevels;
+		ap->in_lbn = metalbn;
+		ap->in_off = off;
+		ap->in_exists = 0;
+		++ap;
+
+		metalbn -= -1 + off * blockcnt;
+	}
+	if (nump)
+		*nump = numlevels;
+	return (0);
+}
diff --git a/src/sys/ufs/ufs/ufs_dirhash.c b/src/sys/ufs/ufs/ufs_dirhash.c
new file mode 100644
index 0000000..902e04b
--- /dev/null
+++ b/src/sys/ufs/ufs/ufs_dirhash.c
@@ -0,0 +1,1084 @@
+/*
+ * Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This implements a hash-based lookup scheme for UFS directories.
+ */
+
+#include
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.17 2003/06/11 06:34:30 obrien Exp $");
+
+#include "opt_ufs.h"
+
+#ifdef UFS_DIRHASH
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define WRAPINCR(val, limit)	(((val) + 1 == (limit)) ? 0 : ((val) + 1))	/* next index, wrapping to 0 at limit */
+#define WRAPDECR(val, limit)	(((val) == 0) ? ((limit) - 1) : ((val) - 1))	/* previous index, wrapping to limit - 1 */
+#define OFSFMT(vp)		((vp)->v_mount->mnt_maxsymlinklen <= 0)	/* presumably "old directory format" - confirm */
+#define BLKFREE2IDX(n)		((n) > DH_NFSTATS ? DH_NFSTATS : (n))	/* clamp a free count to the last dh_firstfree index */
+
+static MALLOC_DEFINE(M_DIRHASH, "UFS dirhash", "UFS directory hash tables");
+
+SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
+
+static int ufs_mindirhashsize = DIRBLKSIZ * 5;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_minsize, CTLFLAG_RW,
+    &ufs_mindirhashsize,
+    0, "minimum directory size in bytes for which to use hashed lookup");
+static int ufs_dirhashmaxmem = 2 * 1024 * 1024;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_maxmem, CTLFLAG_RW, &ufs_dirhashmaxmem,
+    0, "maximum allowed dirhash memory usage");
+static int ufs_dirhashmem;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_mem, CTLFLAG_RD, &ufs_dirhashmem,
+    0, "current dirhash memory usage");
+static int ufs_dirhashcheck = 0;
+SYSCTL_INT(_vfs_ufs, OID_AUTO, dirhash_docheck, CTLFLAG_RW, &ufs_dirhashcheck,
+    0, "enable extra sanity tests");
+
+
+static int ufsdirhash_hash(struct dirhash *dh, char *name, int namelen);
+static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff);
+static void ufsdirhash_delslot(struct dirhash *dh, int slot);
+static int ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen,
+	   doff_t offset);
+static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset);
+static int ufsdirhash_recycle(int wanted);
+
+static uma_zone_t	ufsdirhash_zone;	/* zone the per-array hash blocks are allocated from */
+
+/* Dirhash list; recently-used entries are near the tail. */
+static TAILQ_HEAD(, dirhash) ufsdirhash_list;
+
+/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
+static struct mtx	ufsdirhash_mtx;
+
+/*
+ * Locking order:
+ *	ufsdirhash_mtx
+ *	dh_mtx
+ *
+ * The dh_mtx mutex should be acquired either via the inode lock, or via
+ * ufsdirhash_mtx. Only the owner of the inode may free the associated
+ * dirhash, but anything can steal its memory and set dh_hash to NULL.
+ */
+
+/*
+ * Attempt to build up a hash table for the directory contents in
+ * inode 'ip'. Returns 0 on success, or -1 if the operation failed.
+ *
+ * A failure is not an error for the caller: it only means that no hash is
+ * attached to the inode (i_dirhash is NULL), e.g. because the directory is
+ * too small, uses the old format, has been removed, is corrupted, or
+ * because memory could not be obtained within the dirhash_maxmem budget.
+ * On every failure path the memory reserved in ufs_dirhashmem is given
+ * back and the partially built dirhash, including its mutex, is torn down.
+ */
+int
+ufsdirhash_build(struct inode *ip)
+{
+	struct dirhash *dh;
+	struct buf *bp = NULL;
+	struct direct *ep;
+	struct vnode *vp;
+	doff_t bmask, pos;
+	int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
+
+	/* Check if we can/should use dirhash. */
+	if (ip->i_dirhash == NULL) {
+		if (ip->i_size < ufs_mindirhashsize || OFSFMT(ip->i_vnode))
+			return (-1);
+	} else {
+		/* Hash exists, but sysctls could have changed. */
+		if (ip->i_size < ufs_mindirhashsize ||
+		    ufs_dirhashmem > ufs_dirhashmaxmem) {
+			ufsdirhash_free(ip);
+			return (-1);
+		}
+		/* Check if hash exists and is intact (note: unlocked read). */
+		if (ip->i_dirhash->dh_hash != NULL)
+			return (0);
+		/* Free the old, recycled hash and build a new one. */
+		ufsdirhash_free(ip);
+	}
+
+	/* Don't hash removed directories. */
+	if (ip->i_effnlink == 0)
+		return (-1);
+
+	vp = ip->i_vnode;
+	/* Allocate 50% more entries than this dir size could ever need. */
+	KASSERT(ip->i_size >= DIRBLKSIZ, ("ufsdirhash_build size"));
+	nslots = ip->i_size / DIRECTSIZ(1);
+	nslots = (nslots * 3 + 1) / 2;
+	narrays = howmany(nslots, DH_NBLKOFF);
+	nslots = narrays * DH_NBLKOFF;
+	dirblocks = howmany(ip->i_size, DIRBLKSIZ);
+	nblocks = (dirblocks * 3 + 1) / 2;
+
+	/* Reserve the memory in the global accounting before allocating. */
+	memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
+	    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+	    nblocks * sizeof(*dh->dh_blkfree);
+	mtx_lock(&ufsdirhash_mtx);
+	if (memreqd + ufs_dirhashmem > ufs_dirhashmaxmem) {
+		mtx_unlock(&ufsdirhash_mtx);
+		if (memreqd > ufs_dirhashmaxmem / 2)
+			return (-1);
+
+		/* Try to free some space. */
+		if (ufsdirhash_recycle(memreqd) != 0)
+			return (-1);
+		/* Enough was freed, and ufsdirhash_mtx has been locked. */
+	}
+	ufs_dirhashmem += memreqd;
+	mtx_unlock(&ufsdirhash_mtx);
+
+	/*
+	 * Use non-blocking mallocs so that we will revert to a linear
+	 * lookup on failure rather than potentially blocking forever.
+	 */
+	MALLOC(dh, struct dirhash *, sizeof *dh, M_DIRHASH, M_NOWAIT | M_ZERO);
+	if (dh == NULL) {
+		mtx_lock(&ufsdirhash_mtx);
+		ufs_dirhashmem -= memreqd;
+		mtx_unlock(&ufsdirhash_mtx);
+		return (-1);
+	}
+	/*
+	 * Initialise the mutex as soon as `dh' exists, so that every
+	 * `goto fail' below can unconditionally mtx_destroy() it before
+	 * the structure is freed.
+	 */
+	mtx_init(&dh->dh_mtx, "dirhash", NULL, MTX_DEF);
+	MALLOC(dh->dh_hash, doff_t **, narrays * sizeof(dh->dh_hash[0]),
+	    M_DIRHASH, M_NOWAIT | M_ZERO);
+	MALLOC(dh->dh_blkfree, u_int8_t *, nblocks * sizeof(dh->dh_blkfree[0]),
+	    M_DIRHASH, M_NOWAIT);
+	if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
+		goto fail;
+	for (i = 0; i < narrays; i++) {
+		/*
+		 * NOTE(review): M_WAITOK here does not match the
+		 * "non-blocking" intent stated above; left unchanged
+		 * pending confirmation that sleeping is acceptable.
+		 */
+		if ((dh->dh_hash[i] = uma_zalloc(ufsdirhash_zone,
+		    M_WAITOK)) == NULL)
+			goto fail;
+		for (j = 0; j < DH_NBLKOFF; j++)
+			dh->dh_hash[i][j] = DIRHASH_EMPTY;
+	}
+
+	/* Initialise the hash table and block statistics. */
+	dh->dh_narrays = narrays;
+	dh->dh_hlen = nslots;
+	dh->dh_nblk = nblocks;
+	dh->dh_dirblks = dirblocks;
+	for (i = 0; i < dirblocks; i++)
+		dh->dh_blkfree[i] = DIRBLKSIZ / DIRALIGN;
+	for (i = 0; i < DH_NFSTATS; i++)
+		dh->dh_firstfree[i] = -1;
+	dh->dh_firstfree[DH_NFSTATS] = 0;
+	dh->dh_seqopt = 0;
+	dh->dh_seqoff = 0;
+	dh->dh_score = DH_SCOREINIT;
+	ip->i_dirhash = dh;
+
+	/* Walk every entry of the directory and insert it into the hash. */
+	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+	pos = 0;
+	while (pos < ip->i_size) {
+		/* If necessary, get the next directory block. */
+		if ((pos & bmask) == 0) {
+			if (bp != NULL)
+				brelse(bp);
+			if (UFS_BLKATOFF(vp, (off_t)pos, NULL, &bp) != 0)
+				goto fail;
+		}
+
+		/* Add this entry to the hash. */
+		ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
+		if (ep->d_reclen == 0 || ep->d_reclen >
+		    DIRBLKSIZ - (pos & (DIRBLKSIZ - 1))) {
+			/* Corrupted directory. */
+			brelse(bp);
+			goto fail;
+		}
+		if (ep->d_ino != 0) {
+			/* Add the entry (simplified ufsdirhash_add). */
+			slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
+			while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+				slot = WRAPINCR(slot, dh->dh_hlen);
+			dh->dh_hused++;
+			DH_ENTRY(dh, slot) = pos;
+			ufsdirhash_adjfree(dh, pos, -DIRSIZ(0, ep));
+		}
+		pos += ep->d_reclen;
+	}
+
+	if (bp != NULL)
+		brelse(bp);
+	/* Publish the finished hash on the global list. */
+	mtx_lock(&ufsdirhash_mtx);
+	TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
+	dh->dh_onlist = 1;
+	mtx_unlock(&ufsdirhash_mtx);
+	return (0);
+
+fail:
+	if (dh->dh_hash != NULL) {
+		for (i = 0; i < narrays; i++)
+			if (dh->dh_hash[i] != NULL)
+				uma_zfree(ufsdirhash_zone, dh->dh_hash[i]);
+		FREE(dh->dh_hash, M_DIRHASH);
+	}
+	if (dh->dh_blkfree != NULL)
+		FREE(dh->dh_blkfree, M_DIRHASH);
+	/* The dirhash was never put on the list, so nobody can hold dh_mtx. */
+	mtx_destroy(&dh->dh_mtx);
+	FREE(dh, M_DIRHASH);
+	ip->i_dirhash = NULL;
+	mtx_lock(&ufsdirhash_mtx);
+	ufs_dirhashmem -= memreqd;
+	mtx_unlock(&ufsdirhash_mtx);
+	return (-1);
+}
+
+/*
+ * Free any hash table associated with inode 'ip'.
+ *
+ * The dirhash is first unlinked from the global list under both mutexes;
+ * after that it is private to the caller and is torn down without locks.
+ * If the hash arrays were already taken away (dh_hash == NULL) only the
+ * dirhash structure itself is freed and accounted for.
+ */
+void
+ufsdirhash_free(struct inode *ip)
+{
+	struct dirhash *dh;
+	int i, mem;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&ufsdirhash_mtx);
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_onlist)
+		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+	mtx_unlock(&dh->dh_mtx);
+	mtx_unlock(&ufsdirhash_mtx);
+
+	/* The dirhash pointed to by 'dh' is exclusively ours now. */
+
+	mem = sizeof(*dh);
+	if (dh->dh_hash != NULL) {
+		for (i = 0; i < dh->dh_narrays; i++)
+			uma_zfree(ufsdirhash_zone, dh->dh_hash[i]);
+		FREE(dh->dh_hash, M_DIRHASH);
+		FREE(dh->dh_blkfree, M_DIRHASH);
+		mem += dh->dh_narrays * sizeof(*dh->dh_hash) +
+		    dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
+	}
+	mtx_destroy(&dh->dh_mtx);
+	FREE(dh, M_DIRHASH);
+	ip->i_dirhash = NULL;
+
+	/* Return the freed bytes to the global accounting. */
+	mtx_lock(&ufsdirhash_mtx);
+	ufs_dirhashmem -= mem;
+	mtx_unlock(&ufsdirhash_mtx);
+}
+
+/*
+ * Find the offset of the specified name within the given inode.
+ * Returns 0 on success, ENOENT if the entry does not exist, or
+ * EJUSTRETURN if the caller should revert to a linear search.
+ *
+ * If successful, the directory offset is stored in *offp, and a
+ * pointer to a struct buf containing the entry is stored in *bpp. If
+ * prevoffp is non-NULL, the offset of the previous entry within
+ * the DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
+ * is the first in a block, the start of the block is used).
+ *
+ * On success the buffer handed back in *bpp is not released here. On
+ * every other return any buffer read by this function has been released.
+ *
+ * dh_mtx is dropped before each candidate directory block is read and
+ * compared, and is only re-taken when the name did not match; the hash
+ * may have been taken away in the meantime, which is re-checked.
+ */
+int
+ufsdirhash_lookup(struct inode *ip, char *name, int namelen, doff_t *offp,
+    struct buf **bpp, doff_t *prevoffp)
+{
+	struct dirhash *dh, *dh_next;
+	struct direct *dp;
+	struct vnode *vp;
+	struct buf *bp;
+	doff_t blkoff, bmask, offset, prevoff;
+	int i, slot;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return (EJUSTRETURN);
+	/*
+	 * Move this dirhash towards the end of the list if it has a
+	 * score higher than the next entry, and acquire the dh_mtx.
+	 * Optimise the case where it's already the last by performing
+	 * an unlocked read of the TAILQ_NEXT pointer.
+	 *
+	 * In both cases, end up holding just dh_mtx.
+	 */
+	if (TAILQ_NEXT(dh, dh_list) != NULL) {
+		mtx_lock(&ufsdirhash_mtx);
+		mtx_lock(&dh->dh_mtx);
+		/*
+		 * If the new score will be greater than that of the next
+		 * entry, then move this entry past it. With both mutexes
+		 * held, dh_next won't go away, but its dh_score could
+		 * change; that's not important since it is just a hint.
+		 */
+		if (dh->dh_hash != NULL &&
+		    (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
+		    dh->dh_score >= dh_next->dh_score) {
+			KASSERT(dh->dh_onlist, ("dirhash: not on list"));
+			TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+			TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
+			    dh_list);
+		}
+		mtx_unlock(&ufsdirhash_mtx);
+	} else {
+		/* Already the last, though that could change as we wait. */
+		mtx_lock(&dh->dh_mtx);
+	}
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return (EJUSTRETURN);
+	}
+
+	/* Update the score. */
+	if (dh->dh_score < DH_SCOREMAX)
+		dh->dh_score++;
+
+	vp = ip->i_vnode;
+	bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
+	blkoff = -1;	/* no directory block loaded yet */
+	bp = NULL;
+restart:
+	slot = ufsdirhash_hash(dh, name, namelen);
+
+	if (dh->dh_seqopt) {
+		/*
+		 * Sequential access optimisation. dh_seqoff contains the
+		 * offset of the directory entry immediately following
+		 * the last entry that was looked up. Check if this offset
+		 * appears in the hash chain for the name we are looking for.
+		 */
+		for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
+		    i = WRAPINCR(i, dh->dh_hlen))
+			if (offset == dh->dh_seqoff)
+				break;
+		if (offset == dh->dh_seqoff) {
+			/*
+			 * We found an entry with the expected offset. This
+			 * is probably the entry we want, but if not, the
+			 * code below will turn off seqoff and retry.
+			 */
+			slot = i;
+		} else
+			dh->dh_seqopt = 0;
+	}
+
+	for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY;
+	    slot = WRAPINCR(slot, dh->dh_hlen)) {
+		if (offset == DIRHASH_DEL)
+			continue;
+		mtx_unlock(&dh->dh_mtx);	/* not held across the block read below */
+
+		if (offset < 0 || offset >= ip->i_size)
+			panic("ufsdirhash_lookup: bad offset in hash array");
+		if ((offset & ~bmask) != blkoff) {
+			if (bp != NULL)
+				brelse(bp);
+			blkoff = offset & ~bmask;
+			if (UFS_BLKATOFF(vp, (off_t)blkoff, NULL, &bp) != 0)
+				return (EJUSTRETURN);
+		}
+		dp = (struct direct *)(bp->b_data + (offset & bmask));
+		if (dp->d_reclen == 0 || dp->d_reclen >
+		    DIRBLKSIZ - (offset & (DIRBLKSIZ - 1))) {
+			/* Corrupted directory. */
+			brelse(bp);
+			return (EJUSTRETURN);
+		}
+		if (dp->d_namlen == namelen &&
+		    bcmp(dp->d_name, name, namelen) == 0) {
+			/* Found. Get the prev offset if needed. */
+			if (prevoffp != NULL) {
+				if (offset & (DIRBLKSIZ - 1)) {
+					prevoff = ufsdirhash_getprev(dp,
+					    offset);
+					if (prevoff == -1) {
+						brelse(bp);
+						return (EJUSTRETURN);
+					}
+				} else
+					prevoff = offset;
+				*prevoffp = prevoff;
+			}
+
+			/* Check for sequential access, and update offset. */
+			if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
+				dh->dh_seqopt = 1;
+			dh->dh_seqoff = offset + DIRSIZ(0, dp);
+
+			*bpp = bp;
+			*offp = offset;
+			return (0);
+		}
+
+		mtx_lock(&dh->dh_mtx);
+		if (dh->dh_hash == NULL) {
+			mtx_unlock(&dh->dh_mtx);
+			if (bp != NULL)
+				brelse(bp);
+			ufsdirhash_free(ip);
+			return (EJUSTRETURN);
+		}
+		/*
+		 * When the name doesn't match in the seqopt case, go back
+		 * and search normally.
+		 */
+		if (dh->dh_seqopt) {
+			dh->dh_seqopt = 0;
+			goto restart;
+		}
+	}
+	mtx_unlock(&dh->dh_mtx);
+	if (bp != NULL)
+		brelse(bp);
+	return (ENOENT);
+}
+
+/*
+ * Find a directory block with room for 'slotneeded' bytes. Returns
+ * the offset of the directory entry that begins the free space.
+ * This will either be the offset of an existing entry that has free
+ * space at the end, or the offset of an entry with d_ino == 0 at
+ * the start of a DIRBLKSIZ block.
+ *
+ * To use the space, the caller may need to compact existing entries in
+ * the directory. The total number of bytes in all of the entries involved
+ * in the compaction is stored in *slotsize. In other words, all of
+ * the entries that must be compacted are exactly contained in the
+ * region beginning at the returned offset and spanning *slotsize bytes.
+ *
+ * Returns -1 if no space was found, indicating that the directory
+ * must be extended.
+ *
+ * -1 is also returned when no usable dirhash is attached, when the block
+ * cannot be read, or when a zero-length or overlong record is met while
+ * scanning it. The block buffer is always released before returning.
+ */
+doff_t
+ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
+{
+	struct direct *dp;
+	struct dirhash *dh;
+	struct buf *bp;
+	doff_t pos, slotstart;
+	int dirblock, error, freebytes, i;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return (-1);
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return (-1);
+	}
+
+	/* Find a directory block with the desired free space. */
+	dirblock = -1;
+	for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
+		if ((dirblock = dh->dh_firstfree[i]) != -1)
+			break;
+	if (dirblock == -1) {
+		mtx_unlock(&dh->dh_mtx);
+		return (-1);
+	}
+
+	KASSERT(dirblock < dh->dh_nblk &&
+	    dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN),
+	    ("ufsdirhash_findfree: bad stats"));
+	mtx_unlock(&dh->dh_mtx);
+	pos = dirblock * DIRBLKSIZ;
+	error = UFS_BLKATOFF(ip->i_vnode, (off_t)pos, (char **)&dp, &bp);
+	if (error)
+		return (-1);
+
+	/* Find the first entry with free space. */
+	for (i = 0; i < DIRBLKSIZ; ) {
+		if (dp->d_reclen == 0) {
+			brelse(bp);
+			return (-1);
+		}
+		if (dp->d_ino == 0 || dp->d_reclen > DIRSIZ(0, dp))
+			break;
+		i += dp->d_reclen;
+		dp = (struct direct *)((char *)dp + dp->d_reclen);
+	}
+	if (i > DIRBLKSIZ) {
+		brelse(bp);
+		return (-1);
+	}
+	slotstart = pos + i;
+
+	/* Find the range of entries needed to get enough space */
+	freebytes = 0;
+	while (i < DIRBLKSIZ && freebytes < slotneeded) {
+		freebytes += dp->d_reclen;
+		if (dp->d_ino != 0)
+			freebytes -= DIRSIZ(0, dp);
+		if (dp->d_reclen == 0) {
+			brelse(bp);
+			return (-1);
+		}
+		i += dp->d_reclen;
+		dp = (struct direct *)((char *)dp + dp->d_reclen);
+	}
+	if (i > DIRBLKSIZ) {
+		brelse(bp);
+		return (-1);
+	}
+	if (freebytes < slotneeded)	/* block contents disagree with the dirhash statistics */
+		panic("ufsdirhash_findfree: free mismatch");
+	brelse(bp);
+	*slotsize = pos + i - slotstart;
+	return (slotstart);
+}
+
+/*
+ * Return the start of the unused space at the end of a directory, or
+ * -1 if there are no trailing unused blocks.
+ *
+ * A block counts as unused when its dh_blkfree entry equals
+ * DIRBLKSIZ / DIRALIGN, i.e. the whole block is free. -1 is also
+ * returned when no usable dirhash is attached to the inode.
+ */
+doff_t
+ufsdirhash_enduseful(struct inode *ip)
+{
+
+	struct dirhash *dh;
+	int i;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return (-1);
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return (-1);
+	}
+
+	if (dh->dh_blkfree[dh->dh_dirblks - 1] != DIRBLKSIZ / DIRALIGN) {
+		mtx_unlock(&dh->dh_mtx);
+		return (-1);
+	}
+
+	for (i = dh->dh_dirblks - 1; i >= 0; i--)
+		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
+			break;
+	mtx_unlock(&dh->dh_mtx);
+	return ((doff_t)(i + 1) * DIRBLKSIZ);	/* first block after the last one in use */
+}
+
+/*
+ * Insert information into the hash about a new directory entry. dirp
+ * points to a struct direct containing the entry, and offset specifies
+ * the offset of this entry.
+ *
+ * Does nothing when no dirhash is attached. If the hash memory has been
+ * taken away, or the table has become too full, the dirhash is freed
+ * instead and the entry is not recorded.
+ */
+void
+ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+	struct dirhash *dh;
+	int slot;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
+	    ("ufsdirhash_add: bad offset"));
+	/*
+	 * Normal hash usage is < 66%. If the usage gets too high then
+	 * remove the hash entirely and let it be rebuilt later.
+	 */
+	if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	/* Find a free hash slot (empty or deleted), and add the entry. */
+	slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
+	while (DH_ENTRY(dh, slot) >= 0)
+		slot = WRAPINCR(slot, dh->dh_hlen);
+	if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)	/* a reused deleted slot is already counted */
+		dh->dh_hused++;
+	DH_ENTRY(dh, slot) = offset;
+
+	/* Update the per-block summary info. */
+	ufsdirhash_adjfree(dh, offset, -DIRSIZ(0, dirp));
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Remove the specified directory entry from the hash. The entry to remove
+ * is defined by the name in `dirp', which must exist at the specified
+ * `offset' within the directory.
+ *
+ * Does nothing when no dirhash is attached; frees the dirhash when its
+ * hash memory has been taken away. A missing entry makes
+ * ufsdirhash_findslot() panic.
+ */
+void
+ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
+{
+	struct dirhash *dh;
+	int slot;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	KASSERT(offset < dh->dh_dirblks * DIRBLKSIZ,
+	    ("ufsdirhash_remove: bad offset"));
+	/* Find the entry */
+	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
+
+	/* Remove the hash entry. */
+	ufsdirhash_delslot(dh, slot);
+
+	/* Update the per-block summary info. */
+	ufsdirhash_adjfree(dh, offset, DIRSIZ(0, dirp));
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Change the offset associated with a directory entry in the hash. Used
+ * when compacting directory blocks.
+ *
+ * Only the stored offset is rewritten; the per-block free statistics are
+ * not touched here.
+ */
+void
+ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
+    doff_t newoff)
+{
+	struct dirhash *dh;
+	int slot;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	KASSERT(oldoff < dh->dh_dirblks * DIRBLKSIZ &&
+	    newoff < dh->dh_dirblks * DIRBLKSIZ,
+	    ("ufsdirhash_move: bad offset"));
+	/* Find the entry, and update the offset. */
+	slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
+	DH_ENTRY(dh, slot) = newoff;
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Inform dirhash that the directory has grown by one block that
+ * begins at offset (i.e. the new length is offset + DIRBLKSIZ).
+ *
+ * If the new block does not fit in the dh_nblk entries allocated for
+ * dh_blkfree, the dirhash is freed so that it can be rebuilt later.
+ */
+void
+ufsdirhash_newblk(struct inode *ip, doff_t offset)
+{
+	struct dirhash *dh;
+	int block;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	KASSERT(offset == dh->dh_dirblks * DIRBLKSIZ,
+	    ("ufsdirhash_newblk: bad offset"));
+	block = offset / DIRBLKSIZ;
+	if (block >= dh->dh_nblk) {
+		/* Out of space; must rebuild. */
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+	dh->dh_dirblks = block + 1;
+
+	/* Account for the new free block. */
+	dh->dh_blkfree[block] = DIRBLKSIZ / DIRALIGN;
+	if (dh->dh_firstfree[DH_NFSTATS] == -1)
+		dh->dh_firstfree[DH_NFSTATS] = block;
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Inform dirhash that the directory is being truncated.
+ *
+ * `offset' is the new length; it is rounded up to whole DIRBLKSIZ blocks.
+ * Panics if any block being cut off is not completely free, or if a
+ * first-free entry other than the last one still points into the
+ * truncated range.
+ */
+void
+ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
+{
+	struct dirhash *dh;
+	int block, i;
+
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	KASSERT(offset <= dh->dh_dirblks * DIRBLKSIZ,
+	    ("ufsdirhash_dirtrunc: bad offset"));
+	block = howmany(offset, DIRBLKSIZ);
+	/*
+	 * If the directory shrinks to less than 1/8 of dh_nblk blocks
+	 * (about 20% of its original size due to the 50% extra added in
+	 * ufsdirhash_build) then free it, and let the caller rebuild
+	 * if necessary.
+	 */
+	if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	/*
+	 * Remove any `first free' information pertaining to the
+	 * truncated blocks. All blocks we're removing should be
+	 * completely unused.
+	 */
+	if (dh->dh_firstfree[DH_NFSTATS] >= block)
+		dh->dh_firstfree[DH_NFSTATS] = -1;
+	for (i = block; i < dh->dh_dirblks; i++)
+		if (dh->dh_blkfree[i] != DIRBLKSIZ / DIRALIGN)
+			panic("ufsdirhash_dirtrunc: blocks in use");
+	for (i = 0; i < DH_NFSTATS; i++)
+		if (dh->dh_firstfree[i] >= block)
+			panic("ufsdirhash_dirtrunc: first free corrupt");
+	dh->dh_dirblks = block;
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Debugging function to check that the dirhash information about
+ * a directory block matches its actual contents. Panics if a mismatch
+ * is detected.
+ *
+ * On entry, `buf' should point to the start of an in-core
+ * DIRBLKSIZ-sized directory block, and `offset' should contain the
+ * offset from the start of the directory of that block.
+ *
+ * The check only runs when ufs_dirhashcheck is non-zero and a dirhash is
+ * attached to the inode. It verifies that every in-use entry of the
+ * block is present in the hash at its offset, that the free-space count
+ * recorded for the block matches the block contents, and that the
+ * first-free table is consistent with that count.
+ */
+void
+ufsdirhash_checkblock(struct inode *ip, char *buf, doff_t offset)
+{
+	struct dirhash *dh;
+	struct direct *dp;
+	int block, ffslot, i, nfree;
+
+	if (!ufs_dirhashcheck)
+		return;
+	if ((dh = ip->i_dirhash) == NULL)
+		return;
+	mtx_lock(&dh->dh_mtx);
+	if (dh->dh_hash == NULL) {
+		mtx_unlock(&dh->dh_mtx);
+		ufsdirhash_free(ip);
+		return;
+	}
+
+	block = offset / DIRBLKSIZ;
+	if ((offset & (DIRBLKSIZ - 1)) != 0 || block >= dh->dh_dirblks)
+		panic("ufsdirhash_checkblock: bad offset");
+
+	nfree = 0;	/* free bytes found in this block */
+	for (i = 0; i < DIRBLKSIZ; i += dp->d_reclen) {
+		dp = (struct direct *)(buf + i);
+		if (dp->d_reclen == 0 || i + dp->d_reclen > DIRBLKSIZ)
+			panic("ufsdirhash_checkblock: bad dir");
+
+		if (dp->d_ino == 0) {
+#if 0
+			/*
+			 * XXX entries with d_ino == 0 should only occur
+			 * at the start of a DIRBLKSIZ block. However the
+			 * ufs code is tolerant of such entries at other
+			 * offsets, and fsck does not fix them.
+			 */
+			if (i != 0)
+				panic("ufsdirhash_checkblock: bad dir inode");
+#endif
+			nfree += dp->d_reclen;
+			continue;
+		}
+
+		/* Check that the entry exists (will panic if it doesn't). */
+		ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
+
+		nfree += dp->d_reclen - DIRSIZ(0, dp);
+	}
+	if (i != DIRBLKSIZ)
+		panic("ufsdirhash_checkblock: bad dir end");
+
+	if (dh->dh_blkfree[block] * DIRALIGN != nfree)
+		panic("ufsdirhash_checkblock: bad free count");
+
+	ffslot = BLKFREE2IDX(nfree / DIRALIGN);
+	for (i = 0; i <= DH_NFSTATS; i++)
+		if (dh->dh_firstfree[i] == block && i != ffslot)
+			panic("ufsdirhash_checkblock: bad first-free");
+	if (dh->dh_firstfree[ffslot] == -1)
+		panic("ufsdirhash_checkblock: missing first-free entry");
+	mtx_unlock(&dh->dh_mtx);
+}
+
+/*
+ * Hash the specified filename into a dirhash slot.
+ *
+ * Returns a slot index in the range [0, dh->dh_hlen). The result depends
+ * on the name bytes and on the address of the dirhash itself, so the same
+ * name maps to the same slot for as long as the dirhash exists.
+ */
+static int
+ufsdirhash_hash(struct dirhash *dh, char *name, int namelen)
+{
+	u_int32_t hash;
+
+	/*
+	 * We hash the name and then some other bit of data that is
+	 * invariant over the dirhash's lifetime. Otherwise names
+	 * differing only in the last byte are placed close to one
+	 * another in the table, which is bad for linear probing.
+	 *
+	 * The invariant data is the value of the `dh' pointer, hence
+	 * &dh with sizeof(dh). Passing `dh' itself would instead hash
+	 * the first pointer-sized bytes of the structure's contents,
+	 * which are not guaranteed to stay constant.
+	 */
+	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
+	hash = fnv_32_buf(&dh, sizeof(dh), hash);
+	return (hash % dh->dh_hlen);
+}
+
+/*
+ * Adjust the number of free bytes in the block containing `offset'
+ * by the value specified by `diff'.
+ *
+ * The caller must ensure we have exclusive access to `dh'; normally
+ * that means that dh_mtx should be held, but this is also called
+ * from ufsdirhash_build() where exclusive access can be assumed.
+ *
+ * dh_blkfree[] counts free space in units of DIRALIGN bytes, and
+ * dh_firstfree[n] holds the lowest-numbered block whose clamped free
+ * count (BLKFREE2IDX) is n, or -1 if there is none. Both are kept
+ * consistent here.
+ */
+static void
+ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff)
+{
+	int block, i, nfidx, ofidx;
+
+	/* Update the per-block summary info. */
+	block = offset / DIRBLKSIZ;
+	KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks,
+	     ("dirhash bad offset"));
+	ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+	dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
+	nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
+
+	/* Update the `first free' list if necessary. */
+	if (ofidx != nfidx) {
+		/* If removing, scan forward for the next block. */
+		if (dh->dh_firstfree[ofidx] == block) {
+			for (i = block + 1; i < dh->dh_dirblks; i++)
+				if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
+					break;
+			dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
+		}
+
+		/* Make this the new `first free' if necessary */
+		if (dh->dh_firstfree[nfidx] > block ||
+		    dh->dh_firstfree[nfidx] == -1)
+			dh->dh_firstfree[nfidx] = block;
+	}
+}
+
+/*
+ * Find the specified name which should have the specified offset.
+ * Returns a slot number, and panics on failure.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ */
+static int
+ufsdirhash_findslot(struct dirhash *dh, char *name, int namelen, doff_t offset)
+{
+	int slot;
+
+	mtx_assert(&dh->dh_mtx, MA_OWNED);
+
+	/*
+	 * Find the entry: start at the slot the name hashes to and probe
+	 * forward (with wrap-around) until either the wanted offset or an
+	 * empty slot is reached.
+	 */
+	KASSERT(dh->dh_hused < dh->dh_hlen, ("dirhash find full"));
+	slot = ufsdirhash_hash(dh, name, namelen);
+	while (DH_ENTRY(dh, slot) != offset &&
+	    DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
+		slot = WRAPINCR(slot, dh->dh_hlen);
+	if (DH_ENTRY(dh, slot) != offset)
+		panic("ufsdirhash_findslot: '%.*s' not found", namelen, name);
+
+	return (slot);
+}
+
+/*
+ * Remove the entry corresponding to the specified slot from the hash array.
+ *
+ * `dh' must be locked on entry and remains so on return.
+ *
+ * The slot is first marked DIRHASH_DEL. Only when the run of deleted
+ * slots it belongs to is followed by a DIRHASH_EMPTY slot is the run
+ * converted back to DIRHASH_EMPTY, with dh_hused decremented once per
+ * slot converted.
+ */
+static void
+ufsdirhash_delslot(struct dirhash *dh, int slot)
+{
+	int i;
+
+	mtx_assert(&dh->dh_mtx, MA_OWNED);
+
+	/* Mark the entry as deleted. */
+	DH_ENTRY(dh, slot) = DIRHASH_DEL;
+
+	/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
+	for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
+		i = WRAPINCR(i, dh->dh_hlen);
+	if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
+		i = WRAPDECR(i, dh->dh_hlen);
+		while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
+			DH_ENTRY(dh, i) = DIRHASH_EMPTY;
+			dh->dh_hused--;
+			i = WRAPDECR(i, dh->dh_hlen);
+		}
+		KASSERT(dh->dh_hused >= 0, ("ufsdirhash_delslot neg hlen"));
+	}
+}
+
+/*
+ * Given a directory entry and its offset, find the offset of the
+ * previous entry in the same DIRBLKSIZ-sized block. Returns an
+ * offset, or -1 if there is no previous entry in the block or some
+ * other problem occurred.
+ *
+ * The start of the block is located by stepping back from `dirp' by
+ * the entry's position within the block, so `dirp' must point into a
+ * buffer that holds the block from its beginning.
+ */
+static doff_t
+ufsdirhash_getprev(struct direct *dirp, doff_t offset)
+{
+	struct direct *dp;
+	char *blkbuf;
+	doff_t blkoff, prevoff;
+	int entrypos, i;
+
+	blkoff = offset & ~(DIRBLKSIZ - 1);	/* offset of start of block */
+	entrypos = offset & (DIRBLKSIZ - 1);	/* entry relative to block */
+	blkbuf = (char *)dirp - entrypos;
+	prevoff = blkoff;
+
+	/* If `offset' is the start of a block, there is no previous entry. */
+	if (entrypos == 0)
+		return (-1);
+
+	/* Scan from the start of the block until we get to the entry. */
+	for (i = 0; i < entrypos; i += dp->d_reclen) {
+		dp = (struct direct *)(blkbuf + i);
+		if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
+			return (-1);	/* Corrupted directory. */
+		prevoff = blkoff + i;
+	}
+	return (prevoff);
+}
+
+/*
+ * Try to free up `wanted' bytes by stealing memory from existing
+ * dirhashes. Returns zero with ufsdirhash_mtx locked if successful.
+ *
+ * Returns -1 with ufsdirhash_mtx unlocked if the list is empty or if
+ * the dirhash at the head of the list still has a positive score
+ * after being decremented. ufsdirhash_mtx is dropped while detached
+ * memory is being freed and re-taken before the accounting update.
+ */
+static int
+ufsdirhash_recycle(int wanted)
+{
+	struct dirhash *dh;
+	doff_t **hash;
+	u_int8_t *blkfree;
+	int i, mem, narrays;
+
+	mtx_lock(&ufsdirhash_mtx);
+	while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
+		/* Find a dirhash, and lock it. */
+		if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
+			mtx_unlock(&ufsdirhash_mtx);
+			return (-1);
+		}
+		mtx_lock(&dh->dh_mtx);
+		KASSERT(dh->dh_hash != NULL, ("dirhash: NULL hash on list"));
+
+		/* Decrement the score; only recycle if it becomes zero. */
+		if (--dh->dh_score > 0) {
+			mtx_unlock(&dh->dh_mtx);
+			mtx_unlock(&ufsdirhash_mtx);
+			return (-1);
+		}
+
+		/* Remove it from the list and detach its memory. */
+		TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
+		dh->dh_onlist = 0;
+		hash = dh->dh_hash;
+		dh->dh_hash = NULL;
+		blkfree = dh->dh_blkfree;
+		dh->dh_blkfree = NULL;
+		narrays = dh->dh_narrays;
+		mem = narrays * sizeof(*dh->dh_hash) +
+		    narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
+		    dh->dh_nblk * sizeof(*dh->dh_blkfree);
+
+		/* Unlock everything, free the detached memory. */
+		mtx_unlock(&dh->dh_mtx);
+		mtx_unlock(&ufsdirhash_mtx);
+		for (i = 0; i < narrays; i++)
+			uma_zfree(ufsdirhash_zone, hash[i]);
+		FREE(hash, M_DIRHASH);
+		FREE(blkfree, M_DIRHASH);
+
+		/* Account for the returned memory, and repeat if necessary. */
+		mtx_lock(&ufsdirhash_mtx);
+		ufs_dirhashmem -= mem;
+	}
+	/* Success; return with ufsdirhash_mtx locked. */
+	return (0);
+}
+
+
+void
+ufsdirhash_init()
+{
+	ufsdirhash_zone = uma_zcreate("DIRHASH", DH_NBLKOFF * sizeof(doff_t),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	mtx_init(&ufsdirhash_mtx, "dirhash list", NULL, MTX_DEF);
+	TAILQ_INIT(&ufsdirhash_list);
+}
+
+void
+ufsdirhash_uninit()
+{
+	KASSERT(TAILQ_EMPTY(&ufsdirhash_list), ("ufsdirhash_uninit"));
+	uma_zdestroy(ufsdirhash_zone);
+	mtx_destroy(&ufsdirhash_mtx);
+}
+
+#endif /* UFS_DIRHASH */
diff --git a/src/sys/ufs/ufs/ufs_extattr.c b/src/sys/ufs/ufs/ufs_extattr.c
new file mode 100644
index 0000000..11a4d75
--- /dev/null
+++ b/src/sys/ufs/ufs/ufs_extattr.c
@@ -0,0 +1,1301 @@
+/*-
+ * Copyright (c) 1999, 2000, 2001, 2002 Robert N. M. Watson
+ * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * Support for filesystem extended attribute: UFS-specific support functions. + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_extattr.c,v 1.67 2003/07/28 18:53:28 rwatson Exp $"); + +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#ifdef UFS_EXTATTR + +static MALLOC_DEFINE(M_UFS_EXTATTR, "ufs_extattr", "ufs extended attribute"); + +static int ufs_extattr_sync = 0; +SYSCTL_INT(_debug, OID_AUTO, ufs_extattr_sync, CTLFLAG_RW, &ufs_extattr_sync, + 0, ""); + +static int ufs_extattr_valid_attrname(int attrnamespace, + const char *attrname); +static int ufs_extattr_enable_with_open(struct ufsmount *ump, + struct vnode *vp, int attrnamespace, const char *attrname, + struct thread *td); +static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct vnode *backing_vnode, + struct thread *td); +static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace, + const char *attrname, struct thread *td); +static int ufs_extattr_get(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, size_t *size, + struct ucred *cred, struct thread *td); +static int ufs_extattr_set(struct vnode *vp, int attrnamespace, + const char *name, struct uio *uio, struct ucred *cred, + struct thread *td); 
+static int ufs_extattr_rm(struct vnode *vp, int attrnamespace, + const char *name, struct ucred *cred, struct thread *td); + +/* + * Per-FS attribute lock protecting attribute operations. + * XXX Right now there is a lot of lock contention due to having a single + * lock per-FS; really, this should be far more fine-grained. + */ +static void +ufs_extattr_uepm_lock(struct ufsmount *ump, struct thread *td) +{ + + /* Ideally, LK_CANRECURSE would not be used, here. */ + lockmgr(&ump->um_extattr.uepm_lock, LK_EXCLUSIVE | LK_RETRY | + LK_CANRECURSE, 0, td); +} + +static void +ufs_extattr_uepm_unlock(struct ufsmount *ump, struct thread *td) +{ + + lockmgr(&ump->um_extattr.uepm_lock, LK_RELEASE, 0, td); +} + +/* + * Determine whether the name passed is a valid name for an actual + * attribute. + * + * Invalid currently consists of: + * NULL pointer for attrname + * zero-length attrname (used to retrieve application attribute list) + */ +static int +ufs_extattr_valid_attrname(int attrnamespace, const char *attrname) +{ + + if (attrname == NULL) + return (0); + if (strlen(attrname) == 0) + return (0); + return (1); +} + +/* + * Locate an attribute given a name and mountpoint. + * Must be holding uepm lock for the mount point. + */ +static struct ufs_extattr_list_entry * +ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace, + const char *attrname) +{ + struct ufs_extattr_list_entry *search_attribute; + + for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list); + search_attribute; + search_attribute = LIST_NEXT(search_attribute, uele_entries)) { + if (!(strncmp(attrname, search_attribute->uele_attrname, + UFS_EXTATTR_MAXEXTATTRNAME)) && + (attrnamespace == search_attribute->uele_attrnamespace)) { + return (search_attribute); + } + } + + return (0); +} + +/* + * Initialize per-FS structures supporting extended attributes. Do not + * start extended attributes yet. 
+ *
+ * Clears uepm_flags, initializes the attribute list and the lockmgr
+ * lock, and only then sets UFS_EXTATTR_UEPM_INITIALIZED.
+ */
+void
+ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
+{
+
+	uepm->uepm_flags = 0;
+
+	LIST_INIT(&uepm->uepm_list);
+	/* XXX is PVFS right, here? */
+	lockinit(&uepm->uepm_lock, PVFS, "extattr", 0, 0);
+	uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
+}
+
+/*
+ * Destroy per-FS structures supporting extended attributes. Assumes
+ * that EAs have already been stopped, and will panic if not. Also
+ * panics if the structure was never initialized.
+ */
+void
+ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
+{
+
+	if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
+		panic("ufs_extattr_uepm_destroy: not initialized");
+
+	if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
+		panic("ufs_extattr_uepm_destroy: called while still started");
+
+	/*
+	 * It's not clear that either order for the next two lines is
+	 * ideal, and it should never be a problem if this is only called
+	 * during unmount, and with vfs_busy().
+	 */
+	uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
+	lockdestroy(&uepm->uepm_lock);
+}
+
+/*
+ * Start extended attribute support on an FS.
+ *
+ * Returns EOPNOTSUPP if the per-mount state has not been initialized
+ * and EBUSY if attributes are already started. On success the
+ * STARTED flag is set and a reference to the calling thread's
+ * credential is stored in uepm_ucred. The uepm lock is held for the
+ * duration of the call.
+ */
+int
+ufs_extattr_start(struct mount *mp, struct thread *td)
+{
+	struct ufsmount *ump;
+	int error = 0;
+
+	ump = VFSTOUFS(mp);
+
+	ufs_extattr_uepm_lock(ump, td);
+
+	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
+		error = EOPNOTSUPP;
+		goto unlock;
+	}
+	if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
+		error = EBUSY;
+		goto unlock;
+	}
+
+	ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
+
+	ump->um_extattr.uepm_ucred = crhold(td->td_ucred);
+
+unlock:
+	ufs_extattr_uepm_unlock(ump, td);
+
+	return (error);
+}
+
+#ifdef UFS_EXTATTR_AUTOSTART
+/*
+ * Helper routine: given a locked parent directory and filename, return
+ * the locked vnode of the inode associated with the name. Will not
+ * follow symlinks, may return any type of vnode. Lock on parent will
+ * be released even in the event of a failure.
In the event that the + * target is the parent (i.e., "."), there will be two references and + * one lock, requiring the caller to possibly special-case. + */ +#define UE_GETDIR_LOCKPARENT 1 +#define UE_GETDIR_LOCKPARENT_DONT 2 +static int +ufs_extattr_lookup(struct vnode *start_dvp, int lockparent, char *dirname, + struct vnode **vp, struct thread *td) +{ + struct vop_cachedlookup_args vargs; + struct componentname cnp; + struct vnode *target_vp; + int error; + + bzero(&cnp, sizeof(cnp)); + cnp.cn_nameiop = LOOKUP; + cnp.cn_flags = ISLASTCN; + if (lockparent == UE_GETDIR_LOCKPARENT) + cnp.cn_flags |= LOCKPARENT; + cnp.cn_thread = td; + cnp.cn_cred = td->td_ucred; + cnp.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + cnp.cn_nameptr = cnp.cn_pnbuf; + error = copystr(dirname, cnp.cn_pnbuf, MAXPATHLEN, + (size_t *) &cnp.cn_namelen); + if (error) { + if (lockparent == UE_GETDIR_LOCKPARENT_DONT) { + VOP_UNLOCK(start_dvp, 0, td); + } + uma_zfree(namei_zone, cnp.cn_pnbuf); + printf("ufs_extattr_lookup: copystr failed\n"); + return (error); + } + cnp.cn_namelen--; /* trim nul termination */ + vargs.a_desc = NULL; + vargs.a_dvp = start_dvp; + vargs.a_vpp = &target_vp; + vargs.a_cnp = &cnp; + error = ufs_lookup(&vargs); + uma_zfree(namei_zone, cnp.cn_pnbuf); + if (error) { + /* + * Error condition, may have to release the lock on the parent + * if ufs_lookup() didn't. + */ + if (!(cnp.cn_flags & PDIRUNLOCK) && + (lockparent == UE_GETDIR_LOCKPARENT_DONT)) + VOP_UNLOCK(start_dvp, 0, td); + + /* + * Check that ufs_lookup() didn't release the lock when we + * didn't want it to. 
+ */ + if ((cnp.cn_flags & PDIRUNLOCK) && + (lockparent == UE_GETDIR_LOCKPARENT)) + panic("ufs_extattr_lookup: lockparent but PDIRUNLOCK"); + + return (error); + } +/* + if (target_vp == start_dvp) + panic("ufs_extattr_lookup: target_vp == start_dvp"); +*/ + + if (target_vp != start_dvp && + !(cnp.cn_flags & PDIRUNLOCK) && + (lockparent == UE_GETDIR_LOCKPARENT_DONT)) + panic("ufs_extattr_lookup: !lockparent but !PDIRUNLOCK"); + + if ((cnp.cn_flags & PDIRUNLOCK) && + (lockparent == UE_GETDIR_LOCKPARENT)) + panic("ufs_extattr_lookup: lockparent but PDIRUNLOCK"); + + /* printf("ufs_extattr_lookup: success\n"); */ + *vp = target_vp; + return (0); +} +#endif /* !UFS_EXTATTR_AUTOSTART */ + +/* + * Enable an EA using the passed filesystem, backing vnode, attribute name, + * namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp + * to be locked when passed in. The vnode will be returned unlocked, + * regardless of success/failure of the function. As a result, the caller + * will always need to vrele(), but not vput(). + */ +static int +ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp, + int attrnamespace, const char *attrname, struct thread *td) +{ + int error; + + error = VOP_OPEN(vp, FREAD|FWRITE, td->td_ucred, td, -1); + if (error) { + printf("ufs_extattr_enable_with_open.VOP_OPEN(): failed " + "with %d\n", error); + VOP_UNLOCK(vp, 0, td); + return (error); + } + + /* + * XXX: Note, should VOP_CLOSE() if vfs_object_create() fails, but due + * to a similar piece of code in vn_open(), we don't. + */ + if (vn_canvmio(vp) == TRUE) + if ((error = vfs_object_create(vp, td, + td->td_ucred)) != 0) { + /* + * XXX: bug replicated from vn_open(): should + * VOP_CLOSE() here. 
+ */ + VOP_UNLOCK(vp, 0, td); + return (error); + } + + vp->v_writecount++; + + vref(vp); + + VOP_UNLOCK(vp, 0, td); + + error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, td); + if (error != 0) + vn_close(vp, FREAD|FWRITE, td->td_ucred, td); + return (error); +} + +#ifdef UFS_EXTATTR_AUTOSTART +/* + * Given a locked directory vnode, iterate over the names in the directory + * and use ufs_extattr_lookup() to retrieve locked vnodes of potential + * attribute files. Then invoke ufs_extattr_enable_with_open() on each + * to attempt to start the attribute. Leaves the directory locked on + * exit. + */ +static int +ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp, + int attrnamespace, struct thread *td) +{ + struct vop_readdir_args vargs; + struct dirent *dp, *edp; + struct vnode *attr_vp; + struct uio auio; + struct iovec aiov; + char *dirbuf; + int error, eofflag = 0; + + if (dvp->v_type != VDIR) + return (ENOTDIR); + + MALLOC(dirbuf, char *, DIRBLKSIZ, M_TEMP, M_WAITOK); + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_offset = 0; + + vargs.a_desc = NULL; + vargs.a_vp = dvp; + vargs.a_uio = &auio; + vargs.a_cred = td->td_ucred; + vargs.a_eofflag = &eofflag; + vargs.a_ncookies = NULL; + vargs.a_cookies = NULL; + + while (!eofflag) { + auio.uio_resid = DIRBLKSIZ; + aiov.iov_base = dirbuf; + aiov.iov_len = DIRBLKSIZ; + error = ufs_readdir(&vargs); + if (error) { + printf("ufs_extattr_iterate_directory: ufs_readdir " + "%d\n", error); + return (error); + } + + edp = (struct dirent *)&dirbuf[DIRBLKSIZ]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +#if (BYTE_ORDER == LITTLE_ENDIAN) + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +#else + dp->d_type = 0; +#endif + if (dp->d_reclen == 0) + break; + error = ufs_extattr_lookup(dvp, UE_GETDIR_LOCKPARENT, + dp->d_name, &attr_vp, td); + if (error) { + printf("ufs_extattr_iterate_directory: lookup " + 
"%s %d\n", dp->d_name, error); + } else if (attr_vp == dvp) { + vrele(attr_vp); + } else if (attr_vp->v_type != VREG) { + vput(attr_vp); + } else { + error = ufs_extattr_enable_with_open(ump, + attr_vp, attrnamespace, dp->d_name, td); + vrele(attr_vp); + if (error) { + printf("ufs_extattr_iterate_directory: " + "enable %s %d\n", dp->d_name, + error); + } else if (bootverbose) { + printf("UFS autostarted EA %s\n", + dp->d_name); + } + } + dp = (struct dirent *) ((char *)dp + dp->d_reclen); + if (dp >= edp) + break; + } + } + FREE(dirbuf, M_TEMP); + + return (0); +} + +/* + * Auto-start of extended attributes, to be executed (optionally) at + * mount-time. + */ +int +ufs_extattr_autostart(struct mount *mp, struct thread *td) +{ + struct vnode *rvp, *attr_dvp, *attr_system_dvp, *attr_user_dvp; + int error; + + /* + * Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root? + * If so, automatically start EA's. + */ + error = VFS_ROOT(mp, &rvp); + if (error) { + printf("ufs_extattr_autostart.VFS_ROOT() returned %d\n", + error); + return (error); + } + + error = ufs_extattr_lookup(rvp, UE_GETDIR_LOCKPARENT_DONT, + UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, td); + if (error) { + /* rvp ref'd but now unlocked */ + vrele(rvp); + return (error); + } + if (rvp == attr_dvp) { + /* Should never happen. */ + vrele(attr_dvp); + vput(rvp); + return (EINVAL); + } + vrele(rvp); + + if (attr_dvp->v_type != VDIR) { + printf("ufs_extattr_autostart: %s != VDIR\n", + UFS_EXTATTR_FSROOTSUBDIR); + goto return_vput_attr_dvp; + } + + error = ufs_extattr_start(mp, td); + if (error) { + printf("ufs_extattr_autostart: ufs_extattr_start failed (%d)\n", + error); + goto return_vput_attr_dvp; + } + + /* + * Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM, + * UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory, + * and start with appropriate type. Failures in either don't + * result in an over-all failure. attr_dvp is left locked to + * be cleaned up on exit. 
+ */ + error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, + UFS_EXTATTR_SUBDIR_SYSTEM, &attr_system_dvp, td); + if (!error) { + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_system_dvp, EXTATTR_NAMESPACE_SYSTEM, td); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + vput(attr_system_dvp); + } + + error = ufs_extattr_lookup(attr_dvp, UE_GETDIR_LOCKPARENT, + UFS_EXTATTR_SUBDIR_USER, &attr_user_dvp, td); + if (!error) { + error = ufs_extattr_iterate_directory(VFSTOUFS(mp), + attr_user_dvp, EXTATTR_NAMESPACE_USER, td); + if (error) + printf("ufs_extattr_iterate_directory returned %d\n", + error); + vput(attr_user_dvp); + } + + /* Mask startup failures in sub-directories. */ + error = 0; + +return_vput_attr_dvp: + vput(attr_dvp); + + return (error); +} +#endif /* !UFS_EXTATTR_AUTOSTART */ + +/* + * Stop extended attribute support on an FS. + */ +int +ufs_extattr_stop(struct mount *mp, struct thread *td) +{ + struct ufs_extattr_list_entry *uele; + struct ufsmount *ump = VFSTOUFS(mp); + int error = 0; + + ufs_extattr_uepm_lock(ump, td); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + error = EOPNOTSUPP; + goto unlock; + } + + while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) { + uele = LIST_FIRST(&ump->um_extattr.uepm_list); + ufs_extattr_disable(ump, uele->uele_attrnamespace, + uele->uele_attrname, td); + } + + ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED; + + crfree(ump->um_extattr.uepm_ucred); + ump->um_extattr.uepm_ucred = NULL; + +unlock: + ufs_extattr_uepm_unlock(ump, td); + + return (error); +} + +/* + * Enable a named attribute on the specified filesystem; provide an + * unlocked backing vnode to hold the attribute data. 
+ *
+ * The attribute file header is read from the start of the backing
+ * vnode and its magic and version are checked before the attribute is
+ * added to the per-mount list and the vnode is marked VV_SYSTEM.
+ * Returns EINVAL for an invalid name, a backing vnode that is not a
+ * regular file, or a short or invalid header; EOPNOTSUPP if EAs have
+ * not been started; EEXIST if the attribute is already enabled; or the
+ * error from VOP_READ(). The backing vnode is unlocked on return.
+ */
+static int
+ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
+    const char *attrname, struct vnode *backing_vnode, struct thread *td)
+{
+	struct ufs_extattr_list_entry *attribute;
+	struct iovec aiov;
+	struct uio auio;
+	int error = 0;
+
+	if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+		return (EINVAL);
+	if (backing_vnode->v_type != VREG)
+		return (EINVAL);
+
+	MALLOC(attribute, struct ufs_extattr_list_entry *,
+	    sizeof(struct ufs_extattr_list_entry), M_UFS_EXTATTR, M_WAITOK);
+	if (attribute == NULL)
+		return (ENOMEM);
+
+	if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
+		error = EOPNOTSUPP;
+		goto free_exit;
+	}
+
+	if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
+		error = EEXIST;
+		goto free_exit;
+	}
+
+	strncpy(attribute->uele_attrname, attrname,
+	    UFS_EXTATTR_MAXEXTATTRNAME);
+	attribute->uele_attrnamespace = attrnamespace;
+	bzero(&attribute->uele_fileheader,
+	    sizeof(struct ufs_extattr_fileheader));
+
+	attribute->uele_backing_vnode = backing_vnode;
+
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	aiov.iov_base = (caddr_t) &attribute->uele_fileheader;
+	aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
+	auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
+	auio.uio_offset = (off_t) 0;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+
+	VOP_LEASE(backing_vnode, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(backing_vnode, LK_SHARED | LK_NOPAUSE | LK_RETRY, td);
+	error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
+	    ump->um_extattr.uepm_ucred);
+
+	if (error)
+		goto unlock_free_exit;
+
+	if (auio.uio_resid != 0) {
+		printf("ufs_extattr_enable: malformed attribute header\n");
+		error = EINVAL;
+		goto unlock_free_exit;
+	}
+
+	if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
+		printf("ufs_extattr_enable: invalid attribute header magic\n");
+		error = EINVAL;
+		goto unlock_free_exit;
+	}
+
+	if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
+		printf("ufs_extattr_enable: incorrect attribute header "
+		    "version\n");
+		error = EINVAL;
+		goto unlock_free_exit;
+	}
+
+	ASSERT_VOP_LOCKED(backing_vnode, "ufs_extattr_enable");
+	backing_vnode->v_vflag |= VV_SYSTEM;
+	LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute,
+	    uele_entries);
+
+	VOP_UNLOCK(backing_vnode, 0, td);
+	return (0);
+
+unlock_free_exit:
+	VOP_UNLOCK(backing_vnode, 0, td);
+
+free_exit:
+	FREE(attribute, M_UFS_EXTATTR);
+	return (error);
+}
+
+/*
+ * Disable a named extended attribute on an FS: remove it from the
+ * per-mount list, clear VV_SYSTEM on its backing vnode, close the
+ * backing vnode and free the list entry.
+ *
+ * Returns EINVAL for an invalid name and ENOATTR if the attribute is
+ * not currently enabled; otherwise returns the result of vn_close().
+ */
+static int
+ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
+    const char *attrname, struct thread *td)
+{
+	struct ufs_extattr_list_entry *uele;
+	int error = 0;
+
+	if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
+		return (EINVAL);
+
+	uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
+	if (!uele)
+		return (ENOATTR);
+
+	LIST_REMOVE(uele, uele_entries);
+
+	vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_NOPAUSE | LK_RETRY,
+	    td);
+	ASSERT_VOP_LOCKED(uele->uele_backing_vnode, "ufs_extattr_disable");
+	uele->uele_backing_vnode->v_vflag &= ~VV_SYSTEM;
+	VOP_UNLOCK(uele->uele_backing_vnode, 0, td);
+	error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE,
+	    td->td_ucred, td);
+
+	FREE(uele, M_UFS_EXTATTR);
+
+	return (error);
+}
+
+/*
+ * VFS call to manage extended attributes in UFS. If filename_vp is
+ * non-NULL, it must be passed in locked, and regardless of errors in
+ * processing, will be unlocked.
+ *
+ * START and STOP take neither a vnode nor an attribute name, ENABLE
+ * requires both, and DISABLE requires only the attribute name; any
+ * other combination, or an unknown command, yields EINVAL.
+ */
+int
+ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
+    int attrnamespace, const char *attrname, struct thread *td)
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	int error;
+
+	/*
+	 * Processes with privilege, but in jail, are not allowed to
+	 * configure extended attributes.
+	 */
+	if ((error = suser(td))) {
+		if (filename_vp != NULL)
+			VOP_UNLOCK(filename_vp, 0, td);
+		return (error);
+	}
+
+	switch(cmd) {
+	case UFS_EXTATTR_CMD_START:
+		if (filename_vp != NULL) {
+			VOP_UNLOCK(filename_vp, 0, td);
+			return (EINVAL);
+		}
+		if (attrname != NULL)
+			return (EINVAL);
+
+		error = ufs_extattr_start(mp, td);
+
+		return (error);
+
+	case UFS_EXTATTR_CMD_STOP:
+		if (filename_vp != NULL) {
+			VOP_UNLOCK(filename_vp, 0, td);
+			return (EINVAL);
+		}
+		if (attrname != NULL)
+			return (EINVAL);
+
+		error = ufs_extattr_stop(mp, td);
+
+		return (error);
+
+	case UFS_EXTATTR_CMD_ENABLE:
+
+		if (filename_vp == NULL)
+			return (EINVAL);
+		if (attrname == NULL) {
+			VOP_UNLOCK(filename_vp, 0, td);
+			return (EINVAL);
+		}
+
+		/*
+		 * ufs_extattr_enable_with_open() will always unlock the
+		 * vnode, regardless of failure.
+		 */
+		ufs_extattr_uepm_lock(ump, td);
+		error = ufs_extattr_enable_with_open(ump, filename_vp,
+		    attrnamespace, attrname, td);
+		ufs_extattr_uepm_unlock(ump, td);
+
+		return (error);
+
+	case UFS_EXTATTR_CMD_DISABLE:
+
+		if (filename_vp != NULL) {
+			VOP_UNLOCK(filename_vp, 0, td);
+			return (EINVAL);
+		}
+		if (attrname == NULL)
+			return (EINVAL);
+
+		ufs_extattr_uepm_lock(ump, td);
+		error = ufs_extattr_disable(ump, attrnamespace, attrname,
+		    td);
+		ufs_extattr_uepm_unlock(ump, td);
+
+		return (error);
+
+	default:
+		return (EINVAL);
+	}
+}
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */ +int +ufs_getextattr(struct vop_getextattr_args *ap) +/* +vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + int error; + + ufs_extattr_uepm_lock(ump, ap->a_td); + + error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_size, ap->a_cred, ap->a_td); + + ufs_extattr_uepm_unlock(ump, ap->a_td); + + return (error); +} + +/* + * Real work associated with retrieving a named attribute--assumes that + * the attribute lock has already been grabbed. + */ +static int +ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, size_t *size, struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + size_t len, old_len; + int error = 0; + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + + if (strlen(name) == 0) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, IREAD); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Allow only offsets of zero to encourage the read/replace + * extended attribute semantic. Otherwise we can't guarantee + * atomicity, as we don't provide locks for extended attributes. + */ + if (uio != NULL && uio->uio_offset != 0) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. 
+ */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Read in the data header to see if the data is defined, and if so + * how much. + */ + bzero(&ueh, sizeof(struct ufs_extattr_header)); + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_READ; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + /* + * Acquire locks. + */ + VOP_LEASE(attribute->uele_backing_vnode, td, cred, LEASE_READ); + /* + * Don't need to get a lock on the backing file if the getattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, LK_SHARED | + LK_NOPAUSE | LK_RETRY, td); + + error = VOP_READ(attribute->uele_backing_vnode, &local_aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + /* Defined? */ + if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { + error = ENOATTR; + goto vopunlock_exit; + } + + /* Valid for the current inode generation? */ + if (ueh.ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number + * than the attribute data. For now, the best solution + * is to coerce this to undefined, and let it get cleaned + * up by the next write or extattrctl clean. + */ + printf("ufs_extattr_get (%s): inode number inconsistency (%d, %jd)\n", + mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); + error = ENOATTR; + goto vopunlock_exit; + } + + /* Local size consistency check. */ + if (ueh.ueh_len > attribute->uele_fileheader.uef_size) { + error = ENXIO; + goto vopunlock_exit; + } + + /* Return full data size if caller requested it. 
*/ + if (size != NULL) + *size = ueh.ueh_len; + + /* Return data if the caller requested it. */ + if (uio != NULL) { + /* Allow for offset into the attribute data. */ + uio->uio_offset = base_offset + sizeof(struct + ufs_extattr_header); + + /* + * Figure out maximum to transfer -- use buffer size and + * local data limit. + */ + len = MIN(uio->uio_resid, ueh.ueh_len); + old_len = uio->uio_resid; + uio->uio_resid = len; + + error = VOP_READ(attribute->uele_backing_vnode, uio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + uio->uio_resid = old_len - (len - uio->uio_resid); + } + +vopunlock_exit: + + if (uio != NULL) + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); + + return (error); +} + +/* + * Vnode operation to remove a named attribute. + */ +int +ufs_deleteextattr(struct vop_deleteextattr_args *ap) +/* +vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + int error; + + ufs_extattr_uepm_lock(ump, ap->a_td); + + error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_cred, ap->a_td); + + + ufs_extattr_uepm_unlock(ump, ap->a_td); + + return (error); +} + +/* + * Vnode operation to set a named attribute. + */ +int +ufs_setextattr(struct vop_setextattr_args *ap) +/* +vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +*/ +{ + struct mount *mp = ap->a_vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + int error; + + ufs_extattr_uepm_lock(ump, ap->a_td); + + /* + * XXX: No longer a supported way to delete extended attributes. 
+ */ + if (ap->a_uio == NULL) + return (EINVAL); + + error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name, + ap->a_uio, ap->a_cred, ap->a_td); + + ufs_extattr_uepm_unlock(ump, ap->a_td); + + return (error); +} + +/* + * Real work associated with setting a vnode's extended attributes; + * assumes that the attribute lock has already been grabbed. + */ +static int +ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name, + struct uio *uio, struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Early rejection of invalid offsets/length. + * Reject: any offset but 0 (replace) + * Any size greater than attribute size limit + */ + if (uio->uio_offset != 0 || + uio->uio_resid > attribute->uele_fileheader.uef_size) + return (ENXIO); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Write out a data header for the data. 
+ */ + ueh.ueh_len = uio->uio_resid; + ueh.ueh_flags = UFS_EXTATTR_ATTR_FLAG_INUSE; + ueh.ueh_i_gen = ip->i_gen; + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + /* + * Acquire locks. + */ + VOP_LEASE(attribute->uele_backing_vnode, td, cred, LEASE_WRITE); + + /* + * Don't need to get a lock on the backing file if the setattr is + * being applied to the backing file, as the lock is already held. + */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, + LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) { + error = ENXIO; + goto vopunlock_exit; + } + + /* + * Write out user data. + */ + uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag, + ump->um_extattr.uepm_ucred); + +vopunlock_exit: + uio->uio_offset = 0; + + if (attribute->uele_backing_vnode != vp) + VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); + + return (error); +} + +/* + * Real work associated with removing an extended attribute from a vnode. + * Assumes the attribute lock has already been grabbed. 
+ */ +static int +ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name, + struct ucred *cred, struct thread *td) +{ + struct ufs_extattr_list_entry *attribute; + struct ufs_extattr_header ueh; + struct iovec local_aiov; + struct uio local_aio; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + struct inode *ip = VTOI(vp); + off_t base_offset; + int error = 0, ioflag; + + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) + return (EOPNOTSUPP); + if (!ufs_extattr_valid_attrname(attrnamespace, name)) + return (EINVAL); + + error = extattr_check_cred(vp, attrnamespace, cred, td, IWRITE); + if (error) + return (error); + + attribute = ufs_extattr_find_attr(ump, attrnamespace, name); + if (!attribute) + return (ENOATTR); + + /* + * Find base offset of header in file based on file header size, and + * data header size + maximum data size, indexed by inode number. + */ + base_offset = sizeof(struct ufs_extattr_fileheader) + + ip->i_number * (sizeof(struct ufs_extattr_header) + + attribute->uele_fileheader.uef_size); + + /* + * Check to see if currently defined. + */ + bzero(&ueh, sizeof(struct ufs_extattr_header)); + + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_READ; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + VOP_LEASE(attribute->uele_backing_vnode, td, cred, LEASE_WRITE); + + /* + * Don't need to get the lock on the backing vnode if the vnode we're + * modifying is it, as we already hold the lock. 
+ */ + if (attribute->uele_backing_vnode != vp) + vn_lock(attribute->uele_backing_vnode, + LK_EXCLUSIVE | LK_NOPAUSE | LK_RETRY, td); + + error = VOP_READ(attribute->uele_backing_vnode, &local_aio, + IO_NODELOCKED, ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + /* Defined? */ + if ((ueh.ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0) { + error = ENOATTR; + goto vopunlock_exit; + } + + /* Valid for the current inode generation? */ + if (ueh.ueh_i_gen != ip->i_gen) { + /* + * The inode itself has a different generation number than + * the attribute data. For now, the best solution is to + * coerce this to undefined, and let it get cleaned up by + * the next write or extattrctl clean. + */ + printf("ufs_extattr_rm (%s): inode number inconsistency (%d, %jd)\n", + mp->mnt_stat.f_mntonname, ueh.ueh_i_gen, (intmax_t)ip->i_gen); + error = ENOATTR; + goto vopunlock_exit; + } + + /* Flag it as not in use. */ + ueh.ueh_flags = 0; + ueh.ueh_len = 0; + + local_aiov.iov_base = (caddr_t) &ueh; + local_aiov.iov_len = sizeof(struct ufs_extattr_header); + local_aio.uio_iov = &local_aiov; + local_aio.uio_iovcnt = 1; + local_aio.uio_rw = UIO_WRITE; + local_aio.uio_segflg = UIO_SYSSPACE; + local_aio.uio_td = td; + local_aio.uio_offset = base_offset; + local_aio.uio_resid = sizeof(struct ufs_extattr_header); + + ioflag = IO_NODELOCKED; + if (ufs_extattr_sync) + ioflag |= IO_SYNC; + error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag, + ump->um_extattr.uepm_ucred); + if (error) + goto vopunlock_exit; + + if (local_aio.uio_resid != 0) + error = ENXIO; + +vopunlock_exit: + VOP_UNLOCK(attribute->uele_backing_vnode, 0, td); + + return (error); +} + +/* + * Called by UFS when an inode is no longer active and should have its + * attributes stripped. 
+ */ +void +ufs_extattr_vnode_inactive(struct vnode *vp, struct thread *td) +{ + struct ufs_extattr_list_entry *uele; + struct mount *mp = vp->v_mount; + struct ufsmount *ump = VFSTOUFS(mp); + + /* + * In that case, we cannot lock. We should not have any active vnodes + * on the fs if this is not yet initialized but is going to be, so + * this can go unlocked. + */ + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) + return; + + ufs_extattr_uepm_lock(ump, td); + + if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) { + ufs_extattr_uepm_unlock(ump, td); + return; + } + + LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) + ufs_extattr_rm(vp, uele->uele_attrnamespace, + uele->uele_attrname, NULL, td); + + ufs_extattr_uepm_unlock(ump, td); +} + +#endif /* !UFS_EXTATTR */ diff --git a/src/sys/ufs/ufs/ufs_extern.h b/src/sys/ufs/ufs/ufs_extern.h new file mode 100644 index 0000000..e198732 --- /dev/null +++ b/src/sys/ufs/ufs/ufs_extern.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_extern.h 8.10 (Berkeley) 5/14/95 + * $FreeBSD: src/sys/ufs/ufs/ufs_extern.h,v 1.48 2002/10/18 22:52:41 dillon Exp $ + */ + +#ifndef _UFS_UFS_EXTERN_H_ +#define _UFS_UFS_EXTERN_H_ + +struct componentname; +struct direct; +struct indir; +struct inode; +struct mount; +struct netcred; +struct thread; +struct sockaddr; +struct ucred; +struct ufid; +struct vfsconf; +struct vnode; +struct vop_bmap_args; +struct vop_cachedlookup_args; +struct vop_generic_args; +struct vop_inactive_args; +struct vop_reclaim_args; + +int ufs_vnoperate(struct vop_generic_args *); +int ufs_vnoperatefifo(struct vop_generic_args *); +int ufs_vnoperatespec(struct vop_generic_args *); + +int ufs_bmap(struct vop_bmap_args *); +int ufs_bmaparray(struct vnode *, ufs2_daddr_t, ufs2_daddr_t *, + struct buf *, int *, int *); +int ufs_fhtovp(struct mount *, struct ufid *, struct vnode **); +int ufs_checkpath(struct inode *, struct inode *, struct ucred *); +void ufs_dirbad(struct inode *, doff_t, char *); +int 
ufs_dirbadentry(struct vnode *, struct direct *, int); +int ufs_dirempty(struct inode *, ino_t, struct ucred *); +int ufs_extread(struct vop_read_args *); +int ufs_extwrite(struct vop_write_args *); +void ufs_makedirentry(struct inode *, struct componentname *, + struct direct *); +int ufs_direnter(struct vnode *, struct vnode *, struct direct *, + struct componentname *, struct buf *); +int ufs_dirremove(struct vnode *, struct inode *, int, int); +int ufs_dirrewrite(struct inode *, struct inode *, ino_t, int, int); +int ufs_getlbns(struct vnode *, ufs2_daddr_t, struct indir *, int *); +int ufs_ihashget(dev_t, ino_t, int, struct vnode **); +void ufs_ihashinit(void); +int ufs_ihashins(struct inode *, int, struct vnode **); +struct vnode * + ufs_ihashlookup(dev_t, ino_t); +void ufs_ihashrem(struct inode *); +void ufs_ihashuninit(void); +int ufs_inactive(struct vop_inactive_args *); +int ufs_init(struct vfsconf *); +void ufs_itimes(struct vnode *vp); +int ufs_lookup(struct vop_cachedlookup_args *); +int ufs_readdir(struct vop_readdir_args *); +int ufs_reclaim(struct vop_reclaim_args *); +void ffs_snapgone(struct inode *); +vfs_root_t ufs_root; +vfs_start_t ufs_start; +int ufs_uninit(struct vfsconf *); +int ufs_vinit(struct mount *, vop_t **, vop_t **, struct vnode **); + +/* + * Soft update function prototypes. + */ +int softdep_setup_directory_add(struct buf *, struct inode *, off_t, + ino_t, struct buf *, int); +void softdep_change_directoryentry_offset(struct inode *, caddr_t, + caddr_t, caddr_t, int); +void softdep_setup_remove(struct buf *,struct inode *, struct inode *, int); +void softdep_setup_directory_change(struct buf *, struct inode *, + struct inode *, ino_t, int); +void softdep_change_linkcnt(struct inode *); +void softdep_releasefile(struct inode *); +int softdep_slowdown(struct vnode *); + +/* + * Flags to low-level allocation routines. The low 16-bits are reserved + * for IO_ flags from vnode.h. 
+ * + * Note: The general vfs code typically limits the sequential heuristic + * count to 127. See sequential_heuristic() in kern/vfs_vnops.c + */ +#define BA_CLRBUF 0x00010000 /* Clear invalid areas of buffer. */ +#define BA_METAONLY 0x00020000 /* Return indirect block buffer. */ +#define BA_SEQMASK 0x7F000000 /* Bits holding seq heuristic. */ +#define BA_SEQSHIFT 24 +#define BA_SEQMAX 0x7F + +#endif /* !_UFS_UFS_EXTERN_H_ */ diff --git a/src/sys/ufs/ufs/ufs_ihash.c b/src/sys/ufs/ufs/ufs_ihash.c new file mode 100644 index 0000000..ab2638f --- /dev/null +++ b/src/sys/ufs/ufs/ufs_ihash.c @@ -0,0 +1,196 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_ihash.c 8.7 (Berkeley) 5/17/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_ihash.c,v 1.37 2003/10/04 14:03:28 jeff Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static MALLOC_DEFINE(M_UFSIHASH, "UFS ihash", "UFS Inode hash tables"); +/* + * Structures associated with inode cacheing. + */ +static LIST_HEAD(ihashhead, inode) *ihashtbl; +static u_long ihash; /* size of hash table - 1 */ +#define INOHASH(device, inum) (&ihashtbl[(minor(device) + (inum)) & ihash]) +static struct mtx ufs_ihash_mtx; + +/* + * Initialize inode hash table. + */ +void +ufs_ihashinit() +{ + + ihashtbl = hashinit(desiredvnodes, M_UFSIHASH, &ihash); + mtx_init(&ufs_ihash_mtx, "ufs ihash", NULL, MTX_DEF); +} + +/* + * Destroy the inode hash table. + */ +void +ufs_ihashuninit() +{ + + hashdestroy(ihashtbl, M_UFSIHASH, ihash); + mtx_destroy(&ufs_ihash_mtx); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, return it, even if it is locked. 
+ */ +struct vnode * +ufs_ihashlookup(dev, inum) + dev_t dev; + ino_t inum; +{ + struct inode *ip; + + mtx_lock(&ufs_ihash_mtx); + LIST_FOREACH(ip, INOHASH(dev, inum), i_hash) + if (inum == ip->i_number && dev == ip->i_dev) + break; + mtx_unlock(&ufs_ihash_mtx); + + if (ip) + return (ITOV(ip)); + return (NULLVP); +} + +/* + * Use the device/inum pair to find the incore inode, and return a pointer + * to it. If it is in core, but locked, wait for it. + */ +int +ufs_ihashget(dev, inum, flags, vpp) + dev_t dev; + ino_t inum; + int flags; + struct vnode **vpp; +{ + struct thread *td = curthread; /* XXX */ + struct inode *ip; + struct vnode *vp; + int error; + + *vpp = NULL; +loop: + mtx_lock(&ufs_ihash_mtx); + LIST_FOREACH(ip, INOHASH(dev, inum), i_hash) { + if (inum == ip->i_number && dev == ip->i_dev) { + vp = ITOV(ip); + VI_LOCK(vp); + mtx_unlock(&ufs_ihash_mtx); + error = vget(vp, flags | LK_INTERLOCK, td); + if (error == ENOENT) + goto loop; + if (error) + return (error); + *vpp = vp; + return (0); + } + } + mtx_unlock(&ufs_ihash_mtx); + return (0); +} + +/* + * Check hash for duplicate of passed inode, and add if there is no one. + * if there is a duplicate, vget() it and return to the caller. 
+ */ +int +ufs_ihashins(ip, flags, ovpp) + struct inode *ip; + int flags; + struct vnode **ovpp; +{ + struct thread *td = curthread; /* XXX */ + struct ihashhead *ipp; + struct inode *oip; + struct vnode *ovp; + int error; + +loop: + mtx_lock(&ufs_ihash_mtx); + ipp = INOHASH(ip->i_dev, ip->i_number); + LIST_FOREACH(oip, ipp, i_hash) { + if (ip->i_number == oip->i_number && ip->i_dev == oip->i_dev) { + ovp = ITOV(oip); + VI_LOCK(ovp); + mtx_unlock(&ufs_ihash_mtx); + error = vget(ovp, flags | LK_INTERLOCK, td); + if (error == ENOENT) + goto loop; + if (error) + return (error); + *ovpp = ovp; + return (0); + } + } + LIST_INSERT_HEAD(ipp, ip, i_hash); + ip->i_flag |= IN_HASHED; + mtx_unlock(&ufs_ihash_mtx); + *ovpp = NULL; + return (0); +} + +/* + * Remove the inode from the hash table. + */ +void +ufs_ihashrem(ip) + struct inode *ip; +{ + mtx_lock(&ufs_ihash_mtx); + if (ip->i_flag & IN_HASHED) { + ip->i_flag &= ~IN_HASHED; + LIST_REMOVE(ip, i_hash); + } + mtx_unlock(&ufs_ihash_mtx); +} diff --git a/src/sys/ufs/ufs/ufs_inode.c b/src/sys/ufs/ufs/ufs_inode.c new file mode 100644 index 0000000..ffab94a --- /dev/null +++ b/src/sys/ufs/ufs/ufs_inode.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 1991, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_inode.c,v 1.52 2003/10/05 02:45:00 jeff Exp $"); + +#include "opt_quota.h" +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#include +#endif + +/* + * Last reference to an inode. If necessary, write or delete it. 
+ */ +int +ufs_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct thread *td = ap->a_td; + mode_t mode; + int error = 0; + + VI_LOCK(vp); + if (prtactive && vp->v_usecount != 0) + vprint("ufs_inactive: pushing active", vp); + VI_UNLOCK(vp); + + /* + * Ignore inodes related to stale file handles. + */ + if (ip->i_mode == 0) + goto out; + if (ip->i_effnlink == 0 && DOINGSOFTDEP(vp)) + softdep_releasefile(ip); + if (ip->i_nlink <= 0) { + (void) vn_write_suspend_wait(vp, NULL, V_WAIT); +#ifdef QUOTA + if (!getinoquota(ip)) + (void)chkiq(ip, -1, NOCRED, FORCE); +#endif +#ifdef UFS_EXTATTR + ufs_extattr_vnode_inactive(vp, td); +#endif + error = UFS_TRUNCATE(vp, (off_t)0, IO_EXT | IO_NORMAL, + NOCRED, td); + /* + * Setting the mode to zero needs to wait for the inode + * to be written just as does a change to the link count. + * So, rather than creating a new entry point to do the + * same thing, we just use softdep_change_linkcnt(). + */ + DIP(ip, i_rdev) = 0; + mode = ip->i_mode; + ip->i_mode = 0; + DIP(ip, i_mode) = 0; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + UFS_VFREE(vp, ip->i_number, mode); + } + if (ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) { + if ((ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) == 0 && + vn_write_suspend_wait(vp, NULL, V_NOWAIT)) { + ip->i_flag &= ~IN_ACCESS; + } else { + (void) vn_write_suspend_wait(vp, NULL, V_WAIT); + UFS_UPDATE(vp, 0); + } + } +out: + VOP_UNLOCK(vp, 0, td); + /* + * If we are done with the inode, reclaim it + * so that it can be reused immediately. + */ + if (ip->i_mode == 0) + vrecycle(vp, NULL, td); + return (error); +} + +/* + * Reclaim an inode so that it can be used for other purposes. 
+ */ +int +ufs_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ufsmount *ump = ip->i_ump; +#ifdef QUOTA + int i; +#endif + + VI_LOCK(vp); + if (prtactive && vp->v_usecount != 0) + vprint("ufs_reclaim: pushing active", vp); + VI_UNLOCK(vp); + if (ip->i_flag & IN_LAZYMOD) { + ip->i_flag |= IN_MODIFIED; + UFS_UPDATE(vp, 0); + } + /* + * Remove the inode from its hash chain. + */ + ufs_ihashrem(ip); + /* + * Purge old data structures associated with the inode. + */ + vrele(ip->i_devvp); +#ifdef QUOTA + for (i = 0; i < MAXQUOTAS; i++) { + if (ip->i_dquot[i] != NODQUOT) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } +#endif +#ifdef UFS_DIRHASH + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif + UFS_IFREE(ump, ip); + vp->v_data = 0; + return (0); +} diff --git a/src/sys/ufs/ufs/ufs_lookup.c b/src/sys/ufs/ufs/ufs_lookup.c new file mode 100644 index 0000000..bcaf30a --- /dev/null +++ b/src/sys/ufs/ufs/ufs_lookup.c @@ -0,0 +1,1269 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)ufs_lookup.c 8.15 (Berkeley) 6/16/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_lookup.c,v 1.68 2003/06/11 06:34:30 obrien Exp $"); + +#include "opt_ffs_broken_fixme.h" +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif +#include +#include + +#ifdef DIAGNOSTIC +static int dirchk = 1; +#else +static int dirchk = 0; +#endif + +SYSCTL_INT(_debug, OID_AUTO, dircheck, CTLFLAG_RW, &dirchk, 0, ""); + +/* true if old FS format...*/ +#define OFSFMT(vp) ((vp)->v_mount->mnt_maxsymlinklen <= 0) + +/* + * Convert a component of a pathname into a pointer to a locked inode. + * This is a very central and rather complicated routine. + * If the filesystem is not maintained in a strict tree hierarchy, + * this can result in a deadlock situation (see comments in code below). + * + * The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending + * on whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it and the target of the pathname + * exists, lookup returns both the target and its parent directory locked. + * When creating or renaming and LOCKPARENT is specified, the target may + * not be ".". When deleting and LOCKPARENT is specified, the target may + * be "."., but the caller must check to ensure it does an vrele and vput + * instead of two vputs. + * + * This routine is actually used as VOP_CACHEDLOOKUP method, and the + * filesystem employs the generic vfs_cache_lookup() as VOP_LOOKUP + * method. 
+ * + * vfs_cache_lookup() performs the following for us: + * check that it is a directory + * check accessibility of directory + * check for modification attempts on read-only mounts + * if name found in cache + * if at end of path and deleting or creating + * drop it + * else + * return name. + * return VOP_CACHEDLOOKUP() + * + * Overall outline of ufs_lookup: + * + * search for name in directory, to found or notfound + * notfound: + * if creating, return locked directory, leaving info on available slots + * else return error + * found: + * if at end of path and deleting, return information to allow delete + * if at end of path and rewriting (RENAME and LOCKPARENT), lock target + * inode and return info to allow rewrite + * if not at end, add name to cache; if at end and neither creating + * nor deleting, add name to cache + */ +int +ufs_lookup(ap) + struct vop_cachedlookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vdp; /* vnode for directory being searched */ + struct inode *dp; /* inode for directory being searched */ + struct buf *bp; /* a buffer of directory entries */ + struct direct *ep; /* the current directory entry */ + int entryoffsetinblock; /* offset of ep in bp's buffer */ + enum {NONE, COMPACT, FOUND} slotstatus; + doff_t slotoffset; /* offset of area with free space */ + int slotsize; /* size of area at slotoffset */ + int slotfreespace; /* amount of space free in slot */ + int slotneeded; /* size of the entry we're seeking */ + int numdirpasses; /* strategy for directory search */ + doff_t endsearch; /* offset to end directory search */ + doff_t prevoff; /* prev entry dp->i_offset */ + struct vnode *pdp; /* saved dp during symlink work */ + struct vnode *tdp; /* returned by VFS_VGET */ + doff_t enduseful; /* pointer past last used dir slot */ + u_long bmask; /* block offset mask */ + int lockparent; /* 1 => lockparent flag is set */ + int wantparent; /* 1 => wantparent or 
lockparent flag */ + int namlen, error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + int nameiop = cnp->cn_nameiop; + struct thread *td = cnp->cn_thread; + + bp = NULL; + slotoffset = -1; + cnp->cn_flags &= ~PDIRUNLOCK; +/* + * XXX there was a soft-update diff about this I couldn't merge. + * I think this was the equiv. + */ + *vpp = NULL; + + vdp = ap->a_dvp; + dp = VTOI(vdp); + lockparent = flags & LOCKPARENT; + wantparent = flags & (LOCKPARENT|WANTPARENT); + + /* + * We now have a segment name to search for, and a directory to search. + * + * Suppress search for slots unless creating + * file and at end of pathname, in which case + * we watch for a place to put the new file in + * case it doesn't already exist. + */ + slotstatus = FOUND; + slotfreespace = slotsize = slotneeded = 0; + if ((nameiop == CREATE || nameiop == RENAME) && + (flags & ISLASTCN)) { + slotstatus = NONE; + slotneeded = DIRECTSIZ(cnp->cn_namelen); + } + bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1; + +#ifdef UFS_DIRHASH + /* + * Use dirhash for fast operations on large directories. The logic + * to determine whether to hash the directory is contained within + * ufsdirhash_build(); a zero return means that it decided to hash + * this directory and it successfully built up the hash table. + */ + if (ufsdirhash_build(dp) == 0) { + /* Look for a free slot if needed. */ + enduseful = dp->i_size; + if (slotstatus != FOUND) { + slotoffset = ufsdirhash_findfree(dp, slotneeded, + &slotsize); + if (slotoffset >= 0) { + slotstatus = COMPACT; + enduseful = ufsdirhash_enduseful(dp); + if (enduseful < 0) + enduseful = dp->i_size; + } + } + /* Look up the component. */ + numdirpasses = 1; + entryoffsetinblock = 0; /* silence compiler warning */ + switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen, + &dp->i_offset, &bp, nameiop == DELETE ? 
&prevoff : NULL)) { + case 0: + ep = (struct direct *)((char *)bp->b_data + + (dp->i_offset & bmask)); + goto foundentry; + case ENOENT: + dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); + goto notfound; + default: + /* Something failed; just do a linear search. */ + break; + } + } +#endif /* UFS_DIRHASH */ + /* + * If there is cached information on a previous search of + * this directory, pick up where we last left off. + * We cache only lookups as these are the most common + * and have the greatest payoff. Caching CREATE has little + * benefit as it usually must search the entire directory + * to determine that the entry does not exist. Caching the + * location of the last DELETE or RENAME has not reduced + * profiling time and hence has been removed in the interest + * of simplicity. + */ + if (nameiop != LOOKUP || dp->i_diroff == 0 || + dp->i_diroff >= dp->i_size) { + entryoffsetinblock = 0; + dp->i_offset = 0; + numdirpasses = 1; + } else { + dp->i_offset = dp->i_diroff; + if ((entryoffsetinblock = dp->i_offset & bmask) && + (error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp))) + return (error); + numdirpasses = 2; + nchstats.ncs_2passes++; + } + prevoff = dp->i_offset; + endsearch = roundup2(dp->i_size, DIRBLKSIZ); + enduseful = 0; + +searchloop: + while (dp->i_offset < endsearch) { + /* + * If necessary, get the next directory block. + */ + if ((dp->i_offset & bmask) == 0) { + if (bp != NULL) + brelse(bp); + error = + UFS_BLKATOFF(vdp, (off_t)dp->i_offset, NULL, &bp); + if (error) + return (error); + entryoffsetinblock = 0; + } + /* + * If still looking for a slot, and at a DIRBLKSIZE + * boundary, have to start looking for free space again. + */ + if (slotstatus == NONE && + (entryoffsetinblock & (DIRBLKSIZ - 1)) == 0) { + slotoffset = -1; + slotfreespace = 0; + } + /* + * Get pointer to next entry. + * Full validation checks are slow, so we only check + * enough to insure forward progress through the + * directory. 
Complete checks can be run by patching + * "dirchk" to be true. + */ + ep = (struct direct *)((char *)bp->b_data + entryoffsetinblock); + if (ep->d_reclen == 0 || ep->d_reclen > + DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + (dirchk && ufs_dirbadentry(vdp, ep, entryoffsetinblock))) { + int i; + + ufs_dirbad(dp, dp->i_offset, "mangled entry"); + i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); + dp->i_offset += i; + entryoffsetinblock += i; + continue; + } + + /* + * If an appropriate sized slot has not yet been found, + * check to see if one is available. Also accumulate space + * in the current block so that we can determine if + * compaction is viable. + */ + if (slotstatus != FOUND) { + int size = ep->d_reclen; + + if (ep->d_ino != 0) + size -= DIRSIZ(OFSFMT(vdp), ep); + if (size > 0) { + if (size >= slotneeded) { + slotstatus = FOUND; + slotoffset = dp->i_offset; + slotsize = ep->d_reclen; + } else if (slotstatus == NONE) { + slotfreespace += size; + if (slotoffset == -1) + slotoffset = dp->i_offset; + if (slotfreespace >= slotneeded) { + slotstatus = COMPACT; + slotsize = dp->i_offset + + ep->d_reclen - slotoffset; + } + } + } + } + + /* + * Check for a name match. + */ + if (ep->d_ino) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(vdp)) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +# else + namlen = ep->d_namlen; +# endif + if (namlen == cnp->cn_namelen && + (cnp->cn_nameptr[0] == ep->d_name[0]) && + !bcmp(cnp->cn_nameptr, ep->d_name, + (unsigned)namlen)) { +#ifdef UFS_DIRHASH +foundentry: +#endif + /* + * Save directory entry's inode number and + * reclen in ndp->ni_ufs area, and release + * directory buffer. 
+ */ + if (vdp->v_mount->mnt_maxsymlinklen > 0 && + ep->d_type == DT_WHT) { + slotstatus = FOUND; + slotoffset = dp->i_offset; + slotsize = ep->d_reclen; + dp->i_reclen = slotsize; + enduseful = dp->i_size; + ap->a_cnp->cn_flags |= ISWHITEOUT; + numdirpasses--; + goto notfound; + } + dp->i_ino = ep->d_ino; + dp->i_reclen = ep->d_reclen; + goto found; + } + } + prevoff = dp->i_offset; + dp->i_offset += ep->d_reclen; + entryoffsetinblock += ep->d_reclen; + if (ep->d_ino) + enduseful = dp->i_offset; + } +notfound: + /* + * If we started in the middle of the directory and failed + * to find our target, we must check the beginning as well. + */ + if (numdirpasses == 2) { + numdirpasses--; + dp->i_offset = 0; + endsearch = dp->i_diroff; + goto searchloop; + } + if (bp != NULL) + brelse(bp); + /* + * If creating, and at end of pathname and current + * directory has not been removed, then can consider + * allowing file to be created. + */ + if ((nameiop == CREATE || nameiop == RENAME || + (nameiop == DELETE && + (ap->a_cnp->cn_flags & DOWHITEOUT) && + (ap->a_cnp->cn_flags & ISWHITEOUT))) && + (flags & ISLASTCN) && dp->i_effnlink != 0) { + /* + * Access for write is interpreted as allowing + * creation of files in the directory. + */ + error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); + if (error) + return (error); + /* + * Return an indication of where the new directory + * entry should be put. If we didn't find a slot, + * then set dp->i_count to 0 indicating + * that the new slot belongs at the end of the + * directory. If we found a slot, then the new entry + * can be put in the range from dp->i_offset to + * dp->i_offset + dp->i_count. 
+ */ + if (slotstatus == NONE) { + dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ); + dp->i_count = 0; + enduseful = dp->i_offset; + } else if (nameiop == DELETE) { + dp->i_offset = slotoffset; + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + } else { + dp->i_offset = slotoffset; + dp->i_count = slotsize; + if (enduseful < slotoffset + slotsize) + enduseful = slotoffset + slotsize; + } + dp->i_endoff = roundup2(enduseful, DIRBLKSIZ); + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * We return with the directory locked, so that + * the parameters we set up above will still be + * valid if we actually decide to do a direnter(). + * We return ni_vp == NULL to indicate that the entry + * does not currently exist; we leave a pointer to + * the (locked) directory inode in ndp->ni_dvp. + * The pathname buffer is saved so that the name + * can be obtained later. + * + * NB - if the directory is unlocked, then this + * information cannot be used. + */ + cnp->cn_flags |= SAVENAME; + if (!lockparent) { + VOP_UNLOCK(vdp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + return (EJUSTRETURN); + } + /* + * Insert name into cache (as non-existent) if appropriate. + */ + if ((cnp->cn_flags & MAKEENTRY) && nameiop != CREATE) + cache_enter(vdp, *vpp, cnp); + return (ENOENT); + +found: + if (numdirpasses == 2) + nchstats.ncs_pass2++; + /* + * Check that directory length properly reflects presence + * of this entry. + */ + if (dp->i_offset + DIRSIZ(OFSFMT(vdp), ep) > dp->i_size) { + ufs_dirbad(dp, dp->i_offset, "i_size too small"); + dp->i_size = dp->i_offset + DIRSIZ(OFSFMT(vdp), ep); + DIP(dp, i_size) = dp->i_size; + dp->i_flag |= IN_CHANGE | IN_UPDATE; + } + brelse(bp); + + /* + * Found component in pathname. + * If the final component of path name, save information + * in the cache as to where the entry was found. 
+ */ + if ((flags & ISLASTCN) && nameiop == LOOKUP) + dp->i_diroff = dp->i_offset &~ (DIRBLKSIZ - 1); + + /* + * If deleting, and at end of pathname, return + * parameters which can be used to remove file. + * If the wantparent flag isn't set, we return only + * the directory (in ndp->ni_dvp), otherwise we go + * on and lock the inode, being careful with ".". + */ + if (nameiop == DELETE && (flags & ISLASTCN)) { + /* + * Write access to directory required to delete files. + */ + error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread); + if (error) + return (error); + /* + * Return pointer to current entry in dp->i_offset, + * and distance past previous entry (if there + * is a previous entry in this block) in dp->i_count. + * Save directory inode pointer in ndp->ni_dvp for dirremove(). + */ + if ((dp->i_offset & (DIRBLKSIZ - 1)) == 0) + dp->i_count = 0; + else + dp->i_count = dp->i_offset - prevoff; + if (dp->i_number == dp->i_ino) { + VREF(vdp); + *vpp = vdp; + return (0); + } + if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + /* + * If directory is "sticky", then user must own + * the directory, or the file in it, else she + * may not delete it (unless she's root). This + * implements append-only directories. + */ + if ((dp->i_mode & ISVTX) && + VOP_ACCESS(vdp, VADMIN, cred, cnp->cn_thread) && + VOP_ACCESS(tdp, VADMIN, cred, cnp->cn_thread)) { + vput(tdp); + return (EPERM); + } + *vpp = tdp; + if (!lockparent) { + VOP_UNLOCK(vdp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + return (0); + } + + /* + * If rewriting (RENAME), return the inode and the + * information required to rewrite the present directory + * Must get inode of directory entry to verify it's a + * regular file, or empty directory. + */ + if (nameiop == RENAME && wantparent && (flags & ISLASTCN)) { + if ((error = VOP_ACCESS(vdp, VWRITE, cred, cnp->cn_thread))) + return (error); + /* + * Careful about locking second inode. 
+ * This can only occur if the target is ".". + */ + if (dp->i_number == dp->i_ino) + return (EISDIR); + if ((error = VFS_VGET(vdp->v_mount, dp->i_ino, + LK_EXCLUSIVE, &tdp)) != 0) + return (error); + *vpp = tdp; + cnp->cn_flags |= SAVENAME; + if (!lockparent) { + VOP_UNLOCK(vdp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + return (0); + } + + /* + * Step through the translation in the name. We do not `vput' the + * directory because we may need it again if a symbolic link + * is relative to the current directory. Instead we save it + * unlocked as "pdp". We must get the target inode before unlocking + * the directory to insure that the inode will not be removed + * before we get it. We prevent deadlock by always fetching + * inodes from the root, moving down the directory tree. Thus + * when following backward pointers ".." we must unlock the + * parent directory before getting the requested directory. + * There is a potential race condition here if both the current + * and parent directories are removed before the VFS_VGET for the + * inode associated with ".." returns. We hope that this occurs + * infrequently since we cannot avoid this race condition without + * implementing a sophisticated deadlock detection algorithm. + * Note also that this simple deadlock detection scheme will not + * work if the filesystem has any hard links other than ".." + * that point backwards in the directory structure. + */ + pdp = vdp; + if (flags & ISDOTDOT) { + if ((VFS_VGET(pdp->v_mount, dp->i_ino, LK_NOWAIT | LK_EXCLUSIVE, + &tdp)) != 0) { + VOP_UNLOCK(pdp, 0, td); /* race to get the inode */ + error = VFS_VGET(pdp->v_mount, dp->i_ino, + LK_EXCLUSIVE, &tdp); + vn_lock(pdp, LK_EXCLUSIVE | LK_RETRY, td); + if (error) + return (error); + } + if (!lockparent || !(flags & ISLASTCN)) { + VOP_UNLOCK(pdp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + *vpp = tdp; + } else if (dp->i_number == dp->i_ino) { + VREF(vdp); /* we want ourself, ie "." 
*/ + *vpp = vdp; + } else { + error = VFS_VGET(pdp->v_mount, dp->i_ino, LK_EXCLUSIVE, &tdp); + if (error) + return (error); + if (!lockparent || !(flags & ISLASTCN)) { + VOP_UNLOCK(pdp, 0, td); + cnp->cn_flags |= PDIRUNLOCK; + } + *vpp = tdp; + } + + /* + * Insert name into cache if appropriate. + */ + if (cnp->cn_flags & MAKEENTRY) + cache_enter(vdp, *vpp, cnp); + return (0); +} + +void +ufs_dirbad(ip, offset, how) + struct inode *ip; + doff_t offset; + char *how; +{ + struct mount *mp; + + mp = ITOV(ip)->v_mount; + (void)printf("%s: bad dir ino %lu at offset %ld: %s\n", + mp->mnt_stat.f_mntonname, (u_long)ip->i_number, (long)offset, how); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + panic("ufs_dirbad: bad dir"); +} + +/* + * Do consistency checking on a directory entry: + * record length must be multiple of 4 + * entry must fit in rest of its DIRBLKSIZ block + * record must be large enough to contain entry + * name is not longer than MAXNAMLEN + * name must be as long as advertised, and null terminated + */ +int +ufs_dirbadentry(dp, ep, entryoffsetinblock) + struct vnode *dp; + struct direct *ep; + int entryoffsetinblock; +{ + int i, namlen; + +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(dp)) + namlen = ep->d_type; + else + namlen = ep->d_namlen; +# else + namlen = ep->d_namlen; +# endif + if ((ep->d_reclen & 0x3) != 0 || + ep->d_reclen > DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)) || + ep->d_reclen < DIRSIZ(OFSFMT(dp), ep) || namlen > MAXNAMLEN) { + /*return (1); */ + printf("First bad\n"); + goto bad; + } + if (ep->d_ino == 0) + return (0); + for (i = 0; i < namlen; i++) + if (ep->d_name[i] == '\0') { + /*return (1); */ + printf("Second bad\n"); + goto bad; + } + if (ep->d_name[i]) + goto bad; + return (0); +bad: + return (1); +} + +/* + * Construct a new directory entry after a call to namei, using the + * parameters that it left in the componentname argument cnp. The + * argument ip is the inode to which the new directory entry will refer. 
+ */ +void +ufs_makedirentry(ip, cnp, newdirp) + struct inode *ip; + struct componentname *cnp; + struct direct *newdirp; +{ + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & SAVENAME) == 0) + panic("ufs_makedirentry: missing name"); +#endif + newdirp->d_ino = ip->i_number; + newdirp->d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdirp->d_name, (unsigned)cnp->cn_namelen + 1); + if (ITOV(ip)->v_mount->mnt_maxsymlinklen > 0) + newdirp->d_type = IFTODT(ip->i_mode); + else { + newdirp->d_type = 0; +# if (BYTE_ORDER == LITTLE_ENDIAN) + { u_char tmp = newdirp->d_namlen; + newdirp->d_namlen = newdirp->d_type; + newdirp->d_type = tmp; } +# endif + } +} + +/* + * Write a directory entry after a call to namei, using the parameters + * that it left in nameidata. The argument dirp is the new directory + * entry contents. Dvp is a pointer to the directory to be written, + * which was left locked by namei. Remaining parameters (dp->i_offset, + * dp->i_count) indicate how the space for the new entry is to be obtained. + * Non-null bp indicates that a directory is being created (for the + * soft dependency code). + */ +int +ufs_direnter(dvp, tvp, dirp, cnp, newdirbp) + struct vnode *dvp; + struct vnode *tvp; + struct direct *dirp; + struct componentname *cnp; + struct buf *newdirbp; +{ + struct ucred *cr; + struct thread *td; + int newentrysize; + struct inode *dp; + struct buf *bp; + u_int dsize; + struct direct *ep, *nep; + int error, ret, blkoff, loc, spacefree, flags; + char *dirbuf; + + td = curthread; /* XXX */ + cr = td->td_ucred; + + dp = VTOI(dvp); + newentrysize = DIRSIZ(OFSFMT(dvp), dirp); + + if (dp->i_count == 0) { + /* + * If dp->i_count is 0, then namei could find no + * space in the directory. Here, dp->i_offset will + * be on a directory block boundary and we will write the + * new entry into a fresh block. 
+ */ + if (dp->i_offset & (DIRBLKSIZ - 1)) + panic("ufs_direnter: newblk"); + flags = BA_CLRBUF; + if (!DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp)) + flags |= IO_SYNC; + if ((error = UFS_BALLOC(dvp, (off_t)dp->i_offset, DIRBLKSIZ, + cr, flags, &bp)) != 0) { + if (DOINGSOFTDEP(dvp) && newdirbp != NULL) + bdwrite(newdirbp); + return (error); + } + dp->i_size = dp->i_offset + DIRBLKSIZ; + DIP(dp, i_size) = dp->i_size; + dp->i_flag |= IN_CHANGE | IN_UPDATE; + vnode_pager_setsize(dvp, (u_long)dp->i_size); + dirp->d_reclen = DIRBLKSIZ; + blkoff = dp->i_offset & + (VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1); + bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) { + ufsdirhash_newblk(dp, dp->i_offset); + ufsdirhash_add(dp, dirp, dp->i_offset); + ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff, + dp->i_offset); + } +#endif + if (DOINGSOFTDEP(dvp)) { + /* + * Ensure that the entire newly allocated block is a + * valid directory so that future growth within the + * block does not have to ensure that the block is + * written before the inode. + */ + blkoff += DIRBLKSIZ; + while (blkoff < bp->b_bcount) { + ((struct direct *) + (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; + blkoff += DIRBLKSIZ; + } + if (softdep_setup_directory_add(bp, dp, dp->i_offset, + dirp->d_ino, newdirbp, 1) == 0) { + bdwrite(bp); + return (UFS_UPDATE(dvp, 0)); + } + /* We have just allocated a directory block in an + * indirect block. Rather than tracking when it gets + * claimed by the inode, we simply do a VOP_FSYNC + * now to ensure that it is there (in case the user + * does a future fsync). Note that we have to unlock + * the inode for the entry that we just entered, as + * the VOP_FSYNC may need to lock other inodes which + * can lead to deadlock if we also hold a lock on + * the newly entered node. 
+ */ + if ((error = BUF_WRITE(bp))) + return (error); + if (tvp != NULL) + VOP_UNLOCK(tvp, 0, td); + error = VOP_FSYNC(dvp, td->td_ucred, MNT_WAIT, td); + if (tvp != NULL) + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); + return (error); + } + if (DOINGASYNC(dvp)) { + bdwrite(bp); + return (UFS_UPDATE(dvp, 0)); + } + error = BUF_WRITE(bp); + ret = UFS_UPDATE(dvp, 1); + if (error == 0) + return (ret); + return (error); + } + + /* + * If dp->i_count is non-zero, then namei found space for the new + * entry in the range dp->i_offset to dp->i_offset + dp->i_count + * in the directory. To use this space, we may have to compact + * the entries located there, by copying them together towards the + * beginning of the block, leaving the free space in one usable + * chunk at the end. + */ + + /* + * Increase size of directory if entry eats into new space. + * This should never push the size past a new multiple of + * DIRBLKSIZE. + * + * N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN. + */ + if (dp->i_offset + dp->i_count > dp->i_size) { + dp->i_size = dp->i_offset + dp->i_count; + DIP(dp, i_size) = dp->i_size; + } + /* + * Get the block containing the space for the new directory entry. + */ + error = UFS_BLKATOFF(dvp, (off_t)dp->i_offset, &dirbuf, &bp); + if (error) { + if (DOINGSOFTDEP(dvp) && newdirbp != NULL) + bdwrite(newdirbp); + return (error); + } + /* + * Find space for the new entry. In the simple case, the entry at + * offset base will have the space. If it does not, then namei + * arranged that compacting the region dp->i_offset to + * dp->i_offset + dp->i_count would yield the space. + */ + ep = (struct direct *)dirbuf; + dsize = ep->d_ino ? DIRSIZ(OFSFMT(dvp), ep) : 0; + spacefree = ep->d_reclen - dsize; + for (loc = ep->d_reclen; loc < dp->i_count; ) { + nep = (struct direct *)(dirbuf + loc); + + /* Trim the existing slot (NB: dsize may be zero). 
*/ + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + + /* Read nep->d_reclen now as the bcopy() may clobber it. */ + loc += nep->d_reclen; + if (nep->d_ino == 0) { + /* + * A mid-block unused entry. Such entries are + * never created by the kernel, but fsck_ffs + * can create them (and it doesn't fix them). + * + * Add up the free space, and initialise the + * relocated entry since we don't bcopy it. + */ + spacefree += nep->d_reclen; + ep->d_ino = 0; + dsize = 0; + continue; + } + dsize = DIRSIZ(OFSFMT(dvp), nep); + spacefree += nep->d_reclen - dsize; +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_move(dp, nep, + dp->i_offset + ((char *)nep - dirbuf), + dp->i_offset + ((char *)ep - dirbuf)); +#endif + if (DOINGSOFTDEP(dvp)) + softdep_change_directoryentry_offset(dp, dirbuf, + (caddr_t)nep, (caddr_t)ep, dsize); + else + bcopy((caddr_t)nep, (caddr_t)ep, dsize); + } + /* + * Here, `ep' points to a directory entry containing `dsize' in-use + * bytes followed by `spacefree' unused bytes. If ep->d_ino == 0, + * then the entry is completely unused (dsize == 0). The value + * of ep->d_reclen is always indeterminate. + * + * Update the pointer fields in the previous entry (if any), + * copy in the new entry, and write out the block. 
+ */ + if (ep->d_ino == 0 || + (ep->d_ino == WINO && + bcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) { + if (spacefree + dsize < newentrysize) + panic("ufs_direnter: compact1"); + dirp->d_reclen = spacefree + dsize; + } else { + if (spacefree < newentrysize) + panic("ufs_direnter: compact2"); + dirp->d_reclen = spacefree; + ep->d_reclen = dsize; + ep = (struct direct *)((char *)ep + dsize); + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL && (ep->d_ino == 0 || + dirp->d_reclen == spacefree)) + ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf)); +#endif + bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, dirbuf - + (dp->i_offset & (DIRBLKSIZ - 1)), + dp->i_offset & ~(DIRBLKSIZ - 1)); +#endif + + if (DOINGSOFTDEP(dvp)) { + (void) softdep_setup_directory_add(bp, dp, + dp->i_offset + (caddr_t)ep - dirbuf, + dirp->d_ino, newdirbp, 0); + bdwrite(bp); + } else { + if (DOINGASYNC(dvp)) { + bdwrite(bp); + error = 0; + } else { + error = BUF_WRITE(bp); + } + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If all went well, and the directory can be shortened, proceed + * with the truncation. Note that we have to unlock the inode for + * the entry that we just entered, as the truncation may need to + * lock other inodes which can lead to deadlock if we also hold a + * lock on the newly entered node. + */ + if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) { + if (tvp != NULL) + VOP_UNLOCK(tvp, 0, td); +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_dirtrunc(dp, dp->i_endoff); +#endif + (void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, + IO_NORMAL | IO_SYNC, cr, td); + if (tvp != NULL) + vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, td); + } + return (error); +} + +/* + * Remove a directory entry after a call to namei, using + * the parameters which it left in nameidata. 
The entry + * dp->i_offset contains the offset into the directory of the + * entry to be eliminated. The dp->i_count field contains the + * size of the previous record in the directory. If this + * is 0, the first entry is being deleted, so we need only + * zero the inode number to mark the entry as free. If the + * entry is not the first in the directory, we must reclaim + * the space of the now empty record by adding the record size + * to the size of the previous entry. + */ +int +ufs_dirremove(dvp, ip, flags, isrmdir) + struct vnode *dvp; + struct inode *ip; + int flags; + int isrmdir; +{ + struct inode *dp; + struct direct *ep; + struct buf *bp; + int error; + + dp = VTOI(dvp); + + if (flags & DOWHITEOUT) { + /* + * Whiteout entry: set d_ino to WINO. + */ + if ((error = + UFS_BLKATOFF(dvp, (off_t)dp->i_offset, (char **)&ep, &bp)) != 0) + return (error); + ep->d_ino = WINO; + ep->d_type = DT_WHT; + goto out; + } + + if ((error = UFS_BLKATOFF(dvp, + (off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0) + return (error); +#ifdef UFS_DIRHASH + /* + * Remove the dirhash entry. This is complicated by the fact + * that `ep' is the previous entry when dp->i_count != 0. + */ + if (dp->i_dirhash != NULL) + ufsdirhash_remove(dp, (dp->i_count == 0) ? ep : + (struct direct *)((char *)ep + ep->d_reclen), dp->i_offset); +#endif + if (dp->i_count == 0) { + /* + * First entry in block: set d_ino to zero. + */ + ep->d_ino = 0; + } else { + /* + * Collapse new free space into previous entry. 
+ */ + ep->d_reclen += dp->i_reclen; + } +#ifdef UFS_DIRHASH + if (dp->i_dirhash != NULL) + ufsdirhash_checkblock(dp, (char *)ep - + ((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)), + dp->i_offset & ~(DIRBLKSIZ - 1)); +#endif +out: + if (DOINGSOFTDEP(dvp)) { + if (ip) { + ip->i_effnlink--; + softdep_change_linkcnt(ip); + softdep_setup_remove(bp, dp, ip, isrmdir); + } + if (softdep_slowdown(dvp)) { + error = BUF_WRITE(bp); + } else { + bdwrite(bp); + error = 0; + } + } else { + if (ip) { + ip->i_effnlink--; + ip->i_nlink--; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + } + if (flags & DOWHITEOUT) + error = BUF_WRITE(bp); + else if (DOINGASYNC(dvp) && dp->i_count != 0) { + bdwrite(bp); + error = 0; + } else + error = BUF_WRITE(bp); + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ +#if defined(FFS) || defined(IFS) + if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 && ip->i_effnlink == 0) + ffs_snapgone(ip); +#endif + return (error); +} + +/* + * Rewrite an existing directory entry to point at the inode + * supplied. The parameters describing the directory entry are + * set up by a call to namei. 
+ */ +int +ufs_dirrewrite(dp, oip, newinum, newtype, isrmdir) + struct inode *dp, *oip; + ino_t newinum; + int newtype; + int isrmdir; +{ + struct buf *bp; + struct direct *ep; + struct vnode *vdp = ITOV(dp); + int error; + + error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset, (char **)&ep, &bp); + if (error) + return (error); + ep->d_ino = newinum; + if (!OFSFMT(vdp)) + ep->d_type = newtype; + oip->i_effnlink--; + if (DOINGSOFTDEP(vdp)) { + softdep_change_linkcnt(oip); + softdep_setup_directory_change(bp, dp, oip, newinum, isrmdir); + bdwrite(bp); + } else { + oip->i_nlink--; + DIP(oip, i_nlink) = oip->i_nlink; + oip->i_flag |= IN_CHANGE; + if (DOINGASYNC(vdp)) { + bdwrite(bp); + error = 0; + } else { + error = BUF_WRITE(bp); + } + } + dp->i_flag |= IN_CHANGE | IN_UPDATE; + /* + * If the last named reference to a snapshot goes away, + * drop its snapshot reference so that it will be reclaimed + * when last open reference goes away. + */ +#if defined(FFS) || defined(IFS) + if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_effnlink == 0) + ffs_snapgone(oip); +#endif + return (error); +} + +/* + * Check if a directory is empty or not. + * Inode supplied must be locked. + * + * Using a struct dirtemplate here is not precisely + * what we want, but better than using a struct direct. + * + * NB: does not handle corrupted directories. + */ +int +ufs_dirempty(ip, parentino, cred) + struct inode *ip; + ino_t parentino; + struct ucred *cred; +{ + doff_t off; + struct dirtemplate dbuf; + struct direct *dp = (struct direct *)&dbuf; + int error, count, namlen; +#define MINDIRSIZ (sizeof (struct dirtemplate) / 2) + + for (off = 0; off < ip->i_size; off += dp->d_reclen) { + error = vn_rdwr(UIO_READ, ITOV(ip), (caddr_t)dp, MINDIRSIZ, + off, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, cred, + NOCRED, &count, (struct thread *)0); + /* + * Since we read MINDIRSIZ, residual must + * be 0 unless we're at end of file. 
+ */ + if (error || count != 0) + return (0); + /* avoid infinite loops */ + if (dp->d_reclen == 0) + return (0); + /* skip empty entries */ + if (dp->d_ino == 0 || dp->d_ino == WINO) + continue; + /* accept only "." and ".." */ +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(ITOV(ip))) + namlen = dp->d_type; + else + namlen = dp->d_namlen; +# else + namlen = dp->d_namlen; +# endif + if (namlen > 2) + return (0); + if (dp->d_name[0] != '.') + return (0); + /* + * At this point namlen must be 1 or 2. + * 1 implies ".", 2 implies ".." if second + * char is also "." + */ + if (namlen == 1 && dp->d_ino == ip->i_number) + continue; + if (dp->d_name[1] == '.' && dp->d_ino == parentino) + continue; + return (0); + } + return (1); +} + +/* + * Check if source directory is in the path of the target directory. + * Target is supplied locked, source is unlocked. + * The target is always vput before returning. + */ +int +ufs_checkpath(source, target, cred) + struct inode *source, *target; + struct ucred *cred; +{ + struct vnode *vp; + int error, namlen; + ino_t rootino; + struct dirtemplate dirbuf; + + vp = ITOV(target); + if (target->i_number == source->i_number) { + error = EEXIST; + goto out; + } + rootino = ROOTINO; + error = 0; + if (target->i_number == rootino) + goto out; + + for (;;) { + if (vp->v_type != VDIR) { + error = ENOTDIR; + break; + } + error = vn_rdwr(UIO_READ, vp, (caddr_t)&dirbuf, + sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, + IO_NODELOCKED | IO_NOMACCHECK, cred, NOCRED, (int *)0, + (struct thread *)0); + if (error != 0) + break; +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (OFSFMT(vp)) + namlen = dirbuf.dotdot_type; + else + namlen = dirbuf.dotdot_namlen; +# else + namlen = dirbuf.dotdot_namlen; +# endif + if (namlen != 2 || + dirbuf.dotdot_name[0] != '.' 
|| + dirbuf.dotdot_name[1] != '.') { + error = ENOTDIR; + break; + } + if (dirbuf.dotdot_ino == source->i_number) { + error = EINVAL; + break; + } + if (dirbuf.dotdot_ino == rootino) + break; + vput(vp); + error = VFS_VGET(vp->v_mount, dirbuf.dotdot_ino, + LK_EXCLUSIVE, &vp); + if (error) { + vp = NULL; + break; + } + } + +out: + if (error == ENOTDIR) + printf("checkpath: .. not a directory\n"); + if (vp != NULL) + vput(vp); + return (error); +} diff --git a/src/sys/ufs/ufs/ufs_quota.c b/src/sys/ufs/ufs/ufs_quota.c new file mode 100644 index 0000000..ce03528 --- /dev/null +++ b/src/sys/ufs/ufs/ufs_quota.c @@ -0,0 +1,1061 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Robert Elz at The University of Melbourne. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_quota.c,v 1.70 2003/11/05 04:30:08 kan Exp $"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +SYSCTL_DECL(_security_bsd); + +static int unprivileged_get_quota = 0; +SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, + &unprivileged_get_quota, 0, + "Unprivileged processes may retrieve quotas for other uids and gids"); + +static MALLOC_DEFINE(M_DQUOT, "UFS quota", "UFS quota entries"); + +/* + * Quota name to error message mapping. + */ +static char *quotatypes[] = INITQFNAMES; + +static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int); +static int chkiqchg(struct inode *, ino_t, struct ucred *, int); +static int dqget(struct vnode *, + u_long, struct ufsmount *, int, struct dquot **); +static int dqsync(struct vnode *, struct dquot *); +static void dqflush(struct vnode *); + +#ifdef DIAGNOSTIC +static void dqref(struct dquot *); +static void chkdquot(struct inode *); +#endif + +/* + * Set up the quotas for an inode. 
+ *
+ * This routine completely defines the semantics of quotas.
+ * If other criterion want to be used to establish quotas, the
+ * MAXQUOTAS value in quotas.h should be increased, and the
+ * additional dquots set up here.
+ */
+/*
+ * Returns 0 when both dquot slots are populated, or when dqget()
+ * reports EINVAL (quotas not enabled for that type).  Any other
+ * dqget() error is returned.  Slots that are already set (not NODQUOT)
+ * are left untouched.
+ */
+int
+getinoquota(ip)
+	struct inode *ip;
+{
+	struct ufsmount *ump;
+	struct vnode *vp = ITOV(ip);
+	int error;
+
+	ump = VFSTOUFS(vp->v_mount);
+	/*
+	 * Set up the user quota based on file uid.
+	 * EINVAL means that quotas are not enabled.
+	 */
+	if (ip->i_dquot[USRQUOTA] == NODQUOT &&
+	    (error =
+		dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) &&
+	    error != EINVAL)
+		return (error);
+	/*
+	 * Set up the group quota based on file gid.
+	 * EINVAL means that quotas are not enabled.
+	 */
+	if (ip->i_dquot[GRPQUOTA] == NODQUOT &&
+	    (error =
+		dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) &&
+	    error != EINVAL)
+		return (error);
+	return (0);
+}
+
+/*
+ * Update disk usage, and take corrective action.
+ *
+ * ip     - inode whose attached dquots (ip->i_dquot[]) are charged.
+ * change - block delta: negative releases usage, positive charges it,
+ *          zero is a no-op.
+ * cred   - credential used for the suser_cred() check and, in
+ *          chkdqchg(), to decide whether to print a message.
+ * flags  - FORCE skips the limit checks; CHOWN skips the DIAGNOSTIC
+ *          chkdquot() sanity check.
+ *
+ * Returns 0, or the error from chkdqchg() when a positive change is
+ * rejected.  Nothing is charged to any dquot if a check fails.
+ */
+int
+chkdq(ip, change, cred, flags)
+	struct inode *ip;
+	ufs2_daddr_t change;
+	struct ucred *cred;
+	int flags;
+{
+	struct dquot *dq;
+	ufs2_daddr_t ncurblocks;
+	int i, error;
+
+#ifdef DIAGNOSTIC
+	if ((flags & CHOWN) == 0)
+		chkdquot(ip);
+#endif
+	if (change == 0)
+		return (0);
+	/*
+	 * Releasing space never fails: usage is clamped at zero and
+	 * DQ_BLKS (the "failure already reported" flag set by chkdqchg())
+	 * is cleared.
+	 */
+	if (change < 0) {
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if ((dq = ip->i_dquot[i]) == NODQUOT)
+				continue;
+			/* Sleep until DQ_LOCK is dropped by its holder. */
+			while (dq->dq_flags & DQ_LOCK) {
+				dq->dq_flags |= DQ_WANT;
+				(void) tsleep(dq, PINOD+1, "chkdq1", 0);
+			}
+			ncurblocks = dq->dq_curblocks + change;
+			if (ncurblocks >= 0)
+				dq->dq_curblocks = ncurblocks;
+			else
+				dq->dq_curblocks = 0;
+			dq->dq_flags &= ~DQ_BLKS;
+			dq->dq_flags |= DQ_MOD;
+		}
+		return (0);
+	}
+	/*
+	 * Limits are enforced only when FORCE is clear and suser_cred()
+	 * returns non-zero.  Every quota type is checked before any
+	 * usage counter is modified.
+	 */
+	if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if ((dq = ip->i_dquot[i]) == NODQUOT)
+				continue;
+			error = chkdqchg(ip, change, cred, i);
+			if (error)
+				return (error);
+		}
+	}
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if ((dq = ip->i_dquot[i]) == NODQUOT)
+			continue;
+		while (dq->dq_flags & DQ_LOCK) {
+			dq->dq_flags |= DQ_WANT;
+			(void) tsleep(dq, PINOD+1, "chkdq2", 0);
+		}
+		/* Reset timer when crossing soft limit */
+		if (dq->dq_curblocks + change >= dq->dq_bsoftlimit &&
+		    dq->dq_curblocks < dq->dq_bsoftlimit)
+			dq->dq_btime = time_second +
+			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[i];
+		dq->dq_curblocks += change;
+		dq->dq_flags |= DQ_MOD;
+	}
+	return (0);
+}
+
+/*
+ * Check for a valid change to a users allocation.
+ * Issue an error message if appropriate.
+ *
+ * type selects which of ip->i_dquot[] is checked; the caller must have
+ * verified that it is not NODQUOT, since it is dereferenced directly.
+ *
+ * Returns EDQUOT when the new usage would reach the hard limit, or
+ * would be at/over the soft limit after dq_btime has passed; otherwise
+ * returns 0.  Messages go only to the file's owner (ip->i_uid ==
+ * cred->cr_uid), and DQ_BLKS suppresses repeated failure messages.
+ */
+static int
+chkdqchg(ip, change, cred, type)
+	struct inode *ip;
+	ufs2_daddr_t change;
+	struct ucred *cred;
+	int type;
+{
+	struct dquot *dq = ip->i_dquot[type];
+	ufs2_daddr_t ncurblocks = dq->dq_curblocks + change;
+
+	/*
+	 * If user would exceed their hard limit, disallow space allocation.
+	 * A hard limit of 0 means "no hard limit".
+	 */
+	if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) {
+		if ((dq->dq_flags & DQ_BLKS) == 0 &&
+		    ip->i_uid == cred->cr_uid) {
+			uprintf("\n%s: write failed, %s disk limit reached\n",
+			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    quotatypes[type]);
+			dq->dq_flags |= DQ_BLKS;
+		}
+		return (EDQUOT);
+	}
+	/*
+	 * If user is over their soft limit for too long, disallow space
+	 * allocation. Reset time limit as they cross their soft limit.
+	 */
+	if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) {
+		/* First crossing: start the grace period and only warn. */
+		if (dq->dq_curblocks < dq->dq_bsoftlimit) {
+			dq->dq_btime = time_second +
+			    VFSTOUFS(ITOV(ip)->v_mount)->um_btime[type];
+			if (ip->i_uid == cred->cr_uid)
+				uprintf("\n%s: warning, %s %s\n",
+				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    quotatypes[type], "disk quota exceeded");
+			return (0);
+		}
+		/* Already over the soft limit: fail once the timer expires. */
+		if (time_second > dq->dq_btime) {
+			if ((dq->dq_flags & DQ_BLKS) == 0 &&
+			    ip->i_uid == cred->cr_uid) {
+				uprintf("\n%s: write failed, %s %s\n",
+				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    quotatypes[type],
+				    "disk quota exceeded for too long");
+				dq->dq_flags |= DQ_BLKS;
+			}
+			return (EDQUOT);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Check the inode limit, applying corrective action.
+ */
+/*
+ * ip     - inode whose attached dquots (ip->i_dquot[]) are charged.
+ * change - inode-count delta; zero is a no-op.
+ * cred   - credential used for the suser_cred() check and, in
+ *          chkiqchg(), to decide whether to print a message.
+ * flags  - FORCE skips the limit checks; CHOWN skips the DIAGNOSTIC
+ *          chkdquot() sanity check.
+ *
+ * Returns 0, or the error from chkiqchg() when the change is rejected.
+ *
+ * NOTE(review): the existing XXX comments below say that `change' and
+ * `ncurinodes' are unsigned.  If so, the `change < 0' branch can never
+ * be taken and a "negative" change is applied only through the wrapping
+ * addition in the final loop.  Confirm against the ino_t definition and
+ * the callers before relying on the release path.
+ */
+int
+chkiq(ip, change, cred, flags)
+	struct inode *ip;
+	ino_t change;
+	struct ucred *cred;
+	int flags;
+{
+	struct dquot *dq;
+	ino_t ncurinodes;
+	int i, error;
+
+#ifdef DIAGNOSTIC
+	if ((flags & CHOWN) == 0)
+		chkdquot(ip);
+#endif
+	if (change == 0)
+		return (0);
+	/* XXX: change is unsigned */
+	if (change < 0) {
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if ((dq = ip->i_dquot[i]) == NODQUOT)
+				continue;
+			/* Sleep until DQ_LOCK is dropped by its holder. */
+			while (dq->dq_flags & DQ_LOCK) {
+				dq->dq_flags |= DQ_WANT;
+				(void) tsleep(dq, PINOD+1, "chkiq1", 0);
+			}
+			ncurinodes = dq->dq_curinodes + change;
+			/* XXX: ncurinodes is unsigned */
+			if (ncurinodes >= 0)
+				dq->dq_curinodes = ncurinodes;
+			else
+				dq->dq_curinodes = 0;
+			dq->dq_flags &= ~DQ_INODS;
+			dq->dq_flags |= DQ_MOD;
+		}
+		return (0);
+	}
+	/*
+	 * Limits are enforced only when FORCE is clear and suser_cred()
+	 * returns non-zero.  Every quota type is checked before any
+	 * usage counter is modified.
+	 */
+	if ((flags & FORCE) == 0 && suser_cred(cred, 0)) {
+		for (i = 0; i < MAXQUOTAS; i++) {
+			if ((dq = ip->i_dquot[i]) == NODQUOT)
+				continue;
+			error = chkiqchg(ip, change, cred, i);
+			if (error)
+				return (error);
+		}
+	}
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if ((dq = ip->i_dquot[i]) == NODQUOT)
+			continue;
+		while (dq->dq_flags & DQ_LOCK) {
+			dq->dq_flags |= DQ_WANT;
+			(void) tsleep(dq, PINOD+1, "chkiq2", 0);
+		}
+		/* Reset timer when crossing soft limit */
+		if (dq->dq_curinodes + change >= dq->dq_isoftlimit &&
+		    dq->dq_curinodes < dq->dq_isoftlimit)
+			dq->dq_itime = time_second +
+			    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[i];
+		dq->dq_curinodes += change;
+		dq->dq_flags |= DQ_MOD;
+	}
+	return (0);
+}
+
+/*
+ * Check for a valid change to a users allocation.
+ * Issue an error message if appropriate.
+ *
+ * type selects which of ip->i_dquot[] is checked; the caller must have
+ * verified that it is not NODQUOT, since it is dereferenced directly.
+ *
+ * Returns EDQUOT when the new inode count would reach the hard limit,
+ * or would be at/over the soft limit after dq_itime has passed;
+ * otherwise returns 0.  Messages go only to the file's owner, and
+ * DQ_INODS suppresses repeated failure messages.
+ */
+static int
+chkiqchg(ip, change, cred, type)
+	struct inode *ip;
+	ino_t change;
+	struct ucred *cred;
+	int type;
+{
+	struct dquot *dq = ip->i_dquot[type];
+	ino_t ncurinodes = dq->dq_curinodes + change;
+
+	/*
+	 * If user would exceed their hard limit, disallow inode allocation.
+	 * A hard limit of 0 means "no hard limit".
+	 */
+	if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) {
+		if ((dq->dq_flags & DQ_INODS) == 0 &&
+		    ip->i_uid == cred->cr_uid) {
+			uprintf("\n%s: write failed, %s inode limit reached\n",
+			    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+			    quotatypes[type]);
+			dq->dq_flags |= DQ_INODS;
+		}
+		return (EDQUOT);
+	}
+	/*
+	 * If user is over their soft limit for too long, disallow inode
+	 * allocation. Reset time limit as they cross their soft limit.
+	 */
+	if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) {
+		/* First crossing: start the grace period and only warn. */
+		if (dq->dq_curinodes < dq->dq_isoftlimit) {
+			dq->dq_itime = time_second +
+			    VFSTOUFS(ITOV(ip)->v_mount)->um_itime[type];
+			if (ip->i_uid == cred->cr_uid)
+				uprintf("\n%s: warning, %s %s\n",
+				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    quotatypes[type], "inode quota exceeded");
+			return (0);
+		}
+		/* Already over the soft limit: fail once the timer expires. */
+		if (time_second > dq->dq_itime) {
+			if ((dq->dq_flags & DQ_INODS) == 0 &&
+			    ip->i_uid == cred->cr_uid) {
+				uprintf("\n%s: write failed, %s %s\n",
+				    ITOV(ip)->v_mount->mnt_stat.f_mntonname,
+				    quotatypes[type],
+				    "inode quota exceeded for too long");
+				dq->dq_flags |= DQ_INODS;
+			}
+			return (EDQUOT);
+		}
+	}
+	return (0);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * On filesystems with quotas enabled, it is an error for a file to change
+ * size and not to have a dquot structure associated with it.
+ *
+ * Quota types with no quota file, or whose quota file is currently being
+ * opened or closed, are skipped.  Panics on the first missing dquot.
+ */
+static void
+chkdquot(ip)
+	struct inode *ip;
+{
+	struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount);
+	int i;
+
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (ump->um_quotas[i] == NULLVP ||
+		    (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING)))
+			continue;
+		if (ip->i_dquot[i] == NODQUOT) {
+			vprint("chkdquot: missing dquot", ITOV(ip));
+			panic("chkdquot: missing dquot");
+		}
+	}
+}
+#endif
+
+/*
+ * Code to process quotactl commands.
+ */
+
+/*
+ * Q_QUOTAON - set up a quota file for a particular filesystem.
+ */
+/*
+ * td    - calling thread; its credential is checked and then saved in
+ *         ump->um_cred[type] for later quota-file I/O.
+ * mp    - mount point to enable quotas on.
+ * type  - index into the per-type arrays (um_quotas[], um_qflags[], ...).
+ * fname - user-space path of the quota file (looked up with
+ *         UIO_USERSPACE).
+ *
+ * Returns the suser_cred()/vn_open() error, EACCES if the file is not a
+ * regular file, or the first getinoquota() error seen while scanning
+ * the mount's vnodes (in which case quotas are turned off again).
+ */
+int
+quotaon(td, mp, type, fname)
+	struct thread *td;
+	struct mount *mp;
+	int type;
+	caddr_t fname;
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct vnode *vp, **vpp;
+	struct vnode *nextvp;
+	struct dquot *dq;
+	int error, flags;
+	struct nameidata nd;
+
+	error = suser_cred(td->td_ucred, PRISON_ROOT);
+	if (error)
+		return (error);
+
+	vpp = &ump->um_quotas[type];
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td);
+	flags = FREAD | FWRITE;
+	error = vn_open(&nd, &flags, 0, -1);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	vp = nd.ni_vp;
+	VOP_UNLOCK(vp, 0, td);
+	if (vp->v_type != VREG) {
+		(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
+		return (EACCES);
+	}
+	/* A different quota file was active for this type: shut it down. */
+	if (*vpp != vp)
+		quotaoff(td, mp, type);
+	ump->um_qflags[type] |= QTF_OPENING;
+	mp->mnt_flag |= MNT_QUOTA;
+	/*
+	 * NOTE(review): vp was unlocked by VOP_UNLOCK() above and is not
+	 * re-locked before this assertion -- confirm what it is meant
+	 * to check.
+	 */
+	ASSERT_VOP_LOCKED(vp, "quotaon");
+	vp->v_vflag |= VV_SYSTEM;
+	*vpp = vp;
+	/*
+	 * Save the credential of the process that turned on quotas.
+	 * Set up the time limits for this quota.
+	 * The defaults are overridden by the non-zero times stored in
+	 * the record for id 0, if that record can be read.
+	 */
+	ump->um_cred[type] = crhold(td->td_ucred);
+	ump->um_btime[type] = MAX_DQ_TIME;
+	ump->um_itime[type] = MAX_IQ_TIME;
+	if (dqget(NULLVP, 0, ump, type, &dq) == 0) {
+		if (dq->dq_btime > 0)
+			ump->um_btime[type] = dq->dq_btime;
+		if (dq->dq_itime > 0)
+			ump->um_itime[type] = dq->dq_itime;
+		dqrele(NULLVP, dq);
+	}
+	/*
+	 * Search vnodes associated with this mount point,
+	 * adding references to quota file being opened.
+	 * NB: only need to add dquot's for inodes being modified.
+	 */
+	MNT_ILOCK(mp);
+again:
+	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nextvp) {
+		/* The vnode left this mount: restart from the list head. */
+		if (vp->v_mount != mp)
+			goto again;
+		nextvp = TAILQ_NEXT(vp, v_nmntvnodes);
+		/* Hand off from the mount interlock to the vnode interlock. */
+		VI_LOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+			MNT_ILOCK(mp);
+			goto again;
+		}
+		/* Skip vnodes that are not open for writing. */
+		if (vp->v_type == VNON || vp->v_writecount == 0) {
+			VOP_UNLOCK(vp, 0, td);
+			vrele(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+		error = getinoquota(VTOI(vp));
+		VOP_UNLOCK(vp, 0, td);
+		vrele(vp);
+		MNT_ILOCK(mp);
+		if (error)
+			break;
+		/* The list changed while the mount interlock was dropped. */
+		if (TAILQ_NEXT(vp, v_nmntvnodes) != nextvp)
+			goto again;
+	}
+	MNT_IUNLOCK(mp);
+	ump->um_qflags[type] &= ~QTF_OPENING;
+	if (error)
+		quotaoff(td, mp, type);
+	return (error);
+}
+
+/*
+ * Q_QUOTAOFF - turn off disk quotas for a filesystem.
+ *
+ * Returns the suser_cred() error, 0 if no quota file is active for
+ * `type', or otherwise the result of closing the quota file.
+ * MNT_QUOTA is cleared on the mount once no quota type remains active.
+ */
+int
+quotaoff(td, mp, type)
+	struct thread *td;
+	struct mount *mp;
+	int type;
+{
+	struct vnode *vp;
+	struct vnode *qvp, *nextvp;
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct dquot *dq;
+	struct inode *ip;
+	int error;
+
+	error = suser_cred(td->td_ucred, PRISON_ROOT);
+	if (error)
+		return (error);
+
+	if ((qvp = ump->um_quotas[type]) == NULLVP)
+		return (0);
+	/* While QTF_CLOSING is set, dqget() refuses this type with EINVAL. */
+	ump->um_qflags[type] |= QTF_CLOSING;
+	/*
+	 * Search vnodes associated with this mount point,
+	 * deleting any references to quota file being closed.
+	 */
+	MNT_ILOCK(mp);
+again:
+	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nextvp) {
+		/* The vnode left this mount: restart from the list head. */
+		if (vp->v_mount != mp)
+			goto again;
+		nextvp = TAILQ_NEXT(vp, v_nmntvnodes);
+
+		VI_LOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (vp->v_type == VNON) {
+			VI_UNLOCK(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) {
+			MNT_ILOCK(mp);
+			goto again;
+		}
+		/* Detach the dquot from the inode, then drop its reference. */
+		ip = VTOI(vp);
+		dq = ip->i_dquot[type];
+		ip->i_dquot[type] = NODQUOT;
+		dqrele(vp, dq);
+		VOP_UNLOCK(vp, 0, td);
+		vrele(vp);
+		MNT_ILOCK(mp);
+		/* The list changed while the mount interlock was dropped. */
+		if (TAILQ_NEXT(vp, v_nmntvnodes) != nextvp)
+			goto again;
+	}
+	MNT_IUNLOCK(mp);
+	dqflush(qvp);
+	/*
+	 * NOTE(review): no lock on qvp is taken in this function --
+	 * confirm what this assertion is meant to check.
+	 */
+	ASSERT_VOP_LOCKED(qvp, "quotaoff");
+	qvp->v_vflag &= ~VV_SYSTEM;
+	error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td);
+	ump->um_quotas[type] = NULLVP;
+	crfree(ump->um_cred[type]);
+	ump->um_cred[type] = NOCRED;
+	ump->um_qflags[type] &= ~QTF_CLOSING;
+	/* `type' is reused as a scan index: is any quota type still on? */
+	for (type = 0; type < MAXQUOTAS; type++)
+		if (ump->um_quotas[type] != NULLVP)
+			break;
+	if (type == MAXQUOTAS)
+		mp->mnt_flag &= ~MNT_QUOTA;
+	return (error);
+}
+
+/*
+ * Q_GETQUOTA - return current values in a dqblk structure.
+ *
+ * A caller may always read its own uid's quota and the quota of any
+ * group it is a member of.  Reading other ids requires either the
+ * security.bsd.unprivileged_get_quota sysctl to be non-zero or
+ * suser_cred() to succeed.  Types other than USRQUOTA and GRPQUOTA
+ * yield EINVAL.
+ *
+ * addr is the user-space destination for the struct dqblk (copyout()).
+ * Returns the permission, dqget() or copyout() error, or 0.
+ */
+int
+getquota(td, mp, id, type, addr)
+	struct thread *td;
+	struct mount *mp;
+	u_long id;
+	int type;
+	caddr_t addr;
+{
+	struct dquot *dq;
+	int error;
+
+	switch (type) {
+	case USRQUOTA:
+		if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) {
+			error = suser_cred(td->td_ucred, PRISON_ROOT);
+			if (error)
+				return (error);
+		}
+		break;
+
+	case GRPQUOTA:
+		if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) {
+			error = suser_cred(td->td_ucred, PRISON_ROOT);
+			if (error)
+				return (error);
+		}
+		break;
+
+	default:
+		return (EINVAL);
+	}
+
+	error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&dq->dq_dqb, addr, sizeof (struct dqblk));
+	dqrele(NULLVP, dq);
+	return (error);
+}
+
+/*
+ * Q_SETQUOTA - assign an entire dqblk structure.
+ */ +int +setquota(td, mp, id, type, addr) + struct thread *td; + struct mount *mp; + u_long id; + int type; + caddr_t addr; +{ + struct dquot *dq; + struct dquot *ndq; + struct ufsmount *ump = VFSTOUFS(mp); + struct dqblk newlim; + int error; + + error = suser_cred(td->td_ucred, PRISON_ROOT); + if (error) + return (error); + + error = copyin(addr, (caddr_t)&newlim, sizeof (struct dqblk)); + if (error) + return (error); + error = dqget(NULLVP, id, ump, type, &ndq); + if (error) + return (error); + dq = ndq; + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + (void) tsleep(dq, PINOD+1, "setqta", 0); + } + /* + * Copy all but the current values. + * Reset time limit if previously had no soft limit or were + * under it, but now have a soft limit and are over it. + */ + newlim.dqb_curblocks = dq->dq_curblocks; + newlim.dqb_curinodes = dq->dq_curinodes; + if (dq->dq_id != 0) { + newlim.dqb_btime = dq->dq_btime; + newlim.dqb_itime = dq->dq_itime; + } + if (newlim.dqb_bsoftlimit && + dq->dq_curblocks >= newlim.dqb_bsoftlimit && + (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) + newlim.dqb_btime = time_second + ump->um_btime[type]; + if (newlim.dqb_isoftlimit && + dq->dq_curinodes >= newlim.dqb_isoftlimit && + (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) + newlim.dqb_itime = time_second + ump->um_itime[type]; + dq->dq_dqb = newlim; + if (dq->dq_curblocks < dq->dq_bsoftlimit) + dq->dq_flags &= ~DQ_BLKS; + if (dq->dq_curinodes < dq->dq_isoftlimit) + dq->dq_flags &= ~DQ_INODS; + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + else + dq->dq_flags &= ~DQ_FAKE; + dq->dq_flags |= DQ_MOD; + dqrele(NULLVP, dq); + return (0); +} + +/* + * Q_SETUSE - set current inode and block usage. 
+ */
+/*
+ * addr points at a user-space struct dqblk; only its dqb_curblocks and
+ * dqb_curinodes fields are used.
+ *
+ * Returns the suser_cred(), copyin() or dqget() error, or 0.
+ */
+int
+setuse(td, mp, id, type, addr)
+	struct thread *td;
+	struct mount *mp;
+	u_long id;
+	int type;
+	caddr_t addr;
+{
+	struct dquot *dq;
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct dquot *ndq;
+	struct dqblk usage;
+	int error;
+
+	error = suser_cred(td->td_ucred, PRISON_ROOT);
+	if (error)
+		return (error);
+
+	error = copyin(addr, (caddr_t)&usage, sizeof (struct dqblk));
+	if (error)
+		return (error);
+	error = dqget(NULLVP, id, ump, type, &ndq);
+	if (error)
+		return (error);
+	dq = ndq;
+	/* Sleep until DQ_LOCK is dropped by its holder. */
+	while (dq->dq_flags & DQ_LOCK) {
+		dq->dq_flags |= DQ_WANT;
+		(void) tsleep(dq, PINOD+1, "setuse", 0);
+	}
+	/*
+	 * Reset time limit if have a soft limit and were
+	 * previously under it, but are now over it.
+	 */
+	if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit &&
+	    usage.dqb_curblocks >= dq->dq_bsoftlimit)
+		dq->dq_btime = time_second + ump->um_btime[type];
+	if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit &&
+	    usage.dqb_curinodes >= dq->dq_isoftlimit)
+		dq->dq_itime = time_second + ump->um_itime[type];
+	dq->dq_curblocks = usage.dqb_curblocks;
+	dq->dq_curinodes = usage.dqb_curinodes;
+	/* Back under the soft limit: allow failure messages again. */
+	if (dq->dq_curblocks < dq->dq_bsoftlimit)
+		dq->dq_flags &= ~DQ_BLKS;
+	if (dq->dq_curinodes < dq->dq_isoftlimit)
+		dq->dq_flags &= ~DQ_INODS;
+	dq->dq_flags |= DQ_MOD;
+	dqrele(NULLVP, dq);
+	return (0);
+}
+
+/*
+ * Q_SYNC - sync quota files to disk.
+ *
+ * Always returns 0: dqsync() results are discarded, and vnodes that
+ * cannot be locked without waiting (vget() with LK_NOWAIT) are skipped.
+ */
+int
+qsync(mp)
+	struct mount *mp;
+{
+	struct ufsmount *ump = VFSTOUFS(mp);
+	struct thread *td = curthread;		/* XXX */
+	struct vnode *vp, *nextvp;
+	struct dquot *dq;
+	int i, error;
+
+	/*
+	 * Check if the mount point has any quotas.
+	 * If not, simply return.
+	 */
+	for (i = 0; i < MAXQUOTAS; i++)
+		if (ump->um_quotas[i] != NULLVP)
+			break;
+	if (i == MAXQUOTAS)
+		return (0);
+	/*
+	 * Search vnodes associated with this mount point,
+	 * synchronizing any modified dquot structures.
+	 */
+	MNT_ILOCK(mp);
+again:
+	for (vp = TAILQ_FIRST(&mp->mnt_nvnodelist); vp != NULL; vp = nextvp) {
+		/* The vnode left this mount: restart from the list head. */
+		if (vp->v_mount != mp)
+			goto again;
+		nextvp = TAILQ_NEXT(vp, v_nmntvnodes);
+		VI_LOCK(vp);
+		MNT_IUNLOCK(mp);
+		if (vp->v_type == VNON) {
+			VI_UNLOCK(vp);
+			MNT_ILOCK(mp);
+			continue;
+		}
+		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
+		if (error) {
+			MNT_ILOCK(mp);
+			/* ENOENT restarts the scan; other errors skip vp. */
+			if (error == ENOENT)
+				goto again;
+			continue;
+		}
+		for (i = 0; i < MAXQUOTAS; i++) {
+			dq = VTOI(vp)->i_dquot[i];
+			if (dq != NODQUOT && (dq->dq_flags & DQ_MOD))
+				dqsync(vp, dq);
+		}
+		vput(vp);
+		MNT_ILOCK(mp);
+		/* The list changed while the mount interlock was dropped. */
+		if (TAILQ_NEXT(vp, v_nmntvnodes) != nextvp)
+			goto again;
+	}
+	MNT_IUNLOCK(mp);
+	return (0);
+}
+
+/*
+ * Code pertaining to management of the in-core dquot data structures.
+ */
+/*
+ * DQHASH maps a (quota-file vnode, id) pair to its hash bucket.
+ * dqhash is the index mask filled in by hashinit() in dqinit(), so
+ * valid bucket indices are 0..dqhash.
+ */
+#define DQHASH(dqvp, id) \
+	(&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash])
+static LIST_HEAD(dqhash, dquot) *dqhashtbl;
+static u_long dqhash;
+
+/*
+ * Dquot free list.
+ * Holds dquots whose reference count has dropped to zero (see dqrele());
+ * dqget() recycles entries from its head.
+ */
+#define	DQUOTINC	5	/* minimum free dquots desired */
+static TAILQ_HEAD(dqfreelist, dquot) dqfreelist;
+static long numdquot, desireddquot = DQUOTINC;
+
+/*
+ * Initialize the quota system.
+ * Allocates the dquot hash table and empties the free list.
+ */
+void
+dqinit()
+{
+
+	dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash);
+	TAILQ_INIT(&dqfreelist);
+}
+
+/*
+ * Shut down the quota system.
+ * Destroys the hash table and frees every dquot on the free list.
+ */
+void
+dquninit()
+{
+	struct dquot *dq;
+
+	hashdestroy(dqhashtbl, M_DQUOT, dqhash);
+	while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) {
+		TAILQ_REMOVE(&dqfreelist, dq, dq_freelist);
+		free(dq, M_DQUOT);
+	}
+}
+
+/*
+ * Obtain a dquot structure for the specified identifier and quota file
+ * reading the information from the file if necessary.
+ */ +static int +dqget(vp, id, ump, type, dqp) + struct vnode *vp; + u_long id; + struct ufsmount *ump; + int type; + struct dquot **dqp; +{ + struct thread *td = curthread; /* XXX */ + struct dquot *dq; + struct dqhash *dqh; + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + dqvp = ump->um_quotas[type]; + if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { + *dqp = NODQUOT; + return (EINVAL); + } + /* + * Check the cache first. + */ + dqh = DQHASH(dqvp, id); + LIST_FOREACH(dq, dqh, dq_hash) { + if (dq->dq_id != id || + dq->dq_ump->um_quotas[dq->dq_type] != dqvp) + continue; + /* + * Cache hit with no references. Take + * the structure off the free list. + */ + if (dq->dq_cnt == 0) + TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); + DQREF(dq); + *dqp = dq; + return (0); + } + /* + * Not in cache, allocate a new one. + */ + if (TAILQ_FIRST(&dqfreelist) == NODQUOT && + numdquot < MAXQUOTAS * desiredvnodes) + desireddquot += DQUOTINC; + if (numdquot < desireddquot) { + dq = (struct dquot *)malloc(sizeof *dq, M_DQUOT, + M_WAITOK | M_ZERO); + numdquot++; + } else { + if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { + tablefull("dquot"); + *dqp = NODQUOT; + return (EUSERS); + } + if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) + panic("dqget: free dquot isn't"); + TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); + if (dq->dq_ump != NULL) + LIST_REMOVE(dq, dq_hash); + } + /* + * Initialize the contents of the dquot structure. 
+ */ + if (vp != dqvp) + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); + LIST_INSERT_HEAD(dqh, dq, dq_hash); + DQREF(dq); + dq->dq_flags = DQ_LOCK; + dq->dq_id = id; + dq->dq_ump = ump; + dq->dq_type = type; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_READ; + auio.uio_td = (struct thread *)0; + error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); + if (auio.uio_resid == sizeof(struct dqblk) && error == 0) + bzero((caddr_t)&dq->dq_dqb, sizeof(struct dqblk)); + if (vp != dqvp) + VOP_UNLOCK(dqvp, 0, td); + if (dq->dq_flags & DQ_WANT) + wakeup(dq); + dq->dq_flags = 0; + /* + * I/O error in reading quota file, release + * quota structure and reflect problem to caller. + */ + if (error) { + LIST_REMOVE(dq, dq_hash); + dqrele(vp, dq); + *dqp = NODQUOT; + return (error); + } + /* + * Check for no limit to enforce. + * Initialize time values if necessary. + */ + if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && + dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) + dq->dq_flags |= DQ_FAKE; + if (dq->dq_id != 0) { + if (dq->dq_btime == 0) + dq->dq_btime = time_second + ump->um_btime[type]; + if (dq->dq_itime == 0) + dq->dq_itime = time_second + ump->um_itime[type]; + } + *dqp = dq; + return (0); +} + +#ifdef DIAGNOSTIC +/* + * Obtain a reference to a dquot. + */ +static void +dqref(dq) + struct dquot *dq; +{ + + dq->dq_cnt++; +} +#endif + +/* + * Release a reference to a dquot. + */ +void +dqrele(vp, dq) + struct vnode *vp; + struct dquot *dq; +{ + + if (dq == NODQUOT) + return; + if (dq->dq_cnt > 1) { + dq->dq_cnt--; + return; + } + if (dq->dq_flags & DQ_MOD) + (void) dqsync(vp, dq); + if (--dq->dq_cnt > 0) + return; + TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); +} + +/* + * Update the disk quota in the quota file. 
+ */ +static int +dqsync(vp, dq) + struct vnode *vp; + struct dquot *dq; +{ + struct thread *td = curthread; /* XXX */ + struct vnode *dqvp; + struct iovec aiov; + struct uio auio; + int error; + + if (dq == NODQUOT) + panic("dqsync: dquot"); + if ((dq->dq_flags & DQ_MOD) == 0) + return (0); + if ((dqvp = dq->dq_ump->um_quotas[dq->dq_type]) == NULLVP) + panic("dqsync: file"); + (void) vn_write_suspend_wait(dqvp, NULL, V_WAIT); + if (vp != dqvp) + vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY, td); + while (dq->dq_flags & DQ_LOCK) { + dq->dq_flags |= DQ_WANT; + (void) tsleep(dq, PINOD+2, "dqsync", 0); + if ((dq->dq_flags & DQ_MOD) == 0) { + if (vp != dqvp) + VOP_UNLOCK(dqvp, 0, td); + return (0); + } + } + dq->dq_flags |= DQ_LOCK; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = (caddr_t)&dq->dq_dqb; + aiov.iov_len = sizeof (struct dqblk); + auio.uio_resid = sizeof (struct dqblk); + auio.uio_offset = (off_t)(dq->dq_id * sizeof (struct dqblk)); + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_td = (struct thread *)0; + error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); + if (auio.uio_resid && error == 0) + error = EIO; + if (dq->dq_flags & DQ_WANT) + wakeup(dq); + dq->dq_flags &= ~(DQ_MOD|DQ_LOCK|DQ_WANT); + if (vp != dqvp) + VOP_UNLOCK(dqvp, 0, td); + return (error); +} + +/* + * Flush all entries from the cache for a particular vnode. + */ +static void +dqflush(vp) + struct vnode *vp; +{ + struct dquot *dq, *nextdq; + struct dqhash *dqh; + + /* + * Move all dquot's that used to refer to this quota + * file off their hash chains (they will eventually + * fall off the head of the free list and be re-used). 
+ */ + for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { + for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { + nextdq = LIST_NEXT(dq, dq_hash); + if (dq->dq_ump->um_quotas[dq->dq_type] != vp) + continue; + if (dq->dq_cnt) + panic("dqflush: stray dquot"); + LIST_REMOVE(dq, dq_hash); + dq->dq_ump = (struct ufsmount *)0; + } + } +} diff --git a/src/sys/ufs/ufs/ufs_vfsops.c b/src/sys/ufs/ufs/ufs_vfsops.c new file mode 100644 index 0000000..ceab486 --- /dev/null +++ b/src/sys/ufs/ufs/ufs_vfsops.c @@ -0,0 +1,227 @@ +/* + * Copyright (c) 1991, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95
+ */
+
+#include
+__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vfsops.c,v 1.37 2003/06/15 06:36:19 rwatson Exp $");
+
+#include "opt_quota.h"
+#include "opt_ufs.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#ifdef UFS_DIRHASH
+#include
+#include
+#endif
+
+MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
+/*
+ * Make a filesystem operational.
+ * Nothing to do at the moment.
+ *
+ * All three arguments are unused; the function always returns 0.
+ */
+/* ARGSUSED */
+int
+ufs_start(mp, flags, td)
+	struct mount *mp;
+	int flags;
+	struct thread *td;
+{
+
+	return (0);
+}
+
+/*
+ * Return the root of a filesystem.
+ */
+/*
+ * Looks up inode ROOTINO with VFS_VGET(), requesting LK_EXCLUSIVE.
+ * On success *vpp receives the vnode and 0 is returned; on failure the
+ * VFS_VGET() error is returned and *vpp is left untouched.
+ */
+int
+ufs_root(mp, vpp)
+	struct mount *mp;
+	struct vnode **vpp;
+{
+	struct vnode *nvp;
+	int error;
+
+	error = VFS_VGET(mp, (ino_t)ROOTINO, LK_EXCLUSIVE, &nvp);
+	if (error)
+		return (error);
+	*vpp = nvp;
+	return (0);
+}
+
+/*
+ * Do operations associated with quotas
+ *
+ * cmds - sub-command in the bits above SUBCMDSHIFT, quota type in the
+ *        bits selected by SUBCMDMASK.
+ * uid  - id to operate on; -1 selects the caller's real uid.
+ * arg  - command-specific user-space pointer, passed through.
+ *
+ * Returns EOPNOTSUPP when the kernel is built without QUOTA, EINVAL for
+ * an out-of-range type or unknown command, and otherwise the result of
+ * the selected handler.
+ */
+int
+ufs_quotactl(mp, cmds, uid, arg, td)
+	struct mount *mp;
+	int cmds;
+	uid_t uid;
+	caddr_t arg;
+	struct thread *td;
+{
+#ifndef QUOTA
+	return (EOPNOTSUPP);
+#else
+	int cmd, type, error;
+
+	if (uid == -1)
+		uid = td->td_ucred->cr_ruid;
+	cmd = cmds >> SUBCMDSHIFT;
+	type = cmds & SUBCMDMASK;
+	/* The unsigned compare also rejects negative type values. */
+	if ((u_int)type >= MAXQUOTAS)
+		return (EINVAL);
+
+	/*
+	 * NOTE(review): when the mount cannot be busied without waiting,
+	 * the command is silently dropped and success (0) is reported.
+	 * Confirm that callers expect this rather than an error.
+	 */
+	if (vfs_busy(mp, LK_NOWAIT, 0, td))
+		return (0);
+
+	switch (cmd) {
+	case Q_QUOTAON:
+		error = quotaon(td, mp, type, arg);
+		break;
+
+	case Q_QUOTAOFF:
+		error = quotaoff(td, mp, type);
+		break;
+
+	case Q_SETQUOTA:
+		error = setquota(td, mp, uid, type, arg);
+		break;
+
+	case Q_SETUSE:
+		error = setuse(td, mp, uid, type, arg);
+		break;
+
+	case Q_GETQUOTA:
+		error = getquota(td, mp, uid, type, arg);
+		break;
+
+	case Q_SYNC:
+		error = qsync(mp);
+		break;
+
+	default:
+		error = EINVAL;
+		break;
+	}
+	vfs_unbusy(mp, td);
+	return (error);
+#endif
+}
+
+/*
+ * Initialize UFS filesystems, done only once.
+ *
+ * Sets up the inode hash and, when configured, the quota subsystem
+ * (QUOTA) and directory hashing (UFS_DIRHASH).  vfsp is unused.
+ * Always returns 0.
+ */
+int
+ufs_init(vfsp)
+	struct vfsconf *vfsp;
+{
+
+	ufs_ihashinit();
+#ifdef QUOTA
+	dqinit();
+#endif
+#ifdef UFS_DIRHASH
+	ufsdirhash_init();
+#endif
+	return (0);
+}
+
+/*
+ * Uninitialise UFS filesystems, done before module unload.
+ *
+ * Tears down what ufs_init() set up, in the same order.  vfsp is
+ * unused.  Always returns 0.
+ */
+int
+ufs_uninit(vfsp)
+	struct vfsconf *vfsp;
+{
+
+	ufs_ihashuninit();
+#ifdef QUOTA
+	dquninit();
+#endif
+#ifdef UFS_DIRHASH
+	ufsdirhash_uninit();
+#endif
+	return (0);
+}
+
+/*
+ * This is the generic part of fhtovp called after the underlying
+ * filesystem has validated the file handle.
+ *
+ * Call the VFS_CHECKEXP beforehand to verify access.
+ */
+/*
+ * mp   - mount the file handle refers to.
+ * ufhp - file handle; its ufid_ino and ufid_gen fields are used.
+ * vpp  - out: the vnode obtained from VFS_VGET() with LK_EXCLUSIVE, or
+ *        NULLVP on failure.
+ *
+ * Returns the VFS_VGET() error, or ESTALE when the inode is unallocated
+ * (i_mode == 0), its generation number differs from the handle's, or it
+ * has no remaining links (i_effnlink <= 0).  Returns 0 on success.
+ */
+int
+ufs_fhtovp(mp, ufhp, vpp)
+	struct mount *mp;
+	struct ufid *ufhp;
+	struct vnode **vpp;
+{
+	struct inode *ip;
+	struct vnode *nvp;
+	int error;
+
+	error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp);
+	if (error) {
+		*vpp = NULLVP;
+		return (error);
+	}
+	ip = VTOI(nvp);
+	if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen ||
+	    ip->i_effnlink <= 0) {
+		/* Stale handle: release the vnode obtained above. */
+		vput(nvp);
+		*vpp = NULLVP;
+		return (ESTALE);
+	}
+	*vpp = nvp;
+	return (0);
+}
diff --git a/src/sys/ufs/ufs/ufs_vnops.c b/src/sys/ufs/ufs/ufs_vnops.c
new file mode 100644
index 0000000..8061c2b
--- /dev/null
+++ b/src/sys/ufs/ufs/ufs_vnops.c
@@ -0,0 +1,2812 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993, 1995
+ *	The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4.
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95 + */ + +#include +__FBSDID("$FreeBSD: src/sys/ufs/ufs/ufs_vnops.c,v 1.234 2003/10/18 14:10:27 phk Exp $"); + +#include "opt_mac.h" +#include "opt_quota.h" +#include "opt_suiddir.h" +#include "opt_ufs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include /* XXX */ + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef UFS_DIRHASH +#include +#endif + +static int ufs_access(struct vop_access_args *); +static int ufs_advlock(struct vop_advlock_args *); +static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *); +static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *); +static int ufs_close(struct vop_close_args *); +static int ufs_create(struct vop_create_args *); +static int ufs_getattr(struct 
vop_getattr_args *); +static int ufs_link(struct vop_link_args *); +static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *); +static int ufs_mkdir(struct vop_mkdir_args *); +static int ufs_mknod(struct vop_mknod_args *); +static int ufs_open(struct vop_open_args *); +static int ufs_pathconf(struct vop_pathconf_args *); +static int ufs_print(struct vop_print_args *); +static int ufs_readlink(struct vop_readlink_args *); +static int ufs_remove(struct vop_remove_args *); +static int ufs_rename(struct vop_rename_args *); +static int ufs_rmdir(struct vop_rmdir_args *); +static int ufs_setattr(struct vop_setattr_args *); +static int ufs_strategy(struct vop_strategy_args *); +static int ufs_symlink(struct vop_symlink_args *); +static int ufs_whiteout(struct vop_whiteout_args *); +static int ufsfifo_close(struct vop_close_args *); +static int ufsfifo_kqfilter(struct vop_kqfilter_args *); +static int ufsfifo_read(struct vop_read_args *); +static int ufsfifo_write(struct vop_write_args *); +static int ufsspec_close(struct vop_close_args *); +static int ufsspec_read(struct vop_read_args *); +static int ufsspec_write(struct vop_write_args *); +static int filt_ufsread(struct knote *kn, long hint); +static int filt_ufswrite(struct knote *kn, long hint); +static int filt_ufsvnode(struct knote *kn, long hint); +static void filt_ufsdetach(struct knote *kn); +static int ufs_kqfilter(struct vop_kqfilter_args *ap); + +union _qcvt { + int64_t qcvt; + int32_t val[2]; +}; +#define SETHIGH(q, h) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_HIGHWORD] = (h); \ + (q) = tmp.qcvt; \ +} +#define SETLOW(q, l) { \ + union _qcvt tmp; \ + tmp.qcvt = (q); \ + tmp.val[_QUAD_LOWWORD] = (l); \ + (q) = tmp.qcvt; \ +} + +/* + * A virgin directory (no blushing please). + */ +static struct dirtemplate mastertemplate = { + 0, 12, DT_DIR, 1, ".", + 0, DIRBLKSIZ - 12, DT_DIR, 2, ".." 
+}; +static struct odirtemplate omastertemplate = { + 0, 12, 1, ".", + 0, DIRBLKSIZ - 12, 2, ".." +}; + +void +ufs_itimes(vp) + struct vnode *vp; +{ + struct inode *ip; + struct timespec ts; + + ip = VTOI(vp); + if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0) + return; + if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp)) + ip->i_flag |= IN_LAZYMOD; + else + ip->i_flag |= IN_MODIFIED; + if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { + vfs_timestamp(&ts); + if (ip->i_flag & IN_ACCESS) { + DIP(ip, i_atime) = ts.tv_sec; + DIP(ip, i_atimensec) = ts.tv_nsec; + } + if (ip->i_flag & IN_UPDATE) { + DIP(ip, i_mtime) = ts.tv_sec; + DIP(ip, i_mtimensec) = ts.tv_nsec; + ip->i_modrev++; + } + if (ip->i_flag & IN_CHANGE) { + DIP(ip, i_ctime) = ts.tv_sec; + DIP(ip, i_ctimensec) = ts.tv_nsec; + } + } + ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE); +} + +/* + * Create a regular file + */ +static int +ufs_create(ap) + struct vop_create_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + int error; + + error = + ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), + ap->a_dvp, ap->a_vpp, ap->a_cnp); + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + return (0); +} + +/* + * Mknod vnode call + */ +/* ARGSUSED */ +static int +ufs_mknod(ap) + struct vop_mknod_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct vattr *vap = ap->a_vap; + struct vnode **vpp = ap->a_vpp; + struct inode *ip; + ino_t ino; + int error; + + error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), + ap->a_dvp, vpp, ap->a_cnp); + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + ip = VTOI(*vpp); + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; + if (vap->va_rdev != VNOVAL) { + /* + * Want to be able to use this to make badblock + * inodes, so don't truncate the dev 
number. + */ + DIP(ip, i_rdev) = vap->va_rdev; + } + /* + * Remove inode, then reload it through VFS_VGET so it is + * checked to see if it is an alias of an existing entry in + * the inode cache. + */ + vput(*vpp); + (*vpp)->v_type = VNON; + ino = ip->i_number; /* Save this before vgone() invalidates ip. */ + vgone(*vpp); + error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp); + if (error) { + *vpp = NULL; + return (error); + } + return (0); +} + +/* + * Open called. + * + * Nothing to do beyond rejecting non-append write opens of append-only files. + */ +/* ARGSUSED */ +static int +ufs_open(ap) + struct vop_open_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + + /* + * Files marked append-only must be opened for appending. + */ + if ((VTOI(ap->a_vp)->i_flags & APPEND) && + (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) + return (EPERM); + return (0); +} + +/* + * Close called. + * + * Update the times on the inode. + */ +/* ARGSUSED */ +static int +ufs_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct mount *mp; + + VI_LOCK(vp); + if (vp->v_usecount > 1) { + ufs_itimes(vp); + VI_UNLOCK(vp); + } else { + VI_UNLOCK(vp); + /* + * If we are closing the last reference to an unlinked + * file, then it will be freed by the inactive routine. + * Because the freeing causes the filesystem to be + * modified, it must be held up during periods when the + * filesystem is suspended. + * + * XXX - EAGAIN is returned to prevent vn_close from + * repeating the vrele operation. 
+ */ + if (vp->v_type == VREG && VTOI(vp)->i_effnlink == 0) { + (void) vn_start_write(vp, &mp, V_WAIT); + vrele(vp); + vn_finished_write(mp); + return (EAGAIN); + } + } + return (0); +} + +static int +ufs_access(ap) + struct vop_access_args /* { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + mode_t mode = ap->a_mode; + int error; +#ifdef UFS_ACL + struct acl *acl; +#endif + + /* + * Disallow write attempts on read-only filesystems; + * unless the file is a socket, fifo, or a block or + * character device resident on the filesystem. + */ + if (mode & VWRITE) { + switch (vp->v_type) { + case VDIR: + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); +#ifdef QUOTA + if ((error = getinoquota(ip)) != 0) + return (error); +#endif + break; + default: + break; + } + } + + /* If immutable bit set, nobody gets to write it. */ + if ((mode & VWRITE) && (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT))) + return (EPERM); + +#ifdef UFS_ACL + if ((vp->v_mount->mnt_flag & MNT_ACLS) != 0) { + MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK); + error = VOP_GETACL(vp, ACL_TYPE_ACCESS, acl, ap->a_cred, + ap->a_td); + switch (error) { + case EOPNOTSUPP: + error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, + ip->i_gid, ap->a_mode, ap->a_cred, NULL); + break; + case 0: + error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, + ip->i_gid, acl, ap->a_mode, ap->a_cred, NULL); + break; + default: + printf( +"ufs_access(): Error retrieving ACL on object (%d).\n", + error); + /* + * XXX: Fall back until debugged. Should + * eventually possibly log an error, and return + * EPERM for safety. 
+ */ + error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, + ip->i_gid, ap->a_mode, ap->a_cred, NULL); + } + FREE(acl, M_ACL); + } else +#endif /* !UFS_ACL */ + error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid, + ap->a_mode, ap->a_cred, NULL); + return (error); +} + +/* ARGSUSED */ +static int +ufs_getattr(ap) + struct vop_getattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct vattr *vap = ap->a_vap; + + ufs_itimes(vp); + /* + * Copy from inode table + */ + vap->va_fsid = dev2udev(ip->i_dev); + vap->va_fileid = ip->i_number; + vap->va_mode = ip->i_mode & ~IFMT; + vap->va_nlink = ip->i_effnlink; + vap->va_uid = ip->i_uid; + vap->va_gid = ip->i_gid; + if (ip->i_ump->um_fstype == UFS1) { + vap->va_rdev = ip->i_din1->di_rdev; + vap->va_size = ip->i_din1->di_size; + vap->va_atime.tv_sec = ip->i_din1->di_atime; + vap->va_atime.tv_nsec = ip->i_din1->di_atimensec; + vap->va_mtime.tv_sec = ip->i_din1->di_mtime; + vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec; + vap->va_ctime.tv_sec = ip->i_din1->di_ctime; + vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec; + vap->va_birthtime.tv_sec = 0; + vap->va_birthtime.tv_nsec = 0; + vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks); + } else { + vap->va_rdev = ip->i_din2->di_rdev; + vap->va_size = ip->i_din2->di_size; + vap->va_atime.tv_sec = ip->i_din2->di_atime; + vap->va_atime.tv_nsec = ip->i_din2->di_atimensec; + vap->va_mtime.tv_sec = ip->i_din2->di_mtime; + vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec; + vap->va_ctime.tv_sec = ip->i_din2->di_ctime; + vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec; + vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime; + vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec; + vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks); + } + vap->va_flags = ip->i_flags; + vap->va_gen = ip->i_gen; + vap->va_blocksize = 
vp->v_mount->mnt_stat.f_iosize; + vap->va_type = IFTOVT(ip->i_mode); + vap->va_filerev = ip->i_modrev; + return (0); +} + +/* + * Set attribute vnode op. called from several syscalls + */ +static int +ufs_setattr(ap) + struct vop_setattr_args /* { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vattr *vap = ap->a_vap; + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + struct ucred *cred = ap->a_cred; + struct thread *td = ap->a_td; + int error; + + /* + * Check for unsettable attributes. + */ + if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || + (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || + (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || + ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { + return (EINVAL); + } + if (vap->va_flags != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + /* + * Callers may only modify the file flags on objects they + * have VADMIN rights for. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) + return (error); + /* + * Unprivileged processes and privileged processes in + * jail() are not permitted to unset system flags, or + * modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. 
+ */ + if (!suser_cred(cred, PRISON_ROOT)) { + if (ip->i_flags + & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { + error = securelevel_gt(cred, 0); + if (error) + return (error); + } + /* Snapshot flag cannot be set or cleared */ + if (((vap->va_flags & SF_SNAPSHOT) != 0 && + (ip->i_flags & SF_SNAPSHOT) == 0) || + ((vap->va_flags & SF_SNAPSHOT) == 0 && + (ip->i_flags & SF_SNAPSHOT) != 0)) + return (EPERM); + ip->i_flags = vap->va_flags; + DIP(ip, i_flags) = vap->va_flags; + } else { + if (ip->i_flags + & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || + (vap->va_flags & UF_SETTABLE) != vap->va_flags) + return (EPERM); + ip->i_flags &= SF_SETTABLE; + ip->i_flags |= (vap->va_flags & UF_SETTABLE); + DIP(ip, i_flags) = ip->i_flags; + } + ip->i_flag |= IN_CHANGE; + if (vap->va_flags & (IMMUTABLE | APPEND)) + return (0); + } + if (ip->i_flags & (IMMUTABLE | APPEND)) + return (EPERM); + /* + * Go through the fields and update iff not VNOVAL. + */ + if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, + td)) != 0) + return (error); + } + if (vap->va_size != VNOVAL) { + /* + * Disallow write attempts on read-only filesystems; + * unless the file is a socket, fifo, or a block or + * character device resident on the filesystem. + */ + switch (vp->v_type) { + case VDIR: + return (EISDIR); + case VLNK: + case VREG: + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return (EPERM); + break; + default: + break; + } + if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL, + cred, td)) != 0) + return (error); + } + if (vap->va_atime.tv_sec != VNOVAL || + vap->va_mtime.tv_sec != VNOVAL || + vap->va_birthtime.tv_sec != VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0) + return (EPERM); + /* + * From utimes(2): + * If times is NULL, ... 
The caller must be the owner of + * the file, have permission to write the file, or be the + * super-user. + * If times is non-NULL, ... The caller must be the owner of + * the file or be the super-user. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) && + ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || + (error = VOP_ACCESS(vp, VWRITE, cred, td)))) + return (error); + if (vap->va_atime.tv_sec != VNOVAL) + ip->i_flag |= IN_ACCESS; + if (vap->va_mtime.tv_sec != VNOVAL) + ip->i_flag |= IN_CHANGE | IN_UPDATE; + if (vap->va_birthtime.tv_sec != VNOVAL && + ip->i_ump->um_fstype == UFS2) + ip->i_flag |= IN_MODIFIED; + ufs_itimes(vp); + if (vap->va_atime.tv_sec != VNOVAL) { + DIP(ip, i_atime) = vap->va_atime.tv_sec; + DIP(ip, i_atimensec) = vap->va_atime.tv_nsec; + } + if (vap->va_mtime.tv_sec != VNOVAL) { + DIP(ip, i_mtime) = vap->va_mtime.tv_sec; + DIP(ip, i_mtimensec) = vap->va_mtime.tv_nsec; + } + if (vap->va_birthtime.tv_sec != VNOVAL && + ip->i_ump->um_fstype == UFS2) { + ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec; + ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec; + } + error = UFS_UPDATE(vp, 0); + if (error) + return (error); + } + error = 0; + if (vap->va_mode != (mode_t)VNOVAL) { + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return (EROFS); + if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode & + (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH))) + return (EPERM); + error = ufs_chmod(vp, (int)vap->va_mode, cred, td); + } + VN_KNOTE(vp, NOTE_ATTRIB); + return (error); +} + +/* + * Change the mode on a file. + * Inode must be locked before calling. + */ +static int +ufs_chmod(vp, mode, cred, td) + struct vnode *vp; + int mode; + struct ucred *cred; + struct thread *td; +{ + struct inode *ip = VTOI(vp); + int error; + + /* + * To modify the permissions on a file, must possess VADMIN + * for that file. 
+ */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) + return (error); + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the + * process is not a member of. Both of these are allowed in + * jail(8). + */ + if (vp->v_type != VDIR && (mode & S_ISTXT)) { + if (suser_cred(cred, PRISON_ROOT)) + return (EFTYPE); + } + if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) { + error = suser_cred(cred, PRISON_ROOT); + if (error) + return (error); + } + ip->i_mode &= ~ALLPERMS; + ip->i_mode |= (mode & ALLPERMS); + DIP(ip, i_mode) = ip->i_mode; + ip->i_flag |= IN_CHANGE; + return (0); +} + +/* + * Perform chown operation on inode ip; + * inode must be locked prior to call. + */ +static int +ufs_chown(vp, uid, gid, cred, td) + struct vnode *vp; + uid_t uid; + gid_t gid; + struct ucred *cred; + struct thread *td; +{ + struct inode *ip = VTOI(vp); + uid_t ouid; + gid_t ogid; + int error = 0; +#ifdef QUOTA + int i; + ufs2_daddr_t change; +#endif + + if (uid == (uid_t)VNOVAL) + uid = ip->i_uid; + if (gid == (gid_t)VNOVAL) + gid = ip->i_gid; + /* + * To modify the ownership of a file, must possess VADMIN + * for that file. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, td))) + return (error); + /* + * To change the owner of a file, or change the group of a file + * to a group of which we are not a member, the caller must + * have privilege. 
+ */ + if ((uid != ip->i_uid || + (gid != ip->i_gid && !groupmember(gid, cred))) && + (error = suser_cred(cred, PRISON_ROOT))) + return (error); + ogid = ip->i_gid; + ouid = ip->i_uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) != 0) + return (error); + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + change = DIP(ip, i_blocks); + (void) chkdq(ip, -change, cred, CHOWN); + (void) chkiq(ip, -1, cred, CHOWN); + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } +#endif + ip->i_gid = gid; + DIP(ip, i_gid) = gid; + ip->i_uid = uid; + DIP(ip, i_uid) = uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { + if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) + goto good; + else + (void) chkdq(ip, -change, cred, CHOWN|FORCE); + } + for (i = 0; i < MAXQUOTAS; i++) { + dqrele(vp, ip->i_dquot[i]); + ip->i_dquot[i] = NODQUOT; + } + } + ip->i_gid = ogid; + DIP(ip, i_gid) = ogid; + ip->i_uid = ouid; + DIP(ip, i_uid) = ouid; + if (getinoquota(ip) == 0) { + if (ouid == uid) { + dqrele(vp, ip->i_dquot[USRQUOTA]); + ip->i_dquot[USRQUOTA] = NODQUOT; + } + if (ogid == gid) { + dqrele(vp, ip->i_dquot[GRPQUOTA]); + ip->i_dquot[GRPQUOTA] = NODQUOT; + } + (void) chkdq(ip, change, cred, FORCE|CHOWN); + (void) chkiq(ip, 1, cred, FORCE|CHOWN); + (void) getinoquota(ip); + } + return (error); +good: + if (getinoquota(ip)) + panic("ufs_chown: lost quota"); +#endif /* QUOTA */ + ip->i_flag |= IN_CHANGE; + if (suser_cred(cred, PRISON_ROOT) && (ouid != uid || ogid != gid)) { + ip->i_mode &= ~(ISUID | ISGID); + DIP(ip, i_mode) = ip->i_mode; + } + return (0); +} + 
+static int +ufs_remove(ap) + struct vop_remove_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct inode *ip; + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + int error; + + ip = VTOI(vp); + if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (VTOI(dvp)->i_flags & APPEND)) { + error = EPERM; + goto out; + } + error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0); + if (ip->i_nlink <= 0) + vp->v_vflag |= VV_NOSYNC; + VN_KNOTE(vp, NOTE_DELETE); + VN_KNOTE(dvp, NOTE_WRITE); +out: + return (error); +} + +/* + * link vnode call + */ +static int +ufs_link(ap) + struct vop_link_args /* { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vnode *tdvp = ap->a_tdvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip; + struct direct newdir; + int error; + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_link: no name"); +#endif + if (tdvp->v_mount != vp->v_mount) { + error = EXDEV; + goto out; + } + ip = VTOI(vp); + if ((nlink_t)ip->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + if (ip->i_flags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + ip->i_effnlink++; + ip->i_nlink++; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + error = UFS_UPDATE(vp, !(DOINGSOFTDEP(vp) | DOINGASYNC(vp))); + if (!error) { + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL); + } + + if (error) { + ip->i_effnlink--; + ip->i_nlink--; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(vp)) + softdep_change_linkcnt(ip); + } +out: + VN_KNOTE(vp, NOTE_LINK); + VN_KNOTE(tdvp, NOTE_WRITE); + return (error); +} + +/* + * whiteout vnode call + */ +static int +ufs_whiteout(ap) + struct vop_whiteout_args /* { + struct vnode *a_dvp; + struct componentname *a_cnp; 
+ int a_flags; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct direct newdir; + int error = 0; + + switch (ap->a_flags) { + case LOOKUP: + /* 4.4 format directories support whiteout operations */ + if (dvp->v_mount->mnt_maxsymlinklen > 0) + return (0); + return (EOPNOTSUPP); + + case CREATE: + /* create a new directory whiteout */ +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & SAVENAME) == 0) + panic("ufs_whiteout: missing name"); + if (dvp->v_mount->mnt_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + newdir.d_ino = WINO; + newdir.d_namlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, newdir.d_name, (unsigned)cnp->cn_namelen + 1); + newdir.d_type = DT_WHT; + error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL); + break; + + case DELETE: + /* remove an existing directory whiteout */ +#ifdef DIAGNOSTIC + if (dvp->v_mount->mnt_maxsymlinklen <= 0) + panic("ufs_whiteout: old format filesystem"); +#endif + + cnp->cn_flags &= ~DOWHITEOUT; + error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0); + break; + default: + panic("ufs_whiteout: unknown op"); + } + return (error); +} + +/* + * Rename system call. + * rename("foo", "bar"); + * is essentially + * unlink("bar"); + * link("foo", "bar"); + * unlink("foo"); + * but ``atomically''. Can't do full commit without saving state in the + * inode on disk which isn't feasible at this time. Best we can do is + * always guarantee the target exists. + * + * Basic algorithm is: + * + * 1) Bump link count on source while we're linking it to the + * target. This also ensures the inode won't be deleted out + * from underneath us while we work (it may be truncated by + * a concurrent `trunc' or `open' for creation). + * 2) Link source to destination. If destination already exists, + * delete it first. + * 3) Unlink source reference to inode if still around. If a + * directory was moved and the parent of the destination + * is different from the source, patch the ".." 
entry in the + * directory. + */ +static int +ufs_rename(ap) + struct vop_rename_args /* { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; + } */ *ap; +{ + struct vnode *tvp = ap->a_tvp; + struct vnode *tdvp = ap->a_tdvp; + struct vnode *fvp = ap->a_fvp; + struct vnode *fdvp = ap->a_fdvp; + struct componentname *tcnp = ap->a_tcnp; + struct componentname *fcnp = ap->a_fcnp; + struct thread *td = fcnp->cn_thread; + struct inode *ip, *xp, *dp; + struct direct newdir; + int doingdirectory = 0, oldparent = 0, newparent = 0; + int error = 0, ioflag; + +#ifdef DIAGNOSTIC + if ((tcnp->cn_flags & HASBUF) == 0 || + (fcnp->cn_flags & HASBUF) == 0) + panic("ufs_rename: no name"); +#endif + /* + * Check for cross-device rename. + */ + if ((fvp->v_mount != tdvp->v_mount) || + (tvp && (fvp->v_mount != tvp->v_mount))) { + error = EXDEV; +abortit: + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + vrele(fdvp); + vrele(fvp); + return (error); + } + + if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (VTOI(tdvp)->i_flags & APPEND))) { + error = EPERM; + goto abortit; + } + + /* + * Renaming a file to itself has no effect. The upper layers should + * not call us in that case. Temporarily just warn if they do. + */ + if (fvp == tvp) { + printf("ufs_rename: fvp == tvp (can't happen)\n"); + error = 0; + goto abortit; + } + + if ((error = vn_lock(fvp, LK_EXCLUSIVE, td)) != 0) + goto abortit; + dp = VTOI(fdvp); + ip = VTOI(fvp); + if (ip->i_nlink >= LINK_MAX) { + VOP_UNLOCK(fvp, 0, td); + error = EMLINK; + goto abortit; + } + if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) + || (dp->i_flags & APPEND)) { + VOP_UNLOCK(fvp, 0, td); + error = EPERM; + goto abortit; + } + if ((ip->i_mode & IFMT) == IFDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. 
+ */ + if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || + dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || + (ip->i_flag & IN_RENAME)) { + VOP_UNLOCK(fvp, 0, td); + error = EINVAL; + goto abortit; + } + ip->i_flag |= IN_RENAME; + oldparent = dp->i_number; + doingdirectory = 1; + } + VN_KNOTE(fdvp, NOTE_WRITE); /* XXX right place? */ + vrele(fdvp); + + /* + * When the target exists, both the directory + * and target vnodes are returned locked. + */ + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + + /* + * 1) Bump link count while we're moving stuff + * around. If we crash somewhere before + * completing our work, the link count + * may be wrong, but correctable. + */ + ip->i_effnlink++; + ip->i_nlink++; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(ip); + if ((error = UFS_UPDATE(fvp, !(DOINGSOFTDEP(fvp) | + DOINGASYNC(fvp)))) != 0) { + VOP_UNLOCK(fvp, 0, td); + goto bad; + } + + /* + * If ".." must be changed (ie the directory gets a new + * parent) then the source directory must not be in the + * directory hierarchy above the target, as this would + * orphan everything below the source directory. Also + * the user must have write permission in the source so + * as to be able to change "..". We must repeat the call + * to namei, as the parent directory is unlocked by the + * call to checkpath(). 
+ */ + error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread); + VOP_UNLOCK(fvp, 0, td); + if (oldparent != dp->i_number) + newparent = dp->i_number; + if (doingdirectory && newparent) { + if (error) /* write access check above */ + goto bad; + if (xp != NULL) + vput(tvp); + error = ufs_checkpath(ip, dp, tcnp->cn_cred); + if (error) + goto out; + if ((tcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost to startdir"); + VREF(tdvp); + error = relookup(tdvp, &tvp, tcnp); + if (error) + goto out; + vrele(tdvp); + dp = VTOI(tdvp); + xp = NULL; + if (tvp) + xp = VTOI(tvp); + } + /* + * 2) If target doesn't exist, link the target + * to the source and unlink the source. + * Otherwise, rewrite the target directory + * entry to reference the source inode and + * expunge the original entry's existence. + */ + if (xp == NULL) { + if (dp->i_dev != ip->i_dev) + panic("ufs_rename: EXDEV"); + /* + * Account for ".." in new directory. + * When source and destination have the same + * parent we don't fool with the link count. + */ + if (doingdirectory && newparent) { + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto bad; + } + dp->i_effnlink++; + dp->i_nlink++; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(dp); + error = UFS_UPDATE(tdvp, !(DOINGSOFTDEP(tdvp) | + DOINGASYNC(tdvp))); + if (error) + goto bad; + } + ufs_makedirentry(ip, tcnp, &newdir); + error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL); + if (error) { + if (doingdirectory && newparent) { + dp->i_effnlink--; + dp->i_nlink--; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(dp); + (void)UFS_UPDATE(tdvp, 1); + } + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + } else { + if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) + panic("ufs_rename: EXDEV"); + /* + * Short circuit rename(foo, foo). 
+ */ + if (xp->i_number == ip->i_number) + panic("ufs_rename: same file"); + /* + * If the parent directory is "sticky", then the caller + * must possess VADMIN for the parent directory, or the + * destination of the rename. This implements append-only + * directories. + */ + if ((dp->i_mode & S_ISTXT) && + VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) && + VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) { + error = EPERM; + goto bad; + } + /* + * Target must be empty if a directory and have no links + * to it. Also, ensure source and target are compatible + * (both directories, or both not directories). + */ + if ((xp->i_mode&IFMT) == IFDIR) { + if ((xp->i_effnlink > 2) || + !ufs_dirempty(xp, dp->i_number, tcnp->cn_cred)) { + error = ENOTEMPTY; + goto bad; + } + if (!doingdirectory) { + error = ENOTDIR; + goto bad; + } + cache_purge(tdvp); + } else if (doingdirectory) { + error = EISDIR; + goto bad; + } + error = ufs_dirrewrite(dp, xp, ip->i_number, + IFTODT(ip->i_mode), + (doingdirectory && newparent) ? newparent : doingdirectory); + if (error) + goto bad; + if (doingdirectory) { + if (!newparent) { + dp->i_effnlink--; + if (DOINGSOFTDEP(tdvp)) + softdep_change_linkcnt(dp); + } + xp->i_effnlink--; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(xp); + } + if (doingdirectory && !DOINGSOFTDEP(tvp)) { + /* + * Truncate inode. The only stuff left in the directory + * is "." and "..". The "." reference is inconsequential + * since we are quashing it. We have removed the "." + * reference and the reference in the parent directory, + * but there may be other hard links. The soft + * dependency code will arrange to do these operations + * after the parent directory entry has been deleted on + * disk, so when running with that code we avoid doing + * them now. 
+ */ + if (!newparent) { + dp->i_nlink--; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + } + xp->i_nlink--; + DIP(xp, i_nlink) = xp->i_nlink; + xp->i_flag |= IN_CHANGE; + ioflag = IO_NORMAL; + if (DOINGASYNC(tvp)) + ioflag |= IO_SYNC; + if ((error = UFS_TRUNCATE(tvp, (off_t)0, ioflag, + tcnp->cn_cred, tcnp->cn_thread)) != 0) + goto bad; + } + VN_KNOTE(tdvp, NOTE_WRITE); + vput(tdvp); + VN_KNOTE(tvp, NOTE_DELETE); + vput(tvp); + xp = NULL; + } + + /* + * 3) Unlink the source. + */ + fcnp->cn_flags &= ~MODMASK; + fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; + if ((fcnp->cn_flags & SAVESTART) == 0) + panic("ufs_rename: lost from startdir"); + VREF(fdvp); + error = relookup(fdvp, &fvp, fcnp); + if (error == 0) + vrele(fdvp); + if (fvp != NULL) { + xp = VTOI(fvp); + dp = VTOI(fdvp); + } else { + /* + * From name has disappeared. IN_RENAME is not sufficient + * to protect against directory races due to timing windows, + * so we have to remove the panic. XXX the only real way + * to solve this issue is at a much higher level. By the + * time we hit ufs_rename() it's too late. + */ +#if 0 + if (doingdirectory) + panic("ufs_rename: lost dir entry"); +#endif + vrele(ap->a_fvp); + return (0); + } + /* + * Ensure that the directory entry still exists and has not + * changed while the new name has been entered. If the source is + * a file then the entry may have been unlinked or renamed. In + * either case there is no further work to be done. If the source + * is a directory then it cannot have been rmdir'ed; the IN_RENAME + * flag ensures that it cannot be moved by another rename or removed + * by a rmdir. + */ + if (xp != ip) { + /* + * From name resolves to a different inode. IN_RENAME is + * not sufficient protection against timing window races + * so we can't panic here. XXX the only real way + * to solve this issue is at a much higher level. By the + * time we hit ufs_rename() it's too late. 
+ */ +#if 0 + if (doingdirectory) + panic("ufs_rename: lost dir entry"); +#endif + } else { + /* + * If the source is a directory with a + * new parent, the link count of the old + * parent directory must be decremented + * and ".." set to point to the new parent. + */ + if (doingdirectory && newparent) { + xp->i_offset = mastertemplate.dot_reclen; + ufs_dirrewrite(xp, dp, newparent, DT_DIR, 0); + cache_purge(fdvp); + } + error = ufs_dirremove(fdvp, xp, fcnp->cn_flags, 0); + xp->i_flag &= ~IN_RENAME; + } + VN_KNOTE(fvp, NOTE_RENAME); + if (dp) + vput(fdvp); + if (xp) + vput(fvp); + vrele(ap->a_fvp); + return (error); + +bad: + if (xp) + vput(ITOV(xp)); + vput(ITOV(dp)); +out: + if (doingdirectory) + ip->i_flag &= ~IN_RENAME; + if (vn_lock(fvp, LK_EXCLUSIVE, td) == 0) { + ip->i_effnlink--; + ip->i_nlink--; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + ip->i_flag &= ~IN_RENAME; + if (DOINGSOFTDEP(fvp)) + softdep_change_linkcnt(ip); + vput(fvp); + } else + vrele(fvp); + return (error); +} + +/* + * Mkdir system call + */ +static int +ufs_mkdir(ap) + struct vop_mkdir_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + } */ *ap; +{ + struct vnode *dvp = ap->a_dvp; + struct vattr *vap = ap->a_vap; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + struct vnode *tvp; + struct buf *bp; + struct dirtemplate dirtemplate, *dtp; + struct direct newdir; +#ifdef UFS_ACL + struct acl *acl, *dacl; +#endif + int error, dmode; + long blkoff; + +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_mkdir: no name"); +#endif + dp = VTOI(dvp); + if ((nlink_t)dp->i_nlink >= LINK_MAX) { + error = EMLINK; + goto out; + } + dmode = vap->va_mode & 0777; + dmode |= IFDIR; + /* + * Must simulate part of ufs_makeinode here to acquire the inode, + * but not have it entered in the parent directory. The entry is + * made later after writing "." and ".." entries. 
+ */ + error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); + if (error) + goto out; + ip = VTOI(tvp); + ip->i_gid = dp->i_gid; + DIP(ip, i_gid) = dp->i_gid; +#ifdef SUIDDIR + { +#ifdef QUOTA + struct ucred ucred, *ucp; + ucp = cnp->cn_cred; +#endif + /* + * If we are hacking owners here, (only do this where told to) + * and we are not giving it TO root, (would subvert quotas) + * then go ahead and give it to the other user. + * The new directory also inherits the SUID bit. + * If user's UID and dir UID are the same, + * 'give it away' so that the SUID is still forced on. + */ + if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && + (dp->i_mode & ISUID) && dp->i_uid) { + dmode |= ISUID; + ip->i_uid = dp->i_uid; + DIP(ip, i_uid) = dp->i_uid; +#ifdef QUOTA + if (dp->i_uid != cnp->cn_cred->cr_uid) { + /* + * Make sure the correct user gets charged + * for the space. + * Make a dummy credential for the victim. + * XXX This seems to never be accessed out of + * our context so a stack variable is ok. + */ + ucred.cr_ref = 1; + ucred.cr_uid = ip->i_uid; + ucred.cr_ngroups = 1; + ucred.cr_groups[0] = dp->i_gid; + ucp = &ucred; + } +#endif + } else { + ip->i_uid = cnp->cn_cred->cr_uid; + DIP(ip, i_uid) = ip->i_uid; + } +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ucp, 0))) { + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + return (error); + } +#endif + } +#else /* !SUIDDIR */ + ip->i_uid = cnp->cn_cred->cr_uid; + DIP(ip, i_uid) = ip->i_uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + return (error); + } +#endif +#endif /* !SUIDDIR */ + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; +#ifdef UFS_ACL + acl = dacl = NULL; + if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) { + MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK); + MALLOC(dacl, struct acl *, sizeof(*dacl), M_ACL, M_WAITOK); + + /* + * Retrieve default ACL from parent, if any. 
+ */ + error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred, + cnp->cn_thread); + switch (error) { + case 0: + /* + * Retrieved a default ACL, so merge mode and ACL if + * necessary. If the ACL is empty, fall through to + * the "not defined or available" case. + */ + if (acl->acl_cnt != 0) { + dmode = acl_posix1e_newfilemode(dmode, acl); + ip->i_mode = dmode; + DIP(ip, i_mode) = dmode; + *dacl = *acl; + ufs_sync_acl_from_inode(ip, acl); + break; + } + /* FALLTHROUGH */ + + case EOPNOTSUPP: + /* + * Just use the mode as-is. + */ + ip->i_mode = dmode; + DIP(ip, i_mode) = dmode; + FREE(acl, M_ACL); + FREE(dacl, M_ACL); + dacl = acl = NULL; + break; + + default: + UFS_VFREE(tvp, ip->i_number, dmode); + vput(tvp); + FREE(acl, M_ACL); + FREE(dacl, M_ACL); + return (error); + } + } else { +#endif /* !UFS_ACL */ + ip->i_mode = dmode; + DIP(ip, i_mode) = dmode; +#ifdef UFS_ACL + } +#endif + tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ + ip->i_effnlink = 2; + ip->i_nlink = 2; + DIP(ip, i_nlink) = 2; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(ip); + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP(ip, i_flags) = ip->i_flags; + } + + /* + * Bump link count in parent directory to reflect work done below. + * Should be done before reference is created so cleanup is + * possible if we crash. + */ + dp->i_effnlink++; + dp->i_nlink++; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(dvp)) + softdep_change_linkcnt(dp); + error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(dvp) | DOINGASYNC(dvp))); + if (error) + goto bad; /* NOTE(review): when acl != NULL, this exit and the MAC failure exit below appear to skip FREE(acl)/FREE(dacl) -- confirm */ +#ifdef MAC + if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { + error = mac_create_vnode_extattr(cnp->cn_cred, dvp->v_mount, + dvp, tvp, cnp); + if (error) + goto bad; + } +#endif +#ifdef UFS_ACL + if (acl != NULL) { + /* + * XXX: If we abort now, will Soft Updates notify the extattr + * code that the EAs for the file need to be released? 
+ */ + error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred, + cnp->cn_thread); + if (error == 0) + error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, + cnp->cn_cred, cnp->cn_thread); + switch (error) { + case 0: + break; + + case EOPNOTSUPP: + /* + * XXX: This should not happen, as EOPNOTSUPP above + * was supposed to free acl. + */ + printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n"); + /* + panic("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()"); + */ + break; + + default: + FREE(acl, M_ACL); + FREE(dacl, M_ACL); + goto bad; + } + FREE(acl, M_ACL); + FREE(dacl, M_ACL); + } +#endif /* !UFS_ACL */ + + /* + * Initialize directory with "." and ".." from static template. + */ + if (dvp->v_mount->mnt_maxsymlinklen > 0 + ) + dtp = &mastertemplate; + else + dtp = (struct dirtemplate *)&omastertemplate; + dirtemplate = *dtp; + dirtemplate.dot_ino = ip->i_number; + dirtemplate.dotdot_ino = dp->i_number; + if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred, + BA_CLRBUF, &bp)) != 0) + goto bad; + ip->i_size = DIRBLKSIZ; + DIP(ip, i_size) = DIRBLKSIZ; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + vnode_pager_setsize(tvp, (u_long)ip->i_size); + bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate); + if (DOINGSOFTDEP(tvp)) { + /* + * Ensure that the entire newly allocated block is a + * valid directory so that future growth within the + * block does not have to ensure that the block is + * written before the inode. + */ + blkoff = DIRBLKSIZ; + while (blkoff < bp->b_bcount) { + ((struct direct *) + (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ; + blkoff += DIRBLKSIZ; + } + } + if ((error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | + DOINGASYNC(tvp)))) != 0) { + (void)BUF_WRITE(bp); + goto bad; + } + /* + * Directory set up, now install its entry in the parent directory. + * + * If we are not doing soft dependencies, then we must write out the + * buffer containing the new directory body before entering the new + * name in the parent. 
If we are doing soft dependencies, then the + * buffer containing the new directory body will be passed to and + * released in the soft dependency code after the code has attached + * an appropriate ordering dependency to the buffer which ensures that + * the buffer is written before the new name is written in the parent. + */ + if (DOINGASYNC(dvp)) + bdwrite(bp); + else if (!DOINGSOFTDEP(dvp) && ((error = BUF_WRITE(bp)))) + goto bad; + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(dvp, tvp, &newdir, cnp, bp); + +bad: + if (error == 0) { + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + *ap->a_vpp = tvp; + } else { + dp->i_effnlink--; + dp->i_nlink--; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(dvp)) + softdep_change_linkcnt(dp); + /* + * No need to do an explicit VOP_TRUNCATE here, vrele will + * do this for us because we set the link count to 0. + */ + ip->i_effnlink = 0; + ip->i_nlink = 0; + DIP(ip, i_nlink) = 0; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(ip); + vput(tvp); + } +out: + return (error); +} + +/* + * Rmdir system call. + */ +static int +ufs_rmdir(ap) + struct vop_rmdir_args /* { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct vnode *dvp = ap->a_dvp; + struct componentname *cnp = ap->a_cnp; + struct inode *ip, *dp; + int error, ioflag; + + ip = VTOI(vp); + dp = VTOI(dvp); + + /* + * Do not remove a directory that is in the process of being renamed. + * Verify the directory is empty (and valid). Rmdir ".." will not be + * valid since ".." will contain a reference to the current directory + * and thus be non-empty. Do not allow the removal of mounted on + * directories (this can happen when an NFS exported filesystem + * tries to remove a locally mounted on directory). 
+ */ + error = 0; + if (ip->i_flag & IN_RENAME) { + error = EINVAL; + goto out; + } + if (ip->i_effnlink != 2 || + !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { + error = ENOTEMPTY; + goto out; + } + if ((dp->i_flags & APPEND) + || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + if (vp->v_mountedhere != 0) { + error = EINVAL; + goto out; + } + /* + * Delete reference to directory before purging + * inode. If we crash in between, the directory + * will be reattached to lost+found, + */ + dp->i_effnlink--; + ip->i_effnlink--; + if (DOINGSOFTDEP(vp)) { + softdep_change_linkcnt(dp); + softdep_change_linkcnt(ip); + } + error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1); + if (error) { + dp->i_effnlink++; + ip->i_effnlink++; + if (DOINGSOFTDEP(vp)) { + softdep_change_linkcnt(dp); + softdep_change_linkcnt(ip); + } + goto out; + } + VN_KNOTE(dvp, NOTE_WRITE | NOTE_LINK); + cache_purge(dvp); + /* + * Truncate inode. The only stuff left in the directory is "." and + * "..". The "." reference is inconsequential since we are quashing + * it. The soft dependency code will arrange to do these operations + * after the parent directory entry has been deleted on disk, so + * when running with that code we avoid doing them now. + */ + if (!DOINGSOFTDEP(vp)) { + dp->i_nlink--; + DIP(dp, i_nlink) = dp->i_nlink; + dp->i_flag |= IN_CHANGE; + ip->i_nlink--; + DIP(ip, i_nlink) = ip->i_nlink; + ip->i_flag |= IN_CHANGE; + ioflag = IO_NORMAL; + if (DOINGASYNC(vp)) + ioflag |= IO_SYNC; + error = UFS_TRUNCATE(vp, (off_t)0, ioflag, cnp->cn_cred, + cnp->cn_thread); + } + cache_purge(vp); +#ifdef UFS_DIRHASH + /* Kill any active hash; i_effnlink == 0, so it will not come back. 
*/ + if (ip->i_dirhash != NULL) + ufsdirhash_free(ip); +#endif +out: + VN_KNOTE(vp, NOTE_DELETE); + return (error); +} + +/* + * symlink -- make a symbolic link + */ +static int +ufs_symlink(ap) + struct vop_symlink_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; + } */ *ap; +{ + struct vnode *vp, **vpp = ap->a_vpp; + struct inode *ip; + int len, error; + + error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, + vpp, ap->a_cnp); + if (error) + return (error); + VN_KNOTE(ap->a_dvp, NOTE_WRITE); + vp = *vpp; + len = strlen(ap->a_target); + if (len < vp->v_mount->mnt_maxsymlinklen) { + ip = VTOI(vp); + bcopy(ap->a_target, SHORTLINK(ip), len); + ip->i_size = len; + DIP(ip, i_size) = len; + ip->i_flag |= IN_CHANGE | IN_UPDATE; + } else + error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK, + ap->a_cnp->cn_cred, NOCRED, (int *)0, (struct thread *)0); + if (error) + vput(vp); + return (error); +} + +/* + * Vnode op for reading directories. + * + * The routine below assumes that the on-disk format of a directory + * is the same as that defined by . If the on-disk + * format changes, then it will be necessary to do a conversion + * from the on-disk format that read returns to the format defined + * by . + */ +int +ufs_readdir(ap) + struct vop_readdir_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *ncookies; + u_long **a_cookies; + } */ *ap; +{ + struct uio *uio = ap->a_uio; + int error; + size_t count, lost; + off_t off; + + if (ap->a_ncookies != NULL) + /* + * Ensure that the block is aligned. The caller can use + * the cookies to determine where in the block to start. + */ + uio->uio_offset &= ~(DIRBLKSIZ - 1); + off = uio->uio_offset; + count = uio->uio_resid; + /* Make sure we don't return partial entries. 
*/ + if (count <= ((uio->uio_offset + count) & (DIRBLKSIZ -1))) + return (EINVAL); + count -= (uio->uio_offset + count) & (DIRBLKSIZ -1); + lost = uio->uio_resid - count; + uio->uio_resid = count; + uio->uio_iov->iov_len = count; +# if (BYTE_ORDER == LITTLE_ENDIAN) + if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { + error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); + } else { + struct dirent *dp, *edp; + struct uio auio; + struct iovec aiov; + caddr_t dirbuf; + int readcnt; + u_char tmp; + + auio = *uio; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + aiov.iov_len = count; + MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); + aiov.iov_base = dirbuf; + error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); + if (error == 0) { + readcnt = count - auio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { + tmp = dp->d_namlen; + dp->d_namlen = dp->d_type; + dp->d_type = tmp; + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, uio); + } + FREE(dirbuf, M_TEMP); + } +# else + error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); +# endif + if (!error && ap->a_ncookies != NULL) { + struct dirent* dpStart; + struct dirent* dpEnd; + struct dirent* dp; + int ncookies; + u_long *cookies; + u_long *cookiep; + + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + panic("ufs_readdir: unexpected uio from NFS server"); + dpStart = (struct dirent *) + ((char *)uio->uio_iov->iov_base - (uio->uio_offset - off)); + dpEnd = (struct dirent *) uio->uio_iov->iov_base; + for (dp = dpStart, ncookies = 0; + dp < dpEnd; + dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) + ncookies++; + MALLOC(cookies, u_long *, ncookies * sizeof(u_long), M_TEMP, + M_WAITOK); + for (dp = dpStart, cookiep = cookies; + dp < dpEnd; + dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { + off += 
dp->d_reclen; + *cookiep++ = (u_long) off; + } + *ap->a_ncookies = ncookies; + *ap->a_cookies = cookies; + } + uio->uio_resid += lost; + if (ap->a_eofflag) + *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; + return (error); +} + +/* + * Return target name of a symbolic link + */ +static int +ufs_readlink(ap) + struct vop_readlink_args /* { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + doff_t isize; + + isize = ip->i_size; + if ((isize < vp->v_mount->mnt_maxsymlinklen) || + DIP(ip, i_blocks) == 0) { /* XXX - for old fastlink support */ + uiomove(SHORTLINK(ip), isize, ap->a_uio); + return (0); + } + return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); +} + +/* + * Calculate the logical to physical mapping if not done already, + * then call the device strategy routine. + * + * In order to be able to swap to a file, the ufs_bmaparray() operation may not + * deadlock on memory. See ufs_bmap() for details. + */ +static int +ufs_strategy(ap) + struct vop_strategy_args /* { + struct vnode *a_vp; + struct buf *a_bp; + } */ *ap; +{ + struct buf *bp = ap->a_bp; + struct vnode *vp = ap->a_vp; + struct inode *ip; + ufs2_daddr_t blkno; + int error; + + KASSERT(ap->a_vp == ap->a_bp->b_vp, ("%s(%p != %p)", + __func__, ap->a_vp, ap->a_bp->b_vp)); + ip = VTOI(vp); + if (bp->b_blkno == bp->b_lblkno) { + error = ufs_bmaparray(vp, bp->b_lblkno, &blkno, bp, NULL, NULL); + bp->b_blkno = blkno; + if (error) { + bp->b_error = error; + bp->b_ioflags |= BIO_ERROR; + bufdone(bp); + return (error); + } + if ((long)bp->b_blkno == -1) + vfs_bio_clrbuf(bp); + } + if ((long)bp->b_blkno == -1) { + bufdone(bp); + return (0); + } + vp = ip->i_devvp; + bp->b_dev = vp->v_rdev; + bp->b_iooffset = dbtob(bp->b_blkno); + VOP_SPECSTRATEGY(vp, bp); + return (0); +} + +/* + * Print out the contents of an inode. 
+ */ +static int +ufs_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct inode *ip = VTOI(vp); + + printf("\tino %lu, on dev %s (%d, %d)", (u_long)ip->i_number, + devtoname(ip->i_dev), major(ip->i_dev), minor(ip->i_dev)); + if (vp->v_type == VFIFO) + fifo_printinfo(vp); + printf("\n"); + return (0); +} + +/* + * Read wrapper for special devices. + */ +static int +ufsspec_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error, resid; + struct inode *ip; + struct uio *uio; + + uio = ap->a_uio; + resid = uio->uio_resid; + error = VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap); + /* + * The inode may have been revoked during the call, so it must not + * be accessed blindly here or in the other wrapper functions. + */ + ip = VTOI(ap->a_vp); + if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Write wrapper for special devices. + */ +static int +ufsspec_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error, resid; + struct inode *ip; + struct uio *uio; + + uio = ap->a_uio; + resid = uio->uio_resid; + error = VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap); + ip = VTOI(ap->a_vp); + if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) + VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Close wrapper for special devices. + * + * Update the times on the inode then do device close. 
+ */ +static int +ufsspec_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + VI_LOCK(vp); + if (vp->v_usecount > 1) + ufs_itimes(vp); + VI_UNLOCK(vp); + return (VOCALL(spec_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Read wrapper for fifos. + */ +static int +ufsfifo_read(ap) + struct vop_read_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error, resid; + struct inode *ip; + struct uio *uio; + + uio = ap->a_uio; + resid = uio->uio_resid; + error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap); + ip = VTOI(ap->a_vp); + if ((ap->a_vp->v_mount->mnt_flag & MNT_NOATIME) == 0 && ip != NULL && + (uio->uio_resid != resid || (error == 0 && resid != 0))) + ip->i_flag |= IN_ACCESS; + return (error); +} + +/* + * Write wrapper for fifos. + */ +static int +ufsfifo_write(ap) + struct vop_write_args /* { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; + } */ *ap; +{ + int error, resid; + struct inode *ip; + struct uio *uio; + + uio = ap->a_uio; + resid = uio->uio_resid; + error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap); + ip = VTOI(ap->a_vp); + if (ip != NULL && (uio->uio_resid != resid || (error == 0 && resid != 0))) + ip->i_flag |= IN_CHANGE | IN_UPDATE; + return (error); +} + +/* + * Close wrapper for fifos. + * + * Update the times on the inode then do device close. + */ +static int +ufsfifo_close(ap) + struct vop_close_args /* { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + VI_LOCK(vp); + if (vp->v_usecount > 1) + ufs_itimes(vp); + VI_UNLOCK(vp); + return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), ap)); +} + +/* + * Kqfilter wrapper for fifos. 
+ * + * Fall through to ufs kqfilter routines if needed + */ +static int +ufsfifo_kqfilter(ap) + struct vop_kqfilter_args *ap; +{ + int error; + + error = VOCALL(fifo_vnodeop_p, VOFFSET(vop_kqfilter), ap); + if (error) + error = ufs_kqfilter(ap); + return (error); +} + +/* + * Return POSIX pathconf information applicable to ufs filesystems. + */ +static int +ufs_pathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + int error; + + error = 0; + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + break; + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + break; + case _PC_PATH_MAX: + *ap->a_retval = PATH_MAX; + break; + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + break; + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + break; + case _PC_NO_TRUNC: + *ap->a_retval = 1; + break; + case _PC_ACL_EXTENDED: +#ifdef UFS_ACL + if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) + *ap->a_retval = 1; + else + *ap->a_retval = 0; +#else + *ap->a_retval = 0; +#endif + break; + case _PC_ACL_PATH_MAX: +#ifdef UFS_ACL + if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS) + *ap->a_retval = ACL_MAX_ENTRIES; + else + *ap->a_retval = 3; +#else + *ap->a_retval = 3; +#endif + break; + case _PC_MAC_PRESENT: +#ifdef MAC + if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL) + *ap->a_retval = 1; + else + *ap->a_retval = 0; +#else + *ap->a_retval = 0; +#endif + break; + case _PC_ASYNC_IO: + /* _PC_ASYNC_IO should have been handled by upper layers. 
*/ + KASSERT(0, ("_PC_ASYNC_IO should not get here")); + error = EINVAL; + break; + case _PC_PRIO_IO: + *ap->a_retval = 0; + break; + case _PC_SYNC_IO: + *ap->a_retval = 0; + break; + case _PC_ALLOC_SIZE_MIN: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize; + break; + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + break; + case _PC_REC_INCR_XFER_SIZE: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; + break; + case _PC_REC_MAX_XFER_SIZE: + *ap->a_retval = -1; /* means ``unlimited'' */ + break; + case _PC_REC_MIN_XFER_SIZE: + *ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize; + break; + case _PC_REC_XFER_ALIGN: + *ap->a_retval = PAGE_SIZE; + break; + case _PC_SYMLINK_MAX: + *ap->a_retval = MAXPATHLEN; + break; + + default: + error = EINVAL; + break; + } + return (error); +} + +/* + * Advisory record locking support + */ +static int +ufs_advlock(ap) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; +{ + struct inode *ip = VTOI(ap->a_vp); + + return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); +} + +/* + * Initialize the vnode associated with a new inode, handle aliased + * vnodes. + */ +int +ufs_vinit(mntp, specops, fifoops, vpp) + struct mount *mntp; + vop_t **specops; + vop_t **fifoops; + struct vnode **vpp; +{ + struct inode *ip; + struct vnode *vp; + struct timeval tv; + + vp = *vpp; + ip = VTOI(vp); + switch(vp->v_type = IFTOVT(ip->i_mode)) { + case VCHR: + case VBLK: + vp->v_op = specops; + vp = addaliasu(vp, DIP(ip, i_rdev)); + ip->i_vnode = vp; + break; + case VFIFO: + vp->v_op = fifoops; + break; + default: + break; + + } + ASSERT_VOP_LOCKED(vp, "ufs_vinit"); + if (ip->i_number == ROOTINO) + vp->v_vflag |= VV_ROOT; + /* + * Initialize modrev times + */ + getmicrouptime(&tv); + SETHIGH(ip->i_modrev, tv.tv_sec); + SETLOW(ip->i_modrev, tv.tv_usec * 4294); + *vpp = vp; + return (0); +} + +/* + * Allocate a new inode. + * Vnode dvp must be locked. 
+ */ +static int +ufs_makeinode(mode, dvp, vpp, cnp) + int mode; + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + struct inode *ip, *pdir; + struct direct newdir; + struct vnode *tvp; +#ifdef UFS_ACL + struct acl *acl; +#endif + int error; + + pdir = VTOI(dvp); +#ifdef DIAGNOSTIC + if ((cnp->cn_flags & HASBUF) == 0) + panic("ufs_makeinode: no name"); +#endif + *vpp = NULL; + if ((mode & IFMT) == 0) + mode |= IFREG; + + error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp); + if (error) + return (error); + ip = VTOI(tvp); + ip->i_gid = pdir->i_gid; + DIP(ip, i_gid) = pdir->i_gid; +#ifdef SUIDDIR + { +#ifdef QUOTA + struct ucred ucred, *ucp; + ucp = cnp->cn_cred; +#endif + /* + * If we are not the owner of the directory, + * and we are hacking owners here, (only do this where told to) + * and we are not giving it TO root, (would subvert quotas) + * then go ahead and give it to the other user. + * Note that this drops off the execute bits for security. + */ + if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) && + (pdir->i_mode & ISUID) && + (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) { + ip->i_uid = pdir->i_uid; + DIP(ip, i_uid) = ip->i_uid; + mode &= ~07111; +#ifdef QUOTA + /* + * Make sure the correct user gets charged + * for the space. + * Quickly knock up a dummy credential for the victim. + * XXX This seems to never be accessed out of our + * context so a stack variable is ok. 
+ */ + ucred.cr_ref = 1; + ucred.cr_uid = ip->i_uid; + ucred.cr_ngroups = 1; + ucred.cr_groups[0] = pdir->i_gid; + ucp = &ucred; +#endif + } else { + ip->i_uid = cnp->cn_cred->cr_uid; + DIP(ip, i_uid) = ip->i_uid; + } + +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, ucp, 0))) { + UFS_VFREE(tvp, ip->i_number, mode); + vput(tvp); + return (error); + } +#endif + } +#else /* !SUIDDIR */ + ip->i_uid = cnp->cn_cred->cr_uid; + DIP(ip, i_uid) = ip->i_uid; +#ifdef QUOTA + if ((error = getinoquota(ip)) || + (error = chkiq(ip, 1, cnp->cn_cred, 0))) { + UFS_VFREE(tvp, ip->i_number, mode); + vput(tvp); + return (error); + } +#endif +#endif /* !SUIDDIR */ + ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; +#ifdef UFS_ACL + acl = NULL; + if ((dvp->v_mount->mnt_flag & MNT_ACLS) != 0) { + MALLOC(acl, struct acl *, sizeof(*acl), M_ACL, M_WAITOK); + + /* + * Retrieve default ACL for parent, if any. + */ + error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cnp->cn_cred, + cnp->cn_thread); + switch (error) { + case 0: + /* + * Retrieved a default ACL, so merge mode and ACL if + * necessary. + */ + if (acl->acl_cnt != 0) { + /* + * Two possible ways for default ACL to not + * be present. First, the EA can be + * undefined, or second, the default ACL can + * be blank. If it's blank, fall through to + * the it's not defined case. + */ + mode = acl_posix1e_newfilemode(mode, acl); + ip->i_mode = mode; + DIP(ip, i_mode) = mode; + ufs_sync_acl_from_inode(ip, acl); + break; + } + /* FALLTHROUGH */ + + case EOPNOTSUPP: + /* + * Just use the mode as-is. + */ + ip->i_mode = mode; + DIP(ip, i_mode) = mode; + FREE(acl, M_ACL); + acl = NULL; + break; + + default: + UFS_VFREE(tvp, ip->i_number, mode); + vput(tvp); + FREE(acl, M_ACL); + acl = NULL; + return (error); + } + } else { +#endif + ip->i_mode = mode; + DIP(ip, i_mode) = mode; +#ifdef UFS_ACL + } +#endif + tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). 
*/ + ip->i_effnlink = 1; + ip->i_nlink = 1; + DIP(ip, i_nlink) = 1; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(ip); + if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && + suser_cred(cnp->cn_cred, PRISON_ROOT)) { + ip->i_mode &= ~ISGID; + DIP(ip, i_mode) = ip->i_mode; + } + + if (cnp->cn_flags & ISWHITEOUT) { + ip->i_flags |= UF_OPAQUE; + DIP(ip, i_flags) = ip->i_flags; + } + + /* + * Make sure inode goes to disk before directory entry. + */ + error = UFS_UPDATE(tvp, !(DOINGSOFTDEP(tvp) | DOINGASYNC(tvp))); + if (error) + goto bad; +#ifdef MAC + if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) { + error = mac_create_vnode_extattr(cnp->cn_cred, dvp->v_mount, + dvp, tvp, cnp); + if (error) + goto bad; + } +#endif +#ifdef UFS_ACL + if (acl != NULL) { + /* + * XXX: If we abort now, will Soft Updates notify the extattr + * code that the EAs for the file need to be released? + */ + error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cnp->cn_cred, + cnp->cn_thread); + switch (error) { + case 0: + break; + + case EOPNOTSUPP: + /* + * XXX: This should not happen, as EOPNOTSUPP above was + * supposed to free acl. + */ + printf("ufs_makeinode: VOP_GETACL() but no " + "VOP_SETACL()\n"); + /* panic("ufs_makeinode: VOP_GETACL() but no " + "VOP_SETACL()"); */ + break; + + default: + FREE(acl, M_ACL); + goto bad; + } + FREE(acl, M_ACL); + } +#endif /* !UFS_ACL */ + ufs_makedirentry(ip, cnp, &newdir); + error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL); + if (error) + goto bad; + *vpp = tvp; + return (0); + +bad: + /* + * Write error occurred trying to update the inode + * or the directory so must deallocate the inode. 
+ */ + ip->i_effnlink = 0; + ip->i_nlink = 0; + DIP(ip, i_nlink) = 0; + ip->i_flag |= IN_CHANGE; + if (DOINGSOFTDEP(tvp)) + softdep_change_linkcnt(ip); + vput(tvp); + return (error); +} + +static struct filterops ufsread_filtops = + { 1, NULL, filt_ufsdetach, filt_ufsread }; +static struct filterops ufswrite_filtops = + { 1, NULL, filt_ufsdetach, filt_ufswrite }; +static struct filterops ufsvnode_filtops = + { 1, NULL, filt_ufsdetach, filt_ufsvnode }; + +static int +ufs_kqfilter(ap) + struct vop_kqfilter_args /* { + struct vnode *a_vp; + struct knote *a_kn; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &ufsread_filtops; + break; + case EVFILT_WRITE: + kn->kn_fop = &ufswrite_filtops; + break; + case EVFILT_VNODE: + kn->kn_fop = &ufsvnode_filtops; + break; + default: + return (1); + } + + kn->kn_hook = (caddr_t)vp; + + if (vp->v_pollinfo == NULL) + v_addpollinfo(vp); + mtx_lock(&vp->v_pollinfo->vpi_lock); + SLIST_INSERT_HEAD(&vp->v_pollinfo->vpi_selinfo.si_note, kn, kn_selnext); + mtx_unlock(&vp->v_pollinfo->vpi_lock); + + return (0); +} + +static void +filt_ufsdetach(struct knote *kn) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + + KASSERT(vp->v_pollinfo != NULL, ("Mising v_pollinfo")); + mtx_lock(&vp->v_pollinfo->vpi_lock); + SLIST_REMOVE(&vp->v_pollinfo->vpi_selinfo.si_note, + kn, knote, kn_selnext); + mtx_unlock(&vp->v_pollinfo->vpi_lock); +} + +/*ARGSUSED*/ +static int +filt_ufsread(struct knote *kn, long hint) +{ + struct vnode *vp = (struct vnode *)kn->kn_hook; + struct inode *ip = VTOI(vp); + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. 
+ */ + if (hint == NOTE_REVOKE) { + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + return (1); + } + + kn->kn_data = ip->i_size - kn->kn_fp->f_offset; + return (kn->kn_data != 0); +} + +/*ARGSUSED*/ +static int +filt_ufswrite(struct knote *kn, long hint) +{ + + /* + * filesystem is gone, so set the EOF flag and schedule + * the knote for deletion. + */ + if (hint == NOTE_REVOKE) + kn->kn_flags |= (EV_EOF | EV_ONESHOT); + + kn->kn_data = 0; + return (1); +} + +static int +filt_ufsvnode(struct knote *kn, long hint) +{ + + if (kn->kn_sfflags & hint) + kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE) { + kn->kn_flags |= EV_EOF; + return (1); + } + return (kn->kn_fflags != 0); +} + +/* Global vfs data structures for ufs. */ +static vop_t **ufs_vnodeop_p; +static struct vnodeopv_entry_desc ufs_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_defaultop }, + { &vop_fsync_desc, (vop_t *) vop_panic }, + { &vop_read_desc, (vop_t *) vop_panic }, + { &vop_reallocblks_desc, (vop_t *) vop_panic }, + { &vop_write_desc, (vop_t *) vop_panic }, + { &vop_access_desc, (vop_t *) ufs_access }, + { &vop_advlock_desc, (vop_t *) ufs_advlock }, + { &vop_bmap_desc, (vop_t *) ufs_bmap }, + { &vop_cachedlookup_desc, (vop_t *) ufs_lookup }, + { &vop_close_desc, (vop_t *) ufs_close }, + { &vop_create_desc, (vop_t *) ufs_create }, + { &vop_getattr_desc, (vop_t *) ufs_getattr }, + { &vop_inactive_desc, (vop_t *) ufs_inactive }, + { &vop_link_desc, (vop_t *) ufs_link }, + { &vop_lookup_desc, (vop_t *) vfs_cache_lookup }, + { &vop_mkdir_desc, (vop_t *) ufs_mkdir }, + { &vop_mknod_desc, (vop_t *) ufs_mknod }, + { &vop_open_desc, (vop_t *) ufs_open }, + { &vop_pathconf_desc, (vop_t *) ufs_pathconf }, + { &vop_poll_desc, (vop_t *) vop_stdpoll }, + { &vop_kqfilter_desc, (vop_t *) ufs_kqfilter }, + { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, + { &vop_print_desc, (vop_t *) ufs_print }, + { &vop_readdir_desc, (vop_t *) ufs_readdir }, + { &vop_readlink_desc, (vop_t *) ufs_readlink }, + 
{ &vop_reclaim_desc, (vop_t *) ufs_reclaim }, + { &vop_remove_desc, (vop_t *) ufs_remove }, + { &vop_rename_desc, (vop_t *) ufs_rename }, + { &vop_rmdir_desc, (vop_t *) ufs_rmdir }, + { &vop_setattr_desc, (vop_t *) ufs_setattr }, +#ifdef MAC + { &vop_setlabel_desc, (vop_t *) vop_stdsetlabel_ea }, +#endif + { &vop_strategy_desc, (vop_t *) ufs_strategy }, + { &vop_symlink_desc, (vop_t *) ufs_symlink }, + { &vop_whiteout_desc, (vop_t *) ufs_whiteout }, +#ifdef UFS_EXTATTR + { &vop_getextattr_desc, (vop_t *) ufs_getextattr }, + { &vop_deleteextattr_desc, (vop_t *) ufs_deleteextattr }, + { &vop_setextattr_desc, (vop_t *) ufs_setextattr }, +#endif +#ifdef UFS_ACL + { &vop_getacl_desc, (vop_t *) ufs_getacl }, + { &vop_setacl_desc, (vop_t *) ufs_setacl }, + { &vop_aclcheck_desc, (vop_t *) ufs_aclcheck }, +#endif + { NULL, NULL } +}; +static struct vnodeopv_desc ufs_vnodeop_opv_desc = + { &ufs_vnodeop_p, ufs_vnodeop_entries }; + +static vop_t **ufs_specop_p; +static struct vnodeopv_entry_desc ufs_specop_entries[] = { + { &vop_default_desc, (vop_t *) spec_vnoperate }, + { &vop_fsync_desc, (vop_t *) vop_panic }, + { &vop_access_desc, (vop_t *) ufs_access }, + { &vop_close_desc, (vop_t *) ufsspec_close }, + { &vop_getattr_desc, (vop_t *) ufs_getattr }, + { &vop_inactive_desc, (vop_t *) ufs_inactive }, + { &vop_print_desc, (vop_t *) ufs_print }, + { &vop_read_desc, (vop_t *) ufsspec_read }, + { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, + { &vop_setattr_desc, (vop_t *) ufs_setattr }, +#ifdef MAC + { &vop_setlabel_desc, (vop_t *) vop_stdsetlabel_ea }, +#endif + { &vop_write_desc, (vop_t *) ufsspec_write }, +#ifdef UFS_EXTATTR + { &vop_getextattr_desc, (vop_t *) ufs_getextattr }, + { &vop_deleteextattr_desc, (vop_t *) ufs_deleteextattr }, + { &vop_setextattr_desc, (vop_t *) ufs_setextattr }, +#endif +#ifdef UFS_ACL + { &vop_getacl_desc, (vop_t *) ufs_getacl }, + { &vop_setacl_desc, (vop_t *) ufs_setacl }, + { &vop_aclcheck_desc, (vop_t *) ufs_aclcheck }, +#endif + {NULL, NULL} 
+}; +static struct vnodeopv_desc ufs_specop_opv_desc = + { &ufs_specop_p, ufs_specop_entries }; + +static vop_t **ufs_fifoop_p; +static struct vnodeopv_entry_desc ufs_fifoop_entries[] = { + { &vop_default_desc, (vop_t *) fifo_vnoperate }, + { &vop_fsync_desc, (vop_t *) vop_panic }, + { &vop_access_desc, (vop_t *) ufs_access }, + { &vop_close_desc, (vop_t *) ufsfifo_close }, + { &vop_getattr_desc, (vop_t *) ufs_getattr }, + { &vop_inactive_desc, (vop_t *) ufs_inactive }, + { &vop_kqfilter_desc, (vop_t *) ufsfifo_kqfilter }, + { &vop_print_desc, (vop_t *) ufs_print }, + { &vop_read_desc, (vop_t *) ufsfifo_read }, + { &vop_reclaim_desc, (vop_t *) ufs_reclaim }, + { &vop_setattr_desc, (vop_t *) ufs_setattr }, +#ifdef MAC + { &vop_setlabel_desc, (vop_t *) vop_stdsetlabel_ea }, +#endif + { &vop_write_desc, (vop_t *) ufsfifo_write }, +#ifdef UFS_EXTATTR + { &vop_getextattr_desc, (vop_t *) ufs_getextattr }, + { &vop_deleteextattr_desc, (vop_t *) ufs_deleteextattr }, + { &vop_setextattr_desc, (vop_t *) ufs_setextattr }, +#endif +#ifdef UFS_ACL + { &vop_getacl_desc, (vop_t *) ufs_getacl }, + { &vop_setacl_desc, (vop_t *) ufs_setacl }, + { &vop_aclcheck_desc, (vop_t *) ufs_aclcheck }, +#endif + { NULL, NULL } +}; +static struct vnodeopv_desc ufs_fifoop_opv_desc = + { &ufs_fifoop_p, ufs_fifoop_entries }; + +VNODEOP_SET(ufs_vnodeop_opv_desc); +VNODEOP_SET(ufs_specop_opv_desc); +VNODEOP_SET(ufs_fifoop_opv_desc); + +int +ufs_vnoperate(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + } */ *ap; +{ + return (VOCALL(ufs_vnodeop_p, ap->a_desc->vdesc_offset, ap)); +} + +int +ufs_vnoperatefifo(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + } */ *ap; +{ + return (VOCALL(ufs_fifoop_p, ap->a_desc->vdesc_offset, ap)); +} + +int +ufs_vnoperatespec(ap) + struct vop_generic_args /* { + struct vnodeop_desc *a_desc; + } */ *ap; +{ + return (VOCALL(ufs_specop_p, ap->a_desc->vdesc_offset, ap)); +} diff --git a/src/sys/ufs/ufs/ufsmount.h 
b/src/sys/ufs/ufs/ufsmount.h new file mode 100644 index 0000000..f1ff892 --- /dev/null +++ b/src/sys/ufs/ufs/ufsmount.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)ufsmount.h	8.6 (Berkeley) 3/30/95
+ * $FreeBSD: src/sys/ufs/ufs/ufsmount.h,v 1.28 2003/01/07 18:23:50 mckusick Exp $
+ */
+
+#ifndef _UFS_UFS_UFSMOUNT_H_
+#define _UFS_UFS_UFSMOUNT_H_
+
+/*
+ * Arguments to mount UFS-based filesystems
+ */
+struct ufs_args {
+	char	*fspec;			/* block special device to mount */
+	struct	export_args export;	/* network export information */
+};
+
+#ifdef _KERNEL
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_UFSMNT);
+#endif
+
+/* Forward declarations for the types named in this header. */
+struct buf;
+struct inode;
+struct nameidata;
+struct timeval;
+struct ucred;
+struct uio;
+struct vnode;
+struct ufs_extattr_per_mount;
+
+/*
+ * This structure describes the UFS specific mount structure data.
+ *
+ * NOTE(review): um_extattr is embedded by value, so the full definition of
+ * struct ufs_extattr_per_mount must be visible before this header is
+ * included; the forward declaration above is not sufficient on its own.
+ */
+struct ufsmount {
+	struct	mount *um_mountp;		/* filesystem vfs structure */
+	dev_t	um_dev;				/* device mounted */
+	struct	vnode *um_devvp;		/* block device mounted vnode */
+	u_long	um_fstype;			/* type of filesystem */
+	struct	fs *um_fs;			/* pointer to superblock */
+	struct	vnode *um_quotas[MAXQUOTAS];	/* pointer to quota files */
+	struct	ucred *um_cred[MAXQUOTAS];	/* quota file access cred */
+	struct	ufs_extattr_per_mount um_extattr;	/* extended attrs */
+	u_long	um_nindir;			/* indirect ptrs per block */
+	u_long	um_bptrtodb;			/* indir ptr to disk block */
+	u_long	um_seqinc;			/* inc between seq blocks */
+	long	um_numindirdeps;		/* indirdeps for this filesys */
+	time_t	um_btime[MAXQUOTAS];		/* block quota time limit */
+	time_t	um_itime[MAXQUOTAS];		/* inode quota time limit */
+	char	um_qflags[MAXQUOTAS];		/* quota specific flags */
+	int64_t	um_savedmaxfilesize;		/* XXX - limit maxfilesize */
+	/* Per-mount operation hooks; called through the UFS_*() macros below. */
+	int	(*um_balloc)(struct vnode *, off_t, int, struct ucred *, int, struct buf **);
+	int	(*um_blkatoff)(struct vnode *, off_t, char **, struct buf **);
+	int	(*um_truncate)(struct vnode *, off_t, int, struct ucred *, struct thread *);
+	int	(*um_update)(struct vnode *, int);
+	int	(*um_valloc)(struct vnode *, int, struct ucred *, struct vnode **);
+	int	(*um_vfree)(struct vnode *, ino_t, int);
+	void	(*um_ifree)(struct ufsmount *, struct inode *);
+};
+
+/*
+ * Wrappers that call the per-mount hooks stored in struct ufsmount.
+ * Every wrapper except UFS_IFREE takes a vnode as its first argument and
+ * locates the ufsmount through that vnode's v_mount; UFS_IFREE is handed
+ * the ufsmount pointer directly.  Note that the first argument is expanded
+ * twice, so it must not have side effects.
+ */
+#define UFS_BALLOC(aa, bb, cc, dd, ee, ff) \
+	(VFSTOUFS((aa)->v_mount)->um_balloc(aa, bb, cc, dd, ee, ff))
+#define UFS_BLKATOFF(aa, bb, cc, dd) \
+	(VFSTOUFS((aa)->v_mount)->um_blkatoff(aa, bb, cc, dd))
+#define UFS_TRUNCATE(aa, bb, cc, dd, ee) \
+	(VFSTOUFS((aa)->v_mount)->um_truncate(aa, bb, cc, dd, ee))
+#define UFS_UPDATE(aa, bb) \
+	(VFSTOUFS((aa)->v_mount)->um_update(aa, bb))
+#define UFS_VALLOC(aa, bb, cc, dd) \
+	(VFSTOUFS((aa)->v_mount)->um_valloc(aa, bb, cc, dd))
+#define UFS_VFREE(aa, bb, cc) \
+	(VFSTOUFS((aa)->v_mount)->um_vfree(aa, bb, cc))
+#define UFS_IFREE(aa, bb) ((aa)->um_ifree(aa, bb))
+
+/*
+ * Filesystem types
+ */
+#define UFS1	1
+#define UFS2	2
+
+/*
+ * Flags describing the state of quotas.
+ */
+#define	QTF_OPENING	0x01			/* Q_QUOTAON in progress */
+#define	QTF_CLOSING	0x02			/* Q_QUOTAOFF in progress */
+
+/* Convert mount ptr to ufsmount ptr. */
+#define VFSTOUFS(mp)	((struct ufsmount *)((mp)->mnt_data))
+
+/*
+ * Macros to access filesystem parameters in the ufsmount structure.
+ * Used by ufs_bmap.
+ *
+ * is_sequential() is true when b equals a plus the mount's um_seqinc.
+ * The ump argument is parenthesized in every expansion so that any
+ * pointer-valued expression can be passed safely.
+ */
+#define MNINDIR(ump)			((ump)->um_nindir)
+#define	blkptrtodb(ump, b)		((b) << (ump)->um_bptrtodb)
+#define	is_sequential(ump, a, b)	((b) == (a) + (ump)->um_seqinc)
+#endif /* _KERNEL */
+
+#endif /* !_UFS_UFS_UFSMOUNT_H_ */