LCOV - coverage.lcov - libbpfilter/core/lock.c

LCOV - code coverage report

Current view:	top level - libbpfilter/core - lock.c (source / functions)		Coverage	Total	Hit
Test:	coverage.lcov	Lines:	83.6 %	122	102
Test Date:	2026-05-28 10:53:42	Functions:	100.0 %	9	9
		Branches:	56.9 %	116	66

             Branch data     Line data    Source code

       1                 :             : /* SPDX-License-Identifier: GPL-2.0-only */
       2                 :             : /*
       3                 :             :  * Copyright (c) Meta Platforms, Inc. and affiliates.
       4                 :             :  */
       5                 :             : 
       6                 :             : /* `renameat2` and `RENAME_NOREPLACE` require _GNU_SOURCE from glibc. */
       7                 :             : #ifndef _GNU_SOURCE
       8                 :             : #define _GNU_SOURCE
       9                 :             : #endif
      10                 :             : 
      11                 :             : #include "core/lock.h"
      12                 :             : 
      13                 :             : #include <dirent.h>
      14                 :             : #include <errno.h>
      15                 :             : #include <fcntl.h>
      16                 :             : #include <stdio.h>
      17                 :             : #include <stdlib.h>
      18                 :             : #include <string.h>
      19                 :             : #include <sys/file.h>
      20                 :             : #include <sys/stat.h>
      21                 :             : #include <sys/types.h>
      22                 :             : #include <time.h>
      23                 :             : #include <unistd.h>
      24                 :             : 
      25                 :             : #include <bpfilter/ctx.h>
      26                 :             : #include <bpfilter/helper.h>
      27                 :             : #include <bpfilter/io.h>
      28                 :             : #include <bpfilter/logger.h>
      29                 :             : 
      30                 :             : #define BF_PERM_755 (S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)
      31                 :             : 
      32                 :             : /** Bounded retry count for the "recheck-after-flock" loop (P1). Each failed
      33                 :             :  * attempt corresponds to a completed `unlink + recreate` by another
      34                 :             :  * `BF_LOCK_WRITE` holder, so this budget is extremely generous in practice. */
      35                 :             : #define BF_LOCK_MAX_RETRIES 8
      36                 :             : 
      37                 :             : /** Bounded retry count for the staging name collision loop. Collisions are
      38                 :             :  * astronomically rare given the random suffix, so a small budget suffices. */
      39                 :             : #define BF_LOCK_STAGING_NAME_RETRIES 4
      40                 :             : 
      41                 :             : /** Number of random bytes pulled from `/dev/urandom` for the staging suffix. */
      42                 :             : #define BF_LOCK_STAGING_RAND_BYTES 8
      43                 :             : 
      44                 :             : /**
      45                 :             :  * @brief Apply an `flock(2)` of the requested mode on `fd`.
      46                 :             :  *
      47                 :             :  * `BF_LOCK_NONE` is a no-op; `BF_LOCK_READ` maps to `LOCK_SH`; `BF_LOCK_WRITE`
      48                 :             :  * maps to `LOCK_EX`. All requests are non-blocking (`LOCK_NB`): contention
      49                 :             :  * returns `-EWOULDBLOCK` immediately rather than waiting.
      50                 :             :  *
      51                 :             :  * @param fd File descriptor to lock.
      52                 :             :  * @param mode Locking mode, see `bf_lock_mode`.
      53                 :             :  * @return 0 on success, or a negative errno value on failure.
      54                 :             :  */
      55                 :       14191 : static int _bf_flock(int fd, enum bf_lock_mode mode)
      56                 :             : {
      57                 :             :     int op = LOCK_NB;
      58                 :             : 
      59         [ +  - ]:       14191 :     if (mode >= _BF_LOCK_MAX)
      60                 :             :         return -EINVAL;
      61                 :             : 
      62         [ +  - ]:       14191 :     if (fd < 0)
      63                 :             :         return -EBADFD;
      64                 :             : 
      65         [ +  + ]:       14191 :     if (mode == BF_LOCK_NONE)
      66                 :             :         return 0;
      67                 :             : 
      68         [ +  + ]:       14187 :     op |= (mode == BF_LOCK_WRITE) ? LOCK_EX : LOCK_SH;
      69                 :             : 
      70         [ +  + ]:       14187 :     if (flock(fd, op) < 0)
      71                 :           4 :         return -errno;
      72                 :             : 
      73                 :             :     return 0;
      74                 :             : }
      75                 :             : 
      76                 :             : /**
      77                 :             :  * @brief Fill `buf` with a unique staging name.
      78                 :             :  *
      79                 :             :  * The name has the form `<prefix><pid>_<hex>` where `<hex>` is a hex-encoded
      80                 :             :  * random suffix pulled from `/dev/urandom`. Uniqueness isn't strictly required
      81                 :             :  * (collisions cause `mkdirat(EEXIST)` and are retried by the caller), but a
      82                 :             :  * unique name avoids any chance of contention on the staging flock.
      83                 :             :  *
      84                 :             :  * If `/dev/urandom` cannot be read, fall back to a name based on `pid` and
      85                 :             :  * `time(NULL)`; the caller's retry loop will paper over the rare collision
      86                 :             :  * that this fallback could produce.
      87                 :             :  *
      88                 :             :  * @param buf Buffer to write the staging name into.
      89                 :             :  * @param size Size of `buf`.
      90                 :             :  */
      91                 :        1311 : static void _bf_make_staging_name(char *buf, size_t size)
      92                 :             : {
      93                 :        1311 :     unsigned char rand[BF_LOCK_STAGING_RAND_BYTES] = {0};
      94                 :             :     char hex[(BF_LOCK_STAGING_RAND_BYTES * 2) + 1];
      95                 :        2622 :     _cleanup_close_ int fd = -1;
      96                 :             :     ssize_t n = -1;
      97                 :             : 
      98                 :        1311 :     fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC);
      99         [ +  - ]:        1311 :     if (fd >= 0)
     100                 :        1311 :         n = read(fd, rand, sizeof(rand));
     101                 :             : 
     102         [ -  + ]:        1311 :     if (n != (ssize_t)sizeof(rand)) {
     103                 :             :         /* Fallback: derive bytes from `time(NULL)` so two processes that
     104                 :             :          * both fail to read /dev/urandom in the same second still differ on
     105                 :             :          * `pid`. Collisions are handled by the caller's retry loop. */
     106                 :           0 :         unsigned long fallback = (unsigned long)time(NULL);
     107         [ #  # ]:           0 :         for (size_t i = 0; i < sizeof(rand); ++i)
     108                 :           0 :             rand[i] = (unsigned char)(fallback >> (i * 8));
     109                 :             :     }
     110                 :             : 
     111         [ +  + ]:       11799 :     for (size_t i = 0; i < sizeof(rand); ++i)
     112                 :       10488 :         (void)snprintf(&hex[i * 2], 3, "%02x", rand[i]);
     113                 :             : 
     114                 :             :     /* bpffs rejects names starting with '.', so the prefix and format must
     115                 :             :      * stick to [a-zA-Z0-9_-]. */
     116                 :        1311 :     (void)snprintf(buf, size, "%s%d_%s", BF_LOCK_STAGING_PREFIX, (int)getpid(),
     117                 :             :                    hex);
     118                 :        1311 : }
     119                 :             : 
     120                 :             : /**
     121                 :             :  * @brief Stage-and-rename primitive (I3).
     122                 :             :  *
     123                 :             :  * Create a uniquely-named staging directory under `pindir_fd`, open it,
     124                 :             :  * acquire an exclusive flock on it, then atomically publish it as `name`
     125                 :             :  * via `renameat2(RENAME_NOREPLACE)`.
     126                 :             :  *
     127                 :             :  * On success, returns a locked file descriptor referring to the inode now
     128                 :             :  * reachable as `<pindir>/<name>`.
     129                 :             :  *
     130                 :             :  * On failure, any state created (staging dir) is cleaned up; no side
     131                 :             :  * effects leak out.
     132                 :             :  * @param pindir_fd File descriptor of the pin directory.
     133                 :             :  * @param name Name of the directory (in the pin directory) to open.
     134                 :             :  * @return The open and locked file descriptor on success, or a negative errno
     135                 :             :  *         value on failure.
     136                 :             :  */
     137                 :        1311 : static int _bf_lock_stage_and_publish(int pindir_fd, const char *name)
     138                 :             : {
     139                 :             :     char staging[NAME_MAX];
     140                 :        1311 :     _cleanup_close_ int fd = -1;
     141                 :             :     int r;
     142                 :             : 
     143                 :             :     assert(name);
     144                 :             : 
     145                 :             :     /* 1. Create a unique staging directory. Retry a small number of times
     146                 :             :      *    if we happen to collide with our own prior staging dirs (should
     147                 :             :      *    be astronomically rare given the random suffix). */
     148                 :        1311 :     for (int attempt = 0; attempt < BF_LOCK_STAGING_NAME_RETRIES; ++attempt) {
     149                 :        1311 :         _bf_make_staging_name(staging, sizeof(staging));
     150         [ -  + ]:        1311 :         if (mkdirat(pindir_fd, staging, BF_PERM_755) == 0)
     151                 :             :             break;
     152         [ #  # ]:           0 :         if (errno != EEXIST) {
     153         [ #  # ]:           0 :             return bf_err_r(-errno,
     154                 :             :                             "failed to create staging dir '%s' under pindir",
     155                 :             :                             staging);
     156                 :             :         }
     157         [ #  # ]:           0 :         if (attempt == BF_LOCK_STAGING_NAME_RETRIES - 1) {
     158         [ #  # ]:           0 :             return bf_err_r(
     159                 :             :                 -EAGAIN,
     160                 :             :                 "failed to generate a unique staging dir name after retries");
     161                 :             :         }
     162                 :             :     }
     163                 :             : 
     164                 :             :     /* 2. Open the staging directory. */
     165                 :        1311 :     fd = openat(pindir_fd, staging, O_DIRECTORY);
     166         [ -  + ]:        1311 :     if (fd < 0) {
     167                 :           0 :         r = -errno;
     168                 :           0 :         (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
     169         [ #  # ]:           0 :         return bf_err_r(r, "failed to open staging dir '%s'", staging);
     170                 :             :     }
     171                 :             : 
     172                 :             :     /* 3. Exclusively lock the staging inode. Cannot contend: we own the
     173                 :             :      *    unique staging name. */
     174                 :        1311 :     r = _bf_flock(fd, BF_LOCK_WRITE);
     175         [ -  + ]:        1311 :     if (r) {
     176                 :           0 :         (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
     177         [ #  # ]:           0 :         return bf_err_r(r, "failed to flock staging dir '%s'", staging);
     178                 :             :     }
     179                 :             : 
     180                 :             :     /* 4. Publish atomically. `RENAME_NOREPLACE` ensures we lose cleanly to
     181                 :             :      *    any concurrent creator that already claimed `name`. */
     182         [ -  + ]:        1311 :     if (renameat2(pindir_fd, staging, pindir_fd, name, RENAME_NOREPLACE) < 0) {
     183                 :           0 :         r = -errno;
     184                 :             :         /* Staging dir was never published; safe to remove (we hold its
     185                 :             :          * flock, nobody else can observe or lock it). */
     186                 :           0 :         (void)unlinkat(pindir_fd, staging, AT_REMOVEDIR);
     187                 :           0 :         return r;
     188                 :             :     }
     189                 :             : 
     190                 :        1311 :     return TAKE_FD(fd);
     191                 :             : }
     192                 :             : 
     193                 :             : /**
     194                 :             :  * @brief Open an existing chain dir with the recheck-after-flock protocol
     195                 :             :  * (P1).
     196                 :             :  *
     197                 :             :  * Between `openat` and `flock`, the name might be unlinked and recreated by
     198                 :             :  * another `BF_LOCK_WRITE` holder, which would leave us holding a lock on an
     199                 :             :  * orphaned inode. Detect this by comparing the inode we locked against the
     200                 :             :  * one currently reachable via the name, and retry on mismatch.
     201                 :             :  *
     202                 :             :  * On success, returns a locked file descriptor whose inode is guaranteed to
     203                 :             :  * be the one currently bound to `name`.
     204                 :             :  * @param pindir_fd File descriptor of the directory containing `name`.
     205                 :             :  * @param name Name of the directory to open and lock.
     206                 :             :  * @param mode Locking mode for `name`.
     207                 :             :  * @return Open and locked file descriptor to `name` in `pindir_fd`, or a
     208                 :             :  *         negative errno value on failure.
     209                 :             :  */
     210                 :        6572 : static int _bf_lock_open_existing(int pindir_fd, const char *name,
     211                 :             :                                   enum bf_lock_mode mode)
     212                 :             : {
     213                 :             :     int r;
     214                 :             : 
     215                 :             :     assert(name);
     216                 :             : 
     217         [ +  - ]:        6572 :     for (int attempt = 0; attempt < BF_LOCK_MAX_RETRIES; ++attempt) {
     218                 :        6572 :         _cleanup_close_ int fd = -1;
     219                 :             :         struct stat open_st;
     220                 :             :         struct stat live_st;
     221                 :             : 
     222                 :        6572 :         fd = openat(pindir_fd, name, O_DIRECTORY);
     223         [ +  + ]:        6572 :         if (fd < 0)
     224                 :         641 :             return -errno;
     225                 :             : 
     226                 :        5931 :         r = _bf_flock(fd, mode);
     227         [ +  - ]:        5931 :         if (r)
     228                 :             :             return r;
     229                 :             : 
     230         [ -  + ]:        5931 :         if (fstat(fd, &open_st) < 0)
     231                 :           0 :             return -errno;
     232                 :             : 
     233         [ -  + ]:        5931 :         if (fstatat(pindir_fd, name, &live_st, AT_SYMLINK_NOFOLLOW) < 0) {
     234                 :             :             /* Name is gone (ENOENT) or inaccessible. Retry. */
     235                 :             :             continue;
     236                 :             :         }
     237                 :             : 
     238         [ +  - ]:        5931 :         if (open_st.st_dev == live_st.st_dev &&
     239         [ +  - ]:        5931 :             open_st.st_ino == live_st.st_ino)
     240                 :        5931 :             return TAKE_FD(fd);
     241                 :             : 
     242                 :             :         /* Mismatch: the name now resolves to a different inode. Our fd is
     243                 :             :          * pinned to the orphaned inode; drop it and retry. The flock is
     244                 :             :          * released when fd is closed. */
     245                 :             :     }
     246                 :             : 
     247         [ #  # ]:           0 :     return bf_err_r(
     248                 :             :         -EAGAIN,
     249                 :             :         "failed to stably open chain '%s' after %d retries; likely extreme contention",
     250                 :             :         name, BF_LOCK_MAX_RETRIES);
     251                 :             : }
     252                 :             : 
     253                 :        6950 : int bf_lock_init(struct bf_lock *lock, enum bf_lock_mode mode)
     254                 :             : {
     255                 :        6950 :     _clean_bf_lock_ struct bf_lock _lock = bf_lock_default();
     256                 :             :     const char *bpffs_path;
     257                 :             :     int r;
     258                 :             : 
     259                 :             :     assert(lock);
     260                 :             : 
     261                 :        6950 :     bpffs_path = bf_ctx_get_bpffs_path();
     262         [ +  + ]:        6950 :     if (!bpffs_path)
     263         [ +  - ]:           1 :         return bf_err_r(-EINVAL, "context is not initialized");
     264                 :             : 
     265                 :        6949 :     _lock.bpffs_fd = bf_opendir(bpffs_path);
     266         [ -  + ]:        6949 :     if (_lock.bpffs_fd < 0) {
     267         [ #  # ]:           0 :         return bf_err_r(_lock.bpffs_fd, "failed to open bpffs at %s",
     268                 :             :                         bpffs_path);
     269                 :             :     }
     270                 :             : 
     271                 :             :     /* Create the pin directory lazily. Per I1, it is never removed by the
     272                 :             :      * library, so subsequent `openat` calls always see the same inode. */
     273                 :        6949 :     _lock.pindir_fd = bf_opendir_at(_lock.bpffs_fd, "bpfilter", true);
     274         [ -  + ]:        6949 :     if (_lock.pindir_fd < 0) {
     275         [ #  # ]:           0 :         return bf_err_r(_lock.pindir_fd,
     276                 :             :                         "failed to open pin directory %s/bpfilter", bpffs_path);
     277                 :             :     }
     278                 :             : 
     279                 :        6949 :     r = _bf_flock(_lock.pindir_fd, mode);
     280         [ +  + ]:        6949 :     if (r)
     281                 :             :         return r;
     282                 :             : 
     283                 :        6945 :     _lock.pindir_lock = mode;
     284                 :             : 
     285                 :        6945 :     bf_swap(*lock, _lock);
     286                 :             : 
     287                 :        6945 :     return 0;
     288                 :             : }
     289                 :             : 
     290                 :        4125 : int bf_lock_init_for_chain(struct bf_lock *lock, const char *name,
     291                 :             :                            enum bf_lock_mode pindir_mode,
     292                 :             :                            enum bf_lock_mode chain_mode, bool create)
     293                 :             : {
     294                 :        4125 :     _clean_bf_lock_ struct bf_lock _lock = bf_lock_default();
     295                 :             :     int r;
     296                 :             : 
     297                 :             :     assert(lock);
     298                 :             :     assert(name);
     299                 :             : 
     300         [ +  + ]:        4125 :     if (create && pindir_mode != BF_LOCK_WRITE) {
     301         [ +  - ]:           1 :         return bf_err_r(
     302                 :             :             -EINVAL,
     303                 :             :             "creating a chain requires BF_LOCK_WRITE on the pin directory");
     304                 :             :     }
     305                 :             : 
     306                 :        4124 :     r = bf_lock_init(&_lock, pindir_mode);
     307         [ +  - ]:        4124 :     if (r)
     308                 :             :         return r;
     309                 :             : 
     310                 :        4124 :     r = bf_lock_acquire_chain(&_lock, name, chain_mode, create);
     311         [ +  + ]:        4124 :     if (r)
     312                 :             :         return r;
     313                 :             : 
     314                 :        4113 :     bf_swap(*lock, _lock);
     315                 :             : 
     316                 :        4113 :     return 0;
     317                 :             : }
     318                 :             : 
     319                 :       18027 : void bf_lock_cleanup(struct bf_lock *lock)
     320                 :             : {
     321                 :             :     assert(lock);
     322                 :             : 
     323                 :             :     // Quick exit if `lock` wasn't initialized
     324         [ +  + ]:       18027 :     if (lock->bpffs_fd < 0)
     325                 :             :         return;
     326                 :             : 
     327                 :        6949 :     bf_lock_release_chain(lock);
     328                 :             : 
     329                 :             :     /* Per I1, do NOT remove the pin directory. It persists for the
     330                 :             :      * lifetime of the bpffs mount. */
     331                 :        6949 :     closep(&lock->pindir_fd);
     332                 :        6949 :     lock->pindir_lock = BF_LOCK_NONE;
     333                 :             : 
     334                 :        6949 :     closep(&lock->bpffs_fd);
     335                 :             : }
     336                 :             : 
     337                 :        7888 : int bf_lock_acquire_chain(struct bf_lock *lock, const char *name,
     338                 :             :                           enum bf_lock_mode mode, bool create)
     339                 :             : {
     340                 :             :     _cleanup_free_ char *_name = NULL;
     341                 :        7888 :     _cleanup_close_ int chain_fd = -1;
     342                 :             : 
     343                 :             :     assert(lock);
     344                 :             :     assert(name);
     345                 :             : 
     346   [ +  +  -  + ]:        7888 :     if (lock->bpffs_fd < 0 || lock->pindir_fd < 0) {
     347         [ +  - ]:           1 :         return bf_err_r(
     348                 :             :             -EBADFD,
     349                 :             :             "attempting to acquire a chain lock on an invalid bf_lock");
     350                 :             :     }
     351                 :             : 
     352         [ +  + ]:        7887 :     if (lock->chain_fd >= 0) {
     353         [ +  - ]:           2 :         return bf_err_r(-EINVAL, "bf_lock already locks chain '%s'",
     354                 :             :                         lock->chain_name);
     355                 :             :     }
     356                 :             : 
     357         [ +  + ]:        7885 :     if (create) {
     358         [ +  + ]:        1313 :         if (mode != BF_LOCK_WRITE) {
     359         [ +  - ]:           2 :             return bf_err_r(
     360                 :             :                 -EINVAL,
     361                 :             :                 "creating a chain requires BF_LOCK_WRITE on the chain directory");
     362                 :             :         }
     363         [ -  + ]:        1311 :         if (lock->pindir_lock != BF_LOCK_WRITE) {
     364         [ #  # ]:           0 :             return bf_err_r(
     365                 :             :                 -EINVAL,
     366                 :             :                 "creating a chain requires BF_LOCK_WRITE on the pin directory");
     367                 :             :         }
     368                 :             :     }
     369                 :             : 
     370                 :        7883 :     _name = strdup(name);
     371         [ +  - ]:        7883 :     if (!_name)
     372                 :             :         return -ENOMEM;
     373                 :             : 
     374         [ +  + ]:        7883 :     if (create) {
     375                 :             :         /* Stage-and-rename (I3). Returns a locked fd already reachable at
     376                 :             :          * the final name. */
     377                 :        1311 :         chain_fd = _bf_lock_stage_and_publish(lock->pindir_fd, _name);
     378         [ +  - ]:        1311 :         if (chain_fd < 0)
     379                 :             :             return chain_fd;
     380                 :             :     } else {
     381                 :             :         /* Recheck-after-flock (P1). Returns a locked fd for the live
     382                 :             :          * inode of `name`, or an error. */
     383                 :        6572 :         chain_fd = _bf_lock_open_existing(lock->pindir_fd, _name, mode);
     384         [ +  + ]:        6572 :         if (chain_fd < 0)
     385                 :             :             return chain_fd;
     386                 :             :     }
     387                 :             : 
     388                 :        7242 :     lock->chain_fd = TAKE_FD(chain_fd);
     389                 :        7242 :     lock->chain_name = TAKE_PTR(_name);
     390                 :        7242 :     lock->chain_lock = mode;
     391                 :             : 
     392                 :        7242 :     return 0;
     393                 :             : }
     394                 :             : 
     395                 :        8809 : void bf_lock_release_chain(struct bf_lock *lock)
     396                 :             : {
     397                 :             :     assert(lock);
     398                 :             : 
     399   [ +  +  -  + ]:        8809 :     if (lock->bpffs_fd < 0 || lock->pindir_fd < 0) {
     400         [ +  - ]:           1 :         bf_warn("attempting to release a chain lock on an invalid bf_lock");
     401                 :             :         return;
     402                 :             :     }
     403                 :             : 
     404         [ +  + ]:        8808 :     if (lock->chain_fd < 0)
     405                 :             :         return;
     406                 :             : 
     407                 :             :     /* Only WRITE locks will be used to create or modify a chain, meaning
     408                 :             :      * if a READ or NONE lock was held on a chain directory, that directory
     409                 :             :      * hasn't been modified during the lifetime of the lock. So there is no
     410                 :             :      * need to attempt to remove it.
     411                 :             :      *
     412                 :             :      * Per I2, this removal is only race-free against concurrent readers if
     413                 :             :      * the caller also holds BF_LOCK_WRITE on the pin directory. The
     414                 :             :      * locking matrix documented in `lock.h` ensures this. */
     415         [ +  + ]:        7242 :     if (lock->chain_lock == BF_LOCK_WRITE)
     416                 :        2606 :         (void)unlinkat(lock->pindir_fd, lock->chain_name, AT_REMOVEDIR);
     417                 :             : 
     418                 :        7242 :     closep(&lock->chain_fd);
     419                 :             : 
     420                 :        7242 :     BF_FREEP(&lock->chain_name);
     421                 :        7242 :     lock->chain_lock = BF_LOCK_NONE;
     422                 :             : }

Generated by: LCOV version 2.0-1