7
\$\begingroup\$

I required a function to read a file into lines for a text editor, and ended up with two routines. One uses getline(), and the other uses mmap.

Code:

#define _POSIX_C_SOURCE 200'809L // '
#define _XOPEN_SOURCE   700

#include <errno.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

typedef uint64_t timestamp_t;

typedef struct line {
    char *line;
    size_t size;
} Line;

typedef struct file_buf {
    Line *lines;
    uintmax_t capacity;
    uintmax_t count;
} FileBuf;

[[gnu::always_inline, gnu::const]] static inline double mcs_to_secs(double mcs)
{
    return mcs / 1'000'000.0;
}

[[nodiscard, gnu::always_inline, gnu::returns_nonnull, gnu::nonnull]] static
inline void *safe_trim(void *p, size_t n)
{
    void *const cp = realloc(p, n);

    return cp ? cp : p;
}

#ifndef BENCHMARKING
void print_lines(FileBuf fbuf[static 1])
{
    for (size_t i = 0; i < fbuf->count; i++) {
        puts(fbuf->lines[i].line);
    }
}

[[gnu::always_inline, gnu::pure]] static inline size_t get_total_lines(
        FileBuf fbuf[static 1])
{
    return fbuf->count;
}
#endif

void free_lines(FileBuf fbuf[static 1])
{
    for (size_t i = 0; i < fbuf->count; ++i) {
        free(fbuf->lines[i].line);
    }
    free(fbuf->lines);
}

[[nodiscard, gnu::always_inline]] static inline bool resize_fbuf(
        FileBuf fbuf[static 1])
{
    fbuf->capacity += BUFSIZ;
    void *const tmp = realloc(fbuf->lines, fbuf->capacity * sizeof fbuf->lines[0]);
    return tmp == nullptr ? false : (fbuf->lines = tmp), true;
}

[[nodiscard]] bool alloc_and_append_line(FileBuf fbuf[static 1], 
                                         size_t size,
                                         char line[static size])
{
    fbuf->lines[fbuf->count].line = malloc(size + 1);

    if (fbuf->lines[fbuf->count].line == nullptr) {
        return false;
    }

    memcpy(fbuf->lines[fbuf->count].line, line, size);
    fbuf->lines[fbuf->count].line[size] = '\0';
    fbuf->lines[fbuf->count].size = size;
    ++fbuf->count;
    return true;
}

[[nodiscard, gnu::nonnull]] bool readlines2(FILE *restrict stream, 
                                            FileBuf fbuf[restrict static 1])
{
    int fd = fileno(stream);

    if (fd == -1) {
        return false;
    }

    struct stat st;

    if (fstat(fd, &st) == -1) {
        close(fd);
        return false;
    }

    char *const map =
        mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);

    if (map == MAP_FAILED) {
        return false;
    }

    posix_madvise(map, (size_t) st.st_size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);

    char *lhs;
    char *rhs;
    ptrdiff_t size = 0;

    for (lhs = map; lhs < &map[st.st_size]; lhs = rhs + 1) {
        rhs = memchr(lhs, '\n', (size_t) (&map[st.st_size] - lhs));

        if (rhs == nullptr) {
            /* We have reached end-of-file or the file is malformed. */
            break;
        }

        size = rhs - lhs;

        if (size > 0) {
            /* Strip traling newline. */
            --size;
        }

        if (fbuf->capacity <= fbuf->count) {
            if (!resize_fbuf(fbuf)) {
                goto fail;
            }
        }

        if (!alloc_and_append_line(fbuf, (size_t) size, lhs)) {
            goto fail;
        }

    }

    /* Ignore errors on read-only file. */
    munmap(map, (size_t) st.st_size);

    /* Trim to maximum used. */
    fbuf->lines = safe_trim(fbuf->lines, fbuf->count * sizeof fbuf->lines[0]);

    return true;

  fail:
    munmap(map, (size_t) st.st_size);
    free_lines(fbuf);
    return false;
}

[[nodiscard, gnu::nonnull]] bool readlines1(FILE *restrict stream, 
                                            FileBuf fbuf[restrict static 1])
{
    char *line = nullptr;
    size_t capacity = 0;
    ssize_t nread = 0;

    for (;;) {
        nread = getline(&line, &capacity, stream);

        if (nread == -1) {
            if (feof(stream)) {
                /* Input ended due to end-of-file. */
                break;
            }

            if (errno == ENOMEM) {
                free(line);
                free_lines(fbuf);
                return false;
            }
        }

        if (nread > 1 && line[nread - 1] == '\n') {
            line[--nread] = '\0';
        }

        if (fbuf->capacity <= fbuf->count) {
            if (!resize_fbuf(fbuf)) {
                goto cleanup_and_fail;
            }
        }

        if (!alloc_and_append_line(fbuf, (size_t) nread, line)) {
            goto cleanup_and_fail;
        }
    }

    /* Trim to maximum used, */
    free(line);
    fbuf->lines = safe_trim(fbuf->lines, fbuf->count * sizeof fbuf->lines[0]);
    return true;

  cleanup_and_fail:
    free(line);
    free_lines(fbuf);
    return false;
}

[[gnu::always_inline]] static inline timestamp_t get_posix_clock_time_fallback(void)
{
    struct timeval tv;
    return gettimeofday (&tv, NULL) == 0 ? 
        (timestamp_t) (tv.tv_sec * 1'000'000 + tv.tv_usec) : 0;
}

/* Reference: https://stackoverflow.com/a/37920181/20017547 */
[[gnu::always_inline]] static inline timestamp_t get_posix_clock_time(void)
{
#ifdef _POSIX_MONOTONIC_CLOCK
    struct timespec ts;

    if (clock_gettime (CLOCK_MONOTONIC, &ts) == 0) {
        return (timestamp_t) (ts.tv_sec * 1'000'000 + ts.tv_nsec / 1'000);
    } 
    return get_posix_clock_time_fallback();
#else
    return get_posix_clock_time_fallback();
#endif /* _POSIX_MONOTONIC_CLOCK */
}

[[gnu::always_inline, gnu::const]] static inline timestamp_t get_clock_difftime(
        timestamp_t t0, 
        timestamp_t t1)
{
    return t1 - t0;
}

[[gnu::nonnull, gnu::always_inline]] static inline void usage(
        FILE *restrict stream, 
        const char argv0[restrict static 1])
{
    fprintf(stream, "Usage: %s OPTIONS FILE.\n"
                    "OPTION: --mmap, --getline.\n",
                    argv0);
}

int main(int argc, char *argv[])
{
    if (argc != 3) {
        fprintf(stderr, "error: expected 3 arguments.\n");
        usage(stderr, argv[0]);
        return EXIT_FAILURE;
    }

    FileBuf fbuf = {};
    bool (*fn)(FILE *, FileBuf *) = {};

    if (strcmp(argv[1], "--getline") == 0) {
        fn = readlines1;
    } else if (strcmp(argv[1], "--mmap") == 0) {
        fn = readlines2;
    } else {
        fprintf(stderr, "Unknown option: \"%s\".\n", argv[1]);
        usage(stderr, argv[0]);
        return EXIT_FAILURE;
    }

    FILE *const stream = fopen(argv[2], "rb");

    if (stream == nullptr) {
        perror(argv[2]);
        return EXIT_FAILURE;
    }
    
    const timestamp_t t0 = get_posix_clock_time();

    if (!fn(stream, &fbuf)) {
        fprintf(stderr, "error: failed to read file: %s.\n",
            strerror(errno));
        fclose(stream);
        return EXIT_FAILURE;
    }

    const timestamp_t msecs = get_clock_difftime(t0, get_posix_clock_time());
    
#ifndef BENCHMARKING 
    print_lines(&fbuf);
    fprintf(stderr, "Read %zu lines in: %f secs.\n", get_total_lines(&fbuf),
                  mcs_to_secs(msecs));
#else
    printf("%fs", mcs_to_secs((double) msecs));
#endif
    free_lines(&fbuf);
    fclose(stream);
    return EXIT_SUCCESS;
}

Run with:

./read_file --mmap <filename>

or:

./read_file --getline <filename>

Benchmarks:

Generated 9.9 GB of random text data, spanning 20 files. The L: is # of lines and W: is # of words.

F_01: L:65748 W:65000 112M
    1.233720s getline
    0.958801s mmap

F_02: L:927277 W:917225 1.1G
    10.951697s getline
    9.622239s mmap

F_03: L:994391 W:983497 1.7G
    16.703681s getline
    15.208185s mmap

F_04: L:144137 W:142530 116M
    1.176669s getline
    1.003888s mmap

F_05: L:212197 W:209866 390M
    4.157324s getline
    3.921001s mmap

F_06: L:232727 W:230108 357M
    3.651796s getline
    3.090482s mmap

F_07: L:89372 W:88356 65M
    0.649871s getline
    0.570212s mmap

F_08: L:569562 W:563291 550M
    5.024211s getline
    4.530035s mmap

F_09: L:453558 W:448606 675M
    6.238434s getline
    5.787951s mmap

F_10: L:138394 W:136879 243M
    2.458742s getline
    2.214154s mmap

F_11: L:290150 W:285186 22M
    0.300324s getline
    0.259979s mmap

F_12: L:318646 W:315055 527M
    5.417567s getline
    4.438138s mmap

F_13: L:576988 W:570725 1.1G
    9.372791s getline
    9.142827s mmap

F_14: L:783682 W:774984 855M
    8.091869s getline
    7.517599s mmap

F_15: L:814488 W:805405 806M
    7.947366s getline
    8.459972s mmap

F_16: L:249697 W:247063 423M
    4.075813s getline
    3.544945s mmap

F_17: L:586037 W:579086 236M
    2.571151s getline
    2.230638s mmap

F_18: L:407917 W:403336 329M
    3.323630s getline
    3.109054s mmap

F_19: L:592859 W:585774 222M
    2.274685s getline
    2.090911s mmap

F_20: L:233854 W:231300 292M
    2.772515s getline
    2.679925s mmap

I ran these on a Linux virtual machine. Some output from neofetch:

CPU: Intel i5-8350U (2) @ 1.895GHz
Memory: 454MiB / 2933MiB

The caches were cleared before each run with:

sync && echo 3 >| /proc/sys/vm/drop_caches

Makefile:

CC = gcc-13

CFLAGS += -DBENCHMARKING

CFLAGS += -O3
CFLAGS += -std=c2x
CFLAGS += -s
CFLAGS += -no-pie


CFLAGS += -fno-builtin
CFLAGS += -fno-common
CFLAGS += -fno-omit-frame-pointer

CFLAGS += -Wall
CFLAGS += -Wextra
CFLAGS += -Warray-bounds
CFLAGS += -Wconversion
CFLAGS += -Wformat-signedness
CFLAGS += -Wmissing-braces
CFLAGS += -Wno-parentheses
CFLAGS += -Wpedantic
CFLAGS += -Wstrict-prototypes
CFLAGS += -Wwrite-strings

CFLAGS += -Wsuggest-attribute=pure
CFLAGS += -Wsuggest-attribute=const
CFLAGS += -Wsuggest-attribute=noreturn
CFLAGS += -Wsuggest-attribute=cold
CFLAGS += -Wsuggest-attribute=malloc
CFLAGS += -Wsuggest-attribute=format

# CFLAGS += -fsanitize=address
# CFLAGS += -fsanitize=undefined
# CFLAGS += -fsanitize=bounds-strict
# CFLAGS += -fsanitize=leak
# CFLAGS += -fsanitize=null
# CFLAGS += -fsanitize=signed-integer-overflow
# CFLAGS += -fsanitize=bool
# CFLAGS += -fsanitize=pointer-overflow
# CFLAGS += -fsanitize-address-use-after-scope

RM = /bin/rm

TARGET = read_file

all: $(TARGET)

valgrind: $(TARGET)
    valgrind --tool=memcheck --leak-check=yes ./read_file --mmap data/F_01
    valgrind --tool=memcheck --leak-check=yes ./read_file --getline data/F_01

clean:
    $(RM) $(TARGET)

.PHONY: all clean valgrind
.DELETE_ON_ERROR:

The Sanitizer reported nothing awry, and has been commented out due to Valgrind.

Review Request:

Bugs, missing edge cases, better ways to read a file.

All those calls to malloc() and realloc() are surely adding to the total time, but they can be replaced with an arena allocator.

I only care about UNIX-like systems, I am using termios.h and other POSIX-specific header files in the text editor that I doubt would be available on Windows.

I have ignored the return value of munmap() and posix_madvise(). Should I not have?

Aside: One might wonder why I haven't elided void in the argument-list of functions in C23. My answer is that it looks better to me that way.

\$\endgroup\$
6
  • \$\begingroup\$ No, I haven't tested reading the file in chunks with fread(), or using fgets() instead of getline(). \$\endgroup\$
    – Harith
    Commented May 19 at 14:01
  • \$\begingroup\$ Or BSD's fgetln(). \$\endgroup\$
    – Harith
    Commented May 19 at 14:08
  • 1
    \$\begingroup\$ (The syntax decoration doesn't seem ready for ' as a digit separator.) \$\endgroup\$
    – greybeard
    Commented May 19 at 15:53
  • 1
    \$\begingroup\$ Please write an English sentence for readlines1() which explicilty promises that certain post-conditions shall be true, and similarly for readlines2(). We wish for both to offer the same post-conditions. // In readlines2() I really like the "goto fail". In readlines1() I have some reservations. I would like for it to contain exactly one free(line); call, which can be accessed via more than one "fail" label, depending on how much we've done and so must undo. I'm concerned that not every path out of the function will produce the same free'ing behavior. The code could make it more obvious. \$\endgroup\$
    – J_H
    Commented May 19 at 17:08
  • \$\begingroup\$ Do you want the methods to be reviewed against each other? In that case, it seems comparative-review may be a better fit than benchmarking. \$\endgroup\$
    – Mast
    Commented May 19 at 18:53

1 Answer 1

5
\$\begingroup\$

You can use getline() on memory

Instead of reinventing the wheel when you are using mmap(), wouldn't it be nice if you could still use getline() to extract the lines? You can in fact do this by using the POSIX function fmemopen(), which gives you a FILE* associated with a buffer in memory. So then your readlines2() can be written like so:

[[nodiscard, gnu::nonnull]] bool readlines2(FILE *restrict stream, 
                                            FileBuf fbuf[restrict static 1])
{
    int fd = fileno(stream);
    …
    posix_madvise(map, (size_t) st.st_size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);

    FILE *memstream = fmemopen(map, st.st_size, "r");
    bool result = readlines1(memstream, fbuf);
    fclose(memstream);

    munmap(map, st.st_size);
    …
}

Unnecessary copies

Your code makes unnecessary copies of lines, both in readlines1() and readlines2(). In readlines1() each line is first copied into line, then alloc_and_append_line() allocates a buffer and copies that line again. Instead, why not just move the pointer line into the array of lines, and then have getline() allocate a new buffer in the next iteration of the loop?

In readlines2() you also call alloc_and_append_line(), which allocates a buffer and copies the line. But you already have the line mapped in memory, so the only thing you really need is to store the pointer to the start of the line. Of course, there is that issue of the NUL bytes missing, but consider using the MAP_PRIVATE flag so you can write to the mapped memory without affecting the underlying file. This way, you can overwrite the newline characters with NUL bytes. Of course, that will still cause a copy to be made, but this time by the operating system. Alternatively, just make sure functions working on the lines treat \n as the end of the string, or use another way to deduce the length of the line.

Use posix_fadvise() in readline1()

You used posix_madvise() to tell the kernel about your access pattern in readline2(), but you didn't do that in readline1(). You can use posix_fadvise() to do that, which will make the benchmark results more comparable.

When to use what method

You'll have noticed that the performance difference is not very big. If you are just reading lines, then you are more likely to be bottlenecked by I/O than by the CPU. But standard I/O functions are also quite efficient; they read larger buffers at once behind the scenes, and in fact it is perfectly possible for a standard library implementation to actually do a mmap() behind the scenes.

There are benefits and drawbacks for both methods though. Standard I/O functions can work with huge files without needing lots of memory or a large address space (which is less of an issue on 64-bit machines, but is a big problem on 32-bit machines). Also, they work with things that are not regular files, for example you can read lines from stdin, or read from special files (pipes, UNIX sockets, character devices and so on).

mmap() shines when you have lots of small read/write operations to the file, such that even function calls into the standard library are getting expensive, or when you want to avoid making copies of a lot of data.

Missing error checking

In readlines1(), you only handle feof() and ENOMEM, all other errors are ignored. However, there can be lots of other reasons for getline() to fail.

In readlines2() you at least properly handle the return values of all function you call, but there is a problem: trying to read from the mapped memory will still rely on the actual file being read into memory. This is something the kernel will now transparently do for you, but this can still fail in the same ways getline() can fail. This time however, it will cause a page fault. You can catch those in your program, by installing a handler for the SIGSEGV signal, although that won't tell you where the fault happened, so that could also catch other issues causing segmentation faults. On Linux, you can also use userfaultfd() for more fine-grained page fault handling in userspace, but I don't know how suitable that is for handling memory-mapped file read errors.

\$\endgroup\$
13
  • 1
    \$\begingroup\$ I'm quite sure all the errors from read() are also possible. \$\endgroup\$
    – G. Sliepen
    Commented May 19 at 21:53
  • 1
    \$\begingroup\$ "or use another way to deduce the length of the line." ==> memchr() would do as the length is already known. This method also avoids N calls to free(), which would be replaced with a single munmap() at some place in the calling code, but now I'd have to keep track of the base address of the mapped-memory. \$\endgroup\$
    – Harith
    Commented May 19 at 22:26
  • 1
    \$\begingroup\$ You probably want to keep at least the madvise(), as that will tell the kernel to start mapping the pages from the file into the process's address space, whereas with just fadvise() it will readahead and populate the page cache, but not necessarily map those pages into the process, so you might get more page faults in the latter case. \$\endgroup\$
    – G. Sliepen
    Commented May 19 at 23:10
  • 1
    \$\begingroup\$ I might be nitpicking, but exit(EXIT_STATUS} would invoke undefined behaviour according to both ISO C and POSIX standards, because it is async-signal-unsafe. Exit() would be appropriate here, or better yet, reset the signal handler and raise() it again. \$\endgroup\$
    – Harith
    Commented May 19 at 23:41
  • 1
    \$\begingroup\$ Unfortunately, "there is no file descriptor associated with the file stream returned by fmemstream() (i.e., fileno(3) will return an error if called on a returned stream)." As we can't get a file descriptor, calling fadvise() is not possible with fmemstream(). I'd simply ignore the return value and let it fail. :) \$\endgroup\$
    – Harith
    Commented May 20 at 10:32

Not the answer you're looking for? Browse other questions tagged or ask your own question.