I required a function to read a file into lines for a text editor, and ended up with two routines. One uses getline()
, and the other uses mmap
.
Code:
#define _POSIX_C_SOURCE 200'809L // '
#define _XOPEN_SOURCE 700
#include <errno.h>
#include <inttypes.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
typedef uint64_t timestamp_t;
typedef struct line {
char *line;
size_t size;
} Line;
typedef struct file_buf {
Line *lines;
uintmax_t capacity;
uintmax_t count;
} FileBuf;
[[gnu::always_inline, gnu::const]] static inline double mcs_to_secs(double mcs)
{
return mcs / 1'000'000.0;
}
[[nodiscard, gnu::always_inline, gnu::returns_nonnull, gnu::nonnull]] static
inline void *safe_trim(void *p, size_t n)
{
void *const cp = realloc(p, n);
return cp ? cp : p;
}
#ifndef BENCHMARKING
void print_lines(FileBuf fbuf[static 1])
{
for (size_t i = 0; i < fbuf->count; i++) {
puts(fbuf->lines[i].line);
}
}
[[gnu::always_inline, gnu::pure]] static inline size_t get_total_lines(
FileBuf fbuf[static 1])
{
return fbuf->count;
}
#endif
void free_lines(FileBuf fbuf[static 1])
{
for (size_t i = 0; i < fbuf->count; ++i) {
free(fbuf->lines[i].line);
}
free(fbuf->lines);
}
[[nodiscard, gnu::always_inline]] static inline bool resize_fbuf(
FileBuf fbuf[static 1])
{
fbuf->capacity += BUFSIZ;
void *const tmp = realloc(fbuf->lines, fbuf->capacity * sizeof fbuf->lines[0]);
return tmp == nullptr ? false : (fbuf->lines = tmp), true;
}
[[nodiscard]] bool alloc_and_append_line(FileBuf fbuf[static 1],
size_t size,
char line[static size])
{
fbuf->lines[fbuf->count].line = malloc(size + 1);
if (fbuf->lines[fbuf->count].line == nullptr) {
return false;
}
memcpy(fbuf->lines[fbuf->count].line, line, size);
fbuf->lines[fbuf->count].line[size] = '\0';
fbuf->lines[fbuf->count].size = size;
++fbuf->count;
return true;
}
[[nodiscard, gnu::nonnull]] bool readlines2(FILE *restrict stream,
FileBuf fbuf[restrict static 1])
{
int fd = fileno(stream);
if (fd == -1) {
return false;
}
struct stat st;
if (fstat(fd, &st) == -1) {
close(fd);
return false;
}
char *const map =
mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
if (map == MAP_FAILED) {
return false;
}
posix_madvise(map, (size_t) st.st_size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED);
char *lhs;
char *rhs;
ptrdiff_t size = 0;
for (lhs = map; lhs < &map[st.st_size]; lhs = rhs + 1) {
rhs = memchr(lhs, '\n', (size_t) (&map[st.st_size] - lhs));
if (rhs == nullptr) {
/* We have reached end-of-file or the file is malformed. */
break;
}
size = rhs - lhs;
if (size > 0) {
/* Strip traling newline. */
--size;
}
if (fbuf->capacity <= fbuf->count) {
if (!resize_fbuf(fbuf)) {
goto fail;
}
}
if (!alloc_and_append_line(fbuf, (size_t) size, lhs)) {
goto fail;
}
}
/* Ignore errors on read-only file. */
munmap(map, (size_t) st.st_size);
/* Trim to maximum used. */
fbuf->lines = safe_trim(fbuf->lines, fbuf->count * sizeof fbuf->lines[0]);
return true;
fail:
munmap(map, (size_t) st.st_size);
free_lines(fbuf);
return false;
}
[[nodiscard, gnu::nonnull]] bool readlines1(FILE *restrict stream,
FileBuf fbuf[restrict static 1])
{
char *line = nullptr;
size_t capacity = 0;
ssize_t nread = 0;
for (;;) {
nread = getline(&line, &capacity, stream);
if (nread == -1) {
if (feof(stream)) {
/* Input ended due to end-of-file. */
break;
}
if (errno == ENOMEM) {
free(line);
free_lines(fbuf);
return false;
}
}
if (nread > 1 && line[nread - 1] == '\n') {
line[--nread] = '\0';
}
if (fbuf->capacity <= fbuf->count) {
if (!resize_fbuf(fbuf)) {
goto cleanup_and_fail;
}
}
if (!alloc_and_append_line(fbuf, (size_t) nread, line)) {
goto cleanup_and_fail;
}
}
/* Trim to maximum used, */
free(line);
fbuf->lines = safe_trim(fbuf->lines, fbuf->count * sizeof fbuf->lines[0]);
return true;
cleanup_and_fail:
free(line);
free_lines(fbuf);
return false;
}
[[gnu::always_inline]] static inline timestamp_t get_posix_clock_time_fallback(void)
{
struct timeval tv;
return gettimeofday (&tv, NULL) == 0 ?
(timestamp_t) (tv.tv_sec * 1'000'000 + tv.tv_usec) : 0;
}
/* Reference: https://stackoverflow.com/a/37920181/20017547 */
[[gnu::always_inline]] static inline timestamp_t get_posix_clock_time(void)
{
#ifdef _POSIX_MONOTONIC_CLOCK
struct timespec ts;
if (clock_gettime (CLOCK_MONOTONIC, &ts) == 0) {
return (timestamp_t) (ts.tv_sec * 1'000'000 + ts.tv_nsec / 1'000);
}
return get_posix_clock_time_fallback();
#else
return get_posix_clock_time_fallback();
#endif /* _POSIX_MONOTONIC_CLOCK */
}
[[gnu::always_inline, gnu::const]] static inline timestamp_t get_clock_difftime(
timestamp_t t0,
timestamp_t t1)
{
return t1 - t0;
}
[[gnu::nonnull, gnu::always_inline]] static inline void usage(
FILE *restrict stream,
const char argv0[restrict static 1])
{
fprintf(stream, "Usage: %s OPTIONS FILE.\n"
"OPTION: --mmap, --getline.\n",
argv0);
}
int main(int argc, char *argv[])
{
if (argc != 3) {
fprintf(stderr, "error: expected 3 arguments.\n");
usage(stderr, argv[0]);
return EXIT_FAILURE;
}
FileBuf fbuf = {};
bool (*fn)(FILE *, FileBuf *) = {};
if (strcmp(argv[1], "--getline") == 0) {
fn = readlines1;
} else if (strcmp(argv[1], "--mmap") == 0) {
fn = readlines2;
} else {
fprintf(stderr, "Unknown option: \"%s\".\n", argv[1]);
usage(stderr, argv[0]);
return EXIT_FAILURE;
}
FILE *const stream = fopen(argv[2], "rb");
if (stream == nullptr) {
perror(argv[2]);
return EXIT_FAILURE;
}
const timestamp_t t0 = get_posix_clock_time();
if (!fn(stream, &fbuf)) {
fprintf(stderr, "error: failed to read file: %s.\n",
strerror(errno));
fclose(stream);
return EXIT_FAILURE;
}
const timestamp_t msecs = get_clock_difftime(t0, get_posix_clock_time());
#ifndef BENCHMARKING
print_lines(&fbuf);
fprintf(stderr, "Read %zu lines in: %f secs.\n", get_total_lines(&fbuf),
mcs_to_secs(msecs));
#else
printf("%fs", mcs_to_secs((double) msecs));
#endif
free_lines(&fbuf);
fclose(stream);
return EXIT_SUCCESS;
}
Run with:
./read_file --mmap <filename>
or:
./read_file --getline <filename>
Benchmarks:
Generated 9.9 GB of random text data, spanning 20 files. The L:
is # of lines and W:
is # of words.
F_01: L:65748 W:65000 112M
1.233720s getline
0.958801s mmap
F_02: L:927277 W:917225 1.1G
10.951697s getline
9.622239s mmap
F_03: L:994391 W:983497 1.7G
16.703681s getline
15.208185s mmap
F_04: L:144137 W:142530 116M
1.176669s getline
1.003888s mmap
F_05: L:212197 W:209866 390M
4.157324s getline
3.921001s mmap
F_06: L:232727 W:230108 357M
3.651796s getline
3.090482s mmap
F_07: L:89372 W:88356 65M
0.649871s getline
0.570212s mmap
F_08: L:569562 W:563291 550M
5.024211s getline
4.530035s mmap
F_09: L:453558 W:448606 675M
6.238434s getline
5.787951s mmap
F_10: L:138394 W:136879 243M
2.458742s getline
2.214154s mmap
F_11: L:290150 W:285186 22M
0.300324s getline
0.259979s mmap
F_12: L:318646 W:315055 527M
5.417567s getline
4.438138s mmap
F_13: L:576988 W:570725 1.1G
9.372791s getline
9.142827s mmap
F_14: L:783682 W:774984 855M
8.091869s getline
7.517599s mmap
F_15: L:814488 W:805405 806M
7.947366s getline
8.459972s mmap
F_16: L:249697 W:247063 423M
4.075813s getline
3.544945s mmap
F_17: L:586037 W:579086 236M
2.571151s getline
2.230638s mmap
F_18: L:407917 W:403336 329M
3.323630s getline
3.109054s mmap
F_19: L:592859 W:585774 222M
2.274685s getline
2.090911s mmap
F_20: L:233854 W:231300 292M
2.772515s getline
2.679925s mmap
I ran these on a Linux virtual machine. Some output from neofetch:
CPU: Intel i5-8350U (2) @ 1.895GHz
Memory: 454MiB / 2933MiB
The caches were cleared before each run with:
sync && echo 3 >| /proc/sys/vm/drop_caches
Makefile:
CC = gcc-13
CFLAGS += -DBENCHMARKING
CFLAGS += -O3
CFLAGS += -std=c2x
CFLAGS += -s
CFLAGS += -no-pie
CFLAGS += -fno-builtin
CFLAGS += -fno-common
CFLAGS += -fno-omit-frame-pointer
CFLAGS += -Wall
CFLAGS += -Wextra
CFLAGS += -Warray-bounds
CFLAGS += -Wconversion
CFLAGS += -Wformat-signedness
CFLAGS += -Wmissing-braces
CFLAGS += -Wno-parentheses
CFLAGS += -Wpedantic
CFLAGS += -Wstrict-prototypes
CFLAGS += -Wwrite-strings
CFLAGS += -Wsuggest-attribute=pure
CFLAGS += -Wsuggest-attribute=const
CFLAGS += -Wsuggest-attribute=noreturn
CFLAGS += -Wsuggest-attribute=cold
CFLAGS += -Wsuggest-attribute=malloc
CFLAGS += -Wsuggest-attribute=format
# CFLAGS += -fsanitize=address
# CFLAGS += -fsanitize=undefined
# CFLAGS += -fsanitize=bounds-strict
# CFLAGS += -fsanitize=leak
# CFLAGS += -fsanitize=null
# CFLAGS += -fsanitize=signed-integer-overflow
# CFLAGS += -fsanitize=bool
# CFLAGS += -fsanitize=pointer-overflow
# CFLAGS += -fsanitize-address-use-after-scope
RM = /bin/rm
TARGET = read_file
all: $(TARGET)
valgrind: $(TARGET)
valgrind --tool=memcheck --leak-check=yes ./read_file --mmap data/F_01
valgrind --tool=memcheck --leak-check=yes ./read_file --getline data/F_01
clean:
$(RM) $(TARGET)
.PHONY: all clean valgrind
.DELETE_ON_ERROR:
The Sanitizer reported nothing awry, and has been commented out due to Valgrind.
Review Request:
Bugs, missing edge cases, better ways to read a file.
All those calls to malloc()
and realloc()
are surely adding to the total time, but they can be replaced with an arena allocator.
I only care about UNIX-like systems, I am using termios.h
and other POSIX-specific header files in the text editor that I doubt would be available on Windows.
I have ignored the return value of munmap()
and posix_madvise()
. Should I not have?
Aside: One might wonder why I haven't elided void
in the argument-list of functions in C23. My answer is that it looks better to me that way.
fread()
, or usingfgets()
instead ofgetline()
. \$\endgroup\$fgetln()
. \$\endgroup\$'
as a digit separator.) \$\endgroup\$free(line);
call, which can be accessed via more than one "fail" label, depending on how much we've done and so must undo. I'm concerned that not every path out of the function will produce the same free'ing behavior. The code could make it more obvious. \$\endgroup\$