kernel/sys/thread.c

//#include "arch/amd64/hw/timer.h"
#include "arch/amd64/mm/pool.h"
#include "arch/amd64/context.h"
//#include "arch/amd64/mm/map.h"
#include "sys/mem/vmalloc.h"
#include "arch/amd64/cpu.h"
#include "sys/binfmt_elf.h"
#include "sys/sys_proc.h"
#include "sys/mem/phys.h"
#include "user/signum.h"
//#include "net/socket.h"
#include "user/errno.h"
#include "user/fcntl.h"
#include "sys/assert.h"
#include "sys/string.h"
#include "sys/thread.h"
#include "sys/sched.h"
#include "sys/debug.h"
#include "fs/ofile.h"
#include "sys/heap.h"
//#include "fs/vfs.h"
//#include "sys/mm.h"

struct sys_fork_frame {
    uint64_t rdi, rsi, rdx, rcx;
    uint64_t r8, r9, r10, r11;
    uint64_t rbx;
    uint64_t rbp;
    uint64_t r12;
    uint64_t r13;
    uint64_t r14;
    uint64_t r15;
    uint64_t rsp;
    uint64_t rflags;
    uint64_t rip;
};

////

LIST_HEAD(proc_all_head);
static pid_t last_kernel_pid = 0;
static pid_t last_user_pid = 0;
// TODO: MAKE THIS PER-PROCESSOR
static uint64_t fxsave_buf[FXSAVE_REGION / 8] __attribute__((aligned(16)));

void context_save_fpu(struct thread *new, struct thread *old) {
    _assert(old);
    if (old->data.fxsave) {
        asm volatile ("fxsave (%0)"::"r"(fxsave_buf):"memory");
        memcpy(old->data.fxsave, fxsave_buf, FXSAVE_REGION);
        old->flags |= THREAD_FPU_SAVED;
    }

}

void context_restore_fpu(struct thread *new, struct thread *old) {
    _assert(new);
    if (new->flags & THREAD_FPU_SAVED) {
        memcpy(fxsave_buf, new->data.fxsave, FXSAVE_REGION);
        asm volatile ("fxrstor (%0)"::"r"(fxsave_buf):"memory");
        new->flags &= ~THREAD_FPU_SAVED;
    }
}

pid_t process_alloc_pid(int is_user) {
    if (is_user) {
        return ++last_user_pid;
    } else {
        return -(++last_kernel_pid);
    }
}


static void process_ioctx_empty(struct process *proc) {
    memset(&proc->ioctx, 0, sizeof(struct vfs_ioctx));
    memset(proc->fds, 0, sizeof(proc->fds));
}

void process_ioctx_fork(struct process *dst, struct process *src) {
    process_ioctx_empty(dst);

    dst->ioctx.cwd_vnode = src->ioctx.cwd_vnode;
    dst->ioctx.gid = src->ioctx.gid;
    dst->ioctx.uid = src->ioctx.uid;

    for (int i = 0; i < THREAD_MAX_FDS; ++i) {
        if (src->fds[i]) {
            dst->fds[i] = ofile_dup(src->fds[i]);
        }
    }
}


int process_signal_pgid(pid_t pgid, int signum) {
    int ret = 0;

    struct process *proc;
    list_for_each_entry(proc, &proc_all_head, g_link) {
        if (proc->proc_state != PROC_FINISHED && proc->pgid == pgid) {
            process_signal(proc, signum);
            ++ret;
        }
    }

    return ret == 0 ? -1 : ret;
}

struct process *process_find(pid_t pid) {
    struct process *proc;
    list_for_each_entry(proc, &proc_all_head, g_link) {
        if (proc->pid == pid) {
            return proc;
        }
    }

    return NULL;
}

struct process *process_child(struct process *of, pid_t pid) {
    for (struct process *proc = of->first_child; proc; proc = proc->next_child) {
        if (proc->pid == pid) {
            return proc;
        }
    }
    return NULL;
}

void process_unchild(struct process *proc) {
    struct process *par = proc->parent;
    _assert(par);

    struct process *p = NULL;
    struct process *c = par->first_child;
    int found = 0;

    while (c) {
        if (c == proc) {
            found = 1;
            if (p) {
                p->next_child = proc->next_child;
            } else {
                par->first_child = proc->next_child;
            }
            break;
        }

        p = c;
        c = c->next_child;
    }

    _assert(found);
}

void process_cleanup(struct process *proc) {
    // Leave only the system context required for hierachy tracking and error code/pid
    _assert(proc->proc_state == PROC_FINISHED);
    _assert(proc->thread_count == 1);
    proc->flags |= PROC_EMPTY;
    kdebug("Cleaning up %d\n", proc->pid);
    for (size_t i = 0; i < THREAD_MAX_FDS; ++i) {
        if (proc->fds[i]) {
            ofile_close(&proc->ioctx, proc->fds[i]);
            proc->fds[i] = NULL;
        }
    }

    // Release userspace pages
    mm_space_release(proc);
}

void process_free(struct process *proc) {
    // Make sure all the threads of the process have stopped -
    // only main remains
    _assert(proc->thread_count == 1);

    // Sure that no code of this thread will be running anymore -
    // can clean up its stuff
    process_cleanup(proc);

    _assert(proc->proc_state == PROC_FINISHED);
    struct thread *thr = proc->first_thread;
    _assert(thr);

    // Free kstack
    for (size_t i = 0; i < thr->data.rsp0_size / MM_PAGE_SIZE; ++i) {
        mm_phys_free_page(MM_PHYS(i * MM_PAGE_SIZE + thr->data.rsp0_base));
    }

    // Free page directory (if not mm_kernel)
    if (proc->space != mm_kernel) {
        // Make sure we don't shoot a leg off
        uintptr_t cr3;
        asm volatile ("movq %%cr3, %0":"=a"(cr3));
        _assert(MM_VIRTUALIZE(cr3) != (uintptr_t) proc->space);

        mm_space_free(proc);
    }

    // Free thread itself
    memset(thr, 0, sizeof(struct thread));
    kfree(thr);
    // Free the process
    memset(proc, 0, sizeof(struct process));
    kfree(proc);
}

int process_init_thread(struct process *proc, uintptr_t entry, void *arg, int user) {
    list_head_init(&proc->g_link);
    list_head_init(&proc->shm_list);
    thread_wait_io_init(&proc->pid_notify);

    proc->name[0] = 0;
    proc->flags = user ? 0 : THREAD_KERNEL;

    if (user) {
        proc->space = amd64_mm_pool_alloc();
        mm_space_clone(proc->space, mm_kernel, MM_CLONE_FLG_KERNEL);
    } else {
        proc->space = mm_kernel;
    }

    struct thread *main_thread = kmalloc(sizeof(struct thread));
    _assert(main_thread);
    main_thread->proc = proc;

    int res = thread_init(main_thread, entry, arg, user);
    _assert(res == 0);

    process_ioctx_empty(proc);

    proc->first_thread = main_thread;
    proc->thread_count = 1;
    proc->parent = NULL;
    proc->first_child = NULL;
    proc->next_child = NULL;
    proc->pgid = -1;
    proc->pid = -1;

    proc->sigq = 0;
    proc->proc_state = PROC_ACTIVE;

    list_add(&proc->g_link, &proc_all_head);

    return 0;
}

int thread_init(struct thread *thr, uintptr_t entry, void *arg, int user) {
    uintptr_t stack_pages = mm_phys_alloc_contiguous(2); //amd64_phys_alloc_contiguous(2);
    _assert(stack_pages != MM_NADDR);

    thr->data.rsp0_base = MM_VIRTUALIZE(stack_pages);
    thr->data.rsp0_size = MM_PAGE_SIZE * 2;
    thr->data.rsp0_top = thr->data.rsp0_base + thr->data.rsp0_size;
    thr->flags = user ? 0 : THREAD_KERNEL;

    if (user) {
        thr->data.fxsave = kmalloc(FXSAVE_REGION);
        _assert(thr->data.fxsave);
    } else {
        thr->data.fxsave = NULL;
    }

    list_head_init(&thr->wait_head);
    thread_wait_io_init(&thr->sleep_notify);

    uint64_t *stack = (uint64_t *) (thr->data.rsp0_base + thr->data.rsp0_size);

    mm_space_t space = NULL;
    if (thr->proc) {
        space = thr->proc->space;
    } else if (!user) {
        space = mm_kernel;
    }
    _assert(space);
    thr->data.cr3 = MM_PHYS(space);

    if (user) {
        // Allocate thread user stack
        uintptr_t ustack_base = vmalloc(space, 0x1000000, 0xF0000000, 4, MM_PAGE_WRITE | MM_PAGE_USER, PU_PRIVATE);
        thr->data.rsp3_base = ustack_base;
        thr->data.rsp3_size = MM_PAGE_SIZE * 4;
    }

    thr->state = THREAD_READY;

    // Initial thread context
    // Entry context
    if (user) {
        // ss
        *--stack = 0x1B;
        // rsp
        *--stack = thr->data.rsp3_base + thr->data.rsp3_size;
        // rflags
        *--stack = 0x200;
        // cs
        *--stack = 0x23;
        // rip
        *--stack = (uintptr_t) entry;
    } else {
        // ss
        *--stack = 0x10;
        // rsp. Once this context is popped from the stack, stack top is going to be a new
        //      stack pointer for kernel threads
        *--stack = thr->data.rsp0_base + thr->data.rsp0_size;
        // rflags
        *--stack = 0x200;
        // cs
        *--stack = 0x08;
        // rip
        *--stack = (uintptr_t) entry;
    }

    // Caller-saved
    // r11
    *--stack = 0;
    // r10
    *--stack = 0;
    // r9
    *--stack = 0;
    // r8
    *--stack = 0;
    // rcx
    *--stack = 0;
    // rdx
    *--stack = 0;
    // rsi
    *--stack = 0;
    // rdi
    *--stack = (uintptr_t) arg;
    // rax
    *--stack = 0;

    // Small stub so that context switch enters the thread properly
    *--stack = (uintptr_t) context_enter;
    // Callee-saved
    // r15
    *--stack = 0;
    // r14
    *--stack = 0;
    // r13
    *--stack = 0;
    // r12
    *--stack = 0;
    // rbp
    *--stack = 0;
    // rbx
    *--stack = 0;

    // Thread lifecycle:
    // * context_switch_to():
    //   - pops callee-saved registers (initializing them to 0)
    //   - enters context_enter()
    // * context_enter():
    //   - pops caller-saved registers (initializing them to 0 and setting up rdi)
    //   - enters proper execution context via iret
    //  ... Thread is running here until it yields
    // * yield leads to context_switch_to():
    //   - call to yield() automatically (per ABI) stores caller-saved registers
    //   - context_switch_to() pushes callee-saved registers onto current stack
    //   - selects a new thread
    //   - step one

    thr->data.rsp0 = (uintptr_t) stack;

    return 0;
}

// TODO: support kthread forking()
//       (Although I don't really think it's very useful -
//        threads can just be created by thread_init() and
//        sched_queue())
int sys_fork(struct sys_fork_frame *frame) {
    struct thread *src_thread = thread_self;
    _assert(src_thread);
    struct process *src = src_thread->proc;
    _assert(src);

    if (src->thread_count != 1) {
        panic("XXX: fork() a multithreaded process\n");
    }

    struct process *dst = kmalloc(sizeof(struct process));
    _assert(dst);
    struct thread *dst_thread = kmalloc(sizeof(struct thread));
    _assert(dst_thread);

    // Initialize dst process: memory space
    mm_space_t space = amd64_mm_pool_alloc();
    dst->space = space;
    mm_space_fork(dst, src, MM_CLONE_FLG_KERNEL | MM_CLONE_FLG_USER);

    // Initialize dst process state
    list_head_init(&dst->g_link);
    list_head_init(&dst->shm_list);
    thread_wait_io_init(&dst->pid_notify);
    dst->flags = 0;
    strcpy(dst->name, src->name);
    dst->signal_entry = src->signal_entry;
    process_ioctx_fork(dst, src);
    dst->parent = src;
    dst->next_child = src->first_child;
    src->first_child = dst;
    dst->first_child = NULL;
    dst->pid = process_alloc_pid(1);
    dst->pgid = src->pgid;
    dst->sigq = 0;
    dst->first_thread = dst_thread;
    dst->thread_count = 1;
    dst->proc_state = PROC_ACTIVE;

    // Initialize dst thread
    dst_thread->proc = dst;

    uintptr_t stack_pages = mm_phys_alloc_contiguous(2); //amd64_phys_alloc_contiguous(2);
    _assert(stack_pages != MM_NADDR);
    list_head_init(&dst_thread->wait_head);
    thread_wait_io_init(&dst_thread->sleep_notify);

    dst_thread->data.rsp0_base = MM_VIRTUALIZE(stack_pages);
    dst_thread->data.rsp0_size = MM_PAGE_SIZE * 2;
    dst_thread->data.rsp0_top = dst_thread->data.rsp0_base + dst_thread->data.rsp0_size;
    dst_thread->flags = 0;

    dst_thread->data.rsp3_base = src_thread->data.rsp3_base;
    dst_thread->data.rsp3_size = src_thread->data.rsp3_size;

    dst_thread->data.cr3 = MM_PHYS(space);

    dst_thread->data.fxsave = kmalloc(FXSAVE_REGION);
    _assert(dst_thread->data.fxsave);
    _assert(src_thread->data.fxsave);

    if (src_thread->flags & THREAD_FPU_SAVED) {
        memcpy(dst_thread->data.fxsave, src_thread->data.fxsave, FXSAVE_REGION);
    }

    dst_thread->state = THREAD_READY;

    uint64_t *stack = (uint64_t *) dst_thread->data.rsp0_top;

    // Initial thread context
    // Entry context
    // ss
    *--stack = 0x1B;
    // rsp
    *--stack = frame->rsp;
    // rflags
    _assert(frame->rflags & 0x200);
    *--stack = frame->rflags;
    // cs
    *--stack = 0x23;
    // rip
    *--stack = frame->rip;

    // Caller-saved
    // r11
    *--stack = frame->r11;
    // r10
    *--stack = frame->r10;
    // r9
    *--stack = frame->r9;
    // r8
    *--stack = frame->r8;
    // rcx
    *--stack = frame->rcx;
    // rdx
    *--stack = frame->rdx;
    // rsi
    *--stack = frame->rsi;
    // rdi
    *--stack = frame->rdi;
    // rax
    *--stack = 0;

    // Small stub so that context switch enters the thread properly
    *--stack = (uintptr_t) context_enter;
    // Callee-saved
    // r15
    *--stack = frame->r15;
    // r14
    *--stack = frame->r14;
    // r13
    *--stack = frame->r13;
    // r12
    *--stack = frame->r12;
    // rbp
    *--stack = frame->rbp;
    // rbx
    *--stack = frame->rbx;

    dst_thread->data.rsp0 = (uintptr_t) stack;

    list_add(&dst->g_link, &proc_all_head);
    sched_queue(dst->first_thread);

    return dst->pid;
}

void thread_sigenter(int signum) {
    if (signum == SIGCHLD) {
        kdebug("Skipping SIGCHLD\n");
        return;
    }
    struct thread *thr = thread_self;
    kdebug("%d: Handle signal %d\n", thr->proc->pid, signum);
    uintptr_t old_rsp0_top = thr->data.rsp0_top;
    // XXX: Either use a separate stack or ensure stuff doesn't get overwritten
    uintptr_t signal_rsp3 = thr->data.rsp3_base + 0x800;

    context_sigenter(thr->proc->signal_entry, signal_rsp3, signum);

    thr->data.rsp0_top = old_rsp0_top;
}

__attribute__((noreturn)) void sys_exit(int status) {
    struct thread *thr = thread_self;
    struct process *proc = thr->proc;

    if (proc->thread_count != 1) {
        panic("XXX: exit() in multithread process\n");
    }
    kdebug("Process %d exited with status %d\n", proc->pid, status);

    // Clear pending I/O (if exiting from signal interrupting select())
    if (!list_empty(&thr->wait_head)) {
        thread_wait_io_clear(thr);
    }

    proc->exit_status = status;

    // Notify waitpid()ers
    if (proc->pid_notify.owner) {
        thread_notify_io(&proc->pid_notify);
    }

    proc->proc_state = PROC_FINISHED;
    sched_unqueue(thr, THREAD_STOPPED);
    panic("This code shouldn't run\n");
}

int sys_waitpid(pid_t pid, int *status) {
    struct thread *thr = thread_self;
    _assert(thr);
    struct process *proc_self = thr->proc;
    _assert(proc_self);
    struct process *chld = process_child(proc_self, pid);
    int res;

    if (!chld) {
        return -ECHILD;
    }

    while (chld->proc_state != PROC_FINISHED) {
        res = thread_wait_io(thr, &chld->pid_notify);

        if (res < 0) {
            // Likely interrupted
            return res;
        }

        // State should already be "stopped" when notify is signalled
        _assert(chld->proc_state == PROC_FINISHED);
        break;
    }

    if (status) {
        *status = chld->exit_status;
    }

    // TODO: automatically cleanup threads which don't have
    //       a parent like PID 1
    process_unchild(chld);
    list_del(&chld->g_link);
    process_free(chld);

    return 0;
}

void sys_sigreturn(void) {
    context_sigreturn();
}

void process_signal(struct process *proc, int signum) {
    // First thread processes all signals
    // TODO: find first non-finished thread
    struct thread *thr = proc->first_thread;
    _assert(thr);

    if (thr->sleep_notify.owner) {
        thread_notify_io(&thr->sleep_notify);
        //thr->sleep_notify.owner = NULL;
        //timer_remove_sleep(thr);
    }

    if (thr->cpu == (int) get_cpu()->processor_id) {
        if (thr == thread_self) {
            kdebug("Signal will be handled now\n");
            thread_sigenter(signum);
        } else {
            kdebug("Signal will be handled later\n");
            process_signal_set(proc, signum);

            sched_queue(thr);
        }
    } else if (thr->cpu >= 0) {
        kdebug("Signal will be handled later (other cpu%d)\n", thr->cpu);
        process_signal_set(proc, signum);

        sched_queue(thr);
    } else {
        kdebug("Signal will be handled later (not running)\n");
        process_signal_set(proc, signum);

        sched_queue(thr);
    }
}

int thread_check_signal(struct thread *thr, int ret) {
    struct process *proc = thr->proc;
    if (!proc) {
        return ret;
    }

    if (proc->sigq) {
        // Pick one signal to handle at a time
        int signum = 0;
        for (int i = 0; i < 64; ++i) {
            if (proc->sigq & (1ULL << i)) {
                proc->sigq &= ~(1ULL << i);
                signum = i + 1;
                break;
            }
        }
        _assert(signum);
        thread_sigenter(signum);

        // Theoretically, a rogue thread could steal all the CPU time by sending itself signals
        // in normal context, as after returning from thread_sigenter() this code will return
        // to a normal execution
        // XXX: Maybe makes sense to just yield() here
        return -EINTR;
    }

    return ret;
}

int sys_kill(pid_t pid, int signum) {
    struct process *proc;

    if (pid > 0) {
        proc = process_find(pid);
    } else if (pid == 0) {
        proc = thread_self->proc;
    } else {
        // Not implemented
        proc = NULL;
    }

    if (!proc || proc->proc_state == PROC_FINISHED) {
        return -ESRCH;
    }

    if (signum == 0) {
        return 0;
    }

    if (signum <= 0 || signum >= 64) {
        return -EINVAL;
    }

    if (!proc) {
        // No such process
        return -ESRCH;
    }

    process_signal(proc, signum);

    return 0;
}

void sys_sigentry(uintptr_t entry) {
    thread_self->proc->signal_entry = entry;
}

pid_t sys_getpid(void) {
    _assert(thread_self && thread_self->proc);
    return thread_self->proc->pid;
}

pid_t sys_getpgid(pid_t pid) {
    struct process *proc;

    if (pid == 0) {
        proc = thread_self->proc;
    } else {
        proc = process_find(pid);
    }

    if (!proc) {
        return -ESRCH;
    }

    return proc->pgid;
}

int sys_setpgid(pid_t pid, pid_t pgrp) {
    struct process *proc = thread_self->proc;

    if (pid == 0 && pgrp == 0) {
        proc->pgid = proc->pid;
        return 0;
    }

    if (pid == proc->pid) {
        proc->pgid = pgrp;
        return 0;
    }
    // Find child with pid pid (guess only children can be setpgid'd)
    struct process *chld = process_child(proc, pid);
    if (!chld) {
        return -ESRCH;
    }
    if (chld->pgid != proc->pgid) {
        return -EACCES;
    }
    chld->pgid = pgrp;

    return 0;
}

int sys_setuid(uid_t uid) {
    struct process *proc = thread_self->proc;
    _assert(proc);

    if (proc->ioctx.uid != 0) {
        return -EACCES;
    }

    proc->ioctx.uid = uid;
    return 0;
}

int sys_setgid(gid_t gid) {
    struct process *proc = thread_self->proc;
    _assert(proc);

    if (proc->ioctx.gid != 0 && proc->ioctx.uid != 0) {
        return -EACCES;
    }

    proc->ioctx.gid = gid;
    return 0;
}

uid_t sys_getuid(void) {
    return thread_self->proc->ioctx.uid;
}

gid_t sys_getgid(void) {
    return thread_self->proc->ioctx.gid;
}