7
\$\begingroup\$

I wrote a JSON parser for C++. It is not particularly fast, efficient or elegant. I'd like to change that and primarily I'd like the code to be more elegant. How do I improve it and get rid of all the code smell?

Aside: I have never taken a computer science course in my life and I'm not sure how far this deviates from the theoretical model I was trying to implement .i.e a LL1 parser. Perhaps that's why it feels so hacky.

The header file is as follows.

#pragma once

#include <string>
#include <vector>
#include <map>
#include <variant>
#include <algorithm>
#include <fstream>
#include <stack>

// Debugging
#include <iostream>

// Types to store JSON ouput

struct jlist;
struct jobject;

using json_value = std::variant<int, float, bool, std::string, jlist, jobject>;

struct jlist {
    std::vector<json_value> vector_value;

    json_value & operator [](int index) {
        return vector_value[index];
    }

    void push_back(json_value & value) {
        vector_value.push_back(value);
    }

};

struct jobject {
    std::map<std::string, json_value> map_value;

    json_value & operator [](std::string key) {
        return map_value[key];
    }

    void insert(json_value & key, json_value & value) {
        map_value.insert( { std::get<std::string>(key), value } );
    }

};

class JSONParser
{
public:
        JSONParser();

        ~JSONParser();

        void parseFile(std::string);

private:
        json_value root;
        std::stack<std::string> s;
        std::stack<json_value> s_value;

        // Lexer
        bool checkDeliminator(char);
        std::vector<std::string> lexer(std::ifstream &);

        // FSM varaibles
        enum state { int_value, float_value, bool_value, string_value, default_value, bad_state};
        state current;

        // FSM
        void fsm(std::string);

        // Parser variables
        enum stack_map { list_open, list_close, object_open, object_close, colon, comma, buffer, follow};
        std::map<std::string, stack_map> stack_conversion;

        // Parser helper functions
        template<typename T> void addElement();

        template<typename T> void insert(std::string &, T (*)(const std::string &));
        template<typename T> void insert();
        void insert(std::string &);
        void pushBuffer();

        template<typename ... T> bool multiComparision(const char scope, T ... args);
        bool isDigit(const char);
        static int st2i(const std::string & value);
        static float st2f(const std::string & value);
        static bool st2b(const std::string & value);

        // Parser
        void parser(const std::string & cursor);
};

The implementation is below

#include "JSONParser.h"

JSONParser::JSONParser() {
    state current = default_value;
    stack_conversion = { { "[", list_open }, { "]", list_close }, { "{", object_open }, { "}", object_close }, { ":", colon }, { ",", comma }, { "buffer", buffer } };
}

JSONParser::~JSONParser() = default;

void JSONParser::parseFile(std::string FILE) {
    std::ifstream configfile(FILE);
    std::vector<std::string> scan = lexer(configfile);

    scan.push_back("terminate");
    for (auto it = scan.begin(); it != scan.end(); ++it) {
            parser(*it);
    }
    root = s_value.top();
    s_value.pop();
}

// Lexer
bool JSONParser::checkDeliminator(char piece) {
    switch (piece) {
        case '[':
            return true;
        case ']':
            return true;
        case '{':
            return true;
        case '}':
            return true;
        case ':':
            return true;
        case ',':
            return true;
        default:
            return false;
    }
}

std::vector<std::string> JSONParser::lexer(std::ifstream & configfile) {
    char piece;
    std::string capture = "";
    std::string conversion;
    std::vector<std::string> capture_list;

    while(configfile >> piece) {
        if (checkDeliminator(piece)) {
            conversion = piece;
            if (capture != "") {
                capture_list.push_back(capture);
                capture_list.push_back(conversion);
                capture = "";
            } else {
                capture_list.push_back(conversion);
            }
        } else {
            capture += piece;
        }
    }

    return capture_list;
}

// FSM
void JSONParser::fsm(std::string value) {
    current = default_value;
    char point;
    auto it = value.begin();

    while (it != value.end()) {
        point = *it;
        if (point == '"' & current == default_value) {
            current = string_value;
            return;
        } else if (isdigit(point)) {
            if (current == default_value | current == int_value) {
                current = int_value;
                ++it;
            } else if (current == float_value) {
                ++it;
            } else {
                current = bad_state;
                return;
            }
        } else if (point == '.' & current == int_value) {
            current = float_value;
            ++it;
        } else if (point == 'f' & current == float_value) {
            ++it;
        } else if (current == default_value) {
            if (value == "true" | value == "false") {
                current = bool_value;
                return;
            } else {
                current = bad_state;
                return;
            }
        } else {
            current = bad_state;
            return;
        }
    }
}

// Parser Helper functions
template<>
void JSONParser::addElement<jobject>() {
    json_value value_read;
    json_value key_read;

    value_read = s_value.top();
    s_value.pop();
    key_read = s_value.top();
    s_value.pop();

    std::get<jobject>(s_value.top()).insert(key_read, value_read);
}

template<>
void JSONParser::addElement<jlist>() {
    json_value value_read;

    value_read = s_value.top();
    s_value.pop();

    std::get<jlist>(s_value.top()).push_back(value_read);
}

template<typename T>
void JSONParser::insert(std::string & value, T (*fptr)(const std::string &)) {
        T T_value(fptr(value));
        s_value.push(T_value);
}

template<typename T>
void JSONParser::insert() {
        T T_value;
        s_value.push(T_value);
}

void JSONParser::insert(std::string & value) {
    value.erase(std::remove(value.begin(), value.end(), '"'), value.end());
        s_value.push(value);
}

void JSONParser::pushBuffer() {
    s.pop();
    s.push("buffer");
}

template<typename ... T>
bool JSONParser::multiComparision(const char scope, T ... args) {
    return (scope == (args || ...));
}

bool JSONParser::isDigit(const char c) {
    return multiComparision<char>(c, '1', '2', '3', '4', '5', '6', '7', '8', '9', '0');
}

int JSONParser::st2i(const std::string & value) {
        return stoi(value);
}

float JSONParser::st2f(const std::string & value) {
        return stof(value);
}

bool JSONParser::st2b(const std::string & value) {
        if (value == "true") {
                return true;
        } else {
                return false;
        }
}

// Parser
void JSONParser::parser(const std::string & cursor) {
    if(s.empty()) {
        s.push(cursor); 
    } else {
        stack_map stack_value;
        std::string value = s.top();

        if (stack_conversion.find(value) != stack_conversion.end()) {
            stack_value = stack_conversion[s.top()];
        } else {
            stack_value = follow;
        }

        switch (stack_value) {
            case buffer:
                s.pop();
                break;
            case list_open:
                insert<jlist>();
                if (cursor == "]") {
                    pushBuffer();
                    return;
                }
                break;
            case list_close:
                addElement<jlist>();
                s.pop();
                s.pop();
                break;
            case object_open:
                insert<jobject>();
                if (cursor == "}") {
                    pushBuffer();
                    return;
                }
                break;
            case object_close:
                addElement<jobject>();
                s.pop();
                s.pop();
                break;
            case colon:
                s.pop();
                break;
            case comma:
                s.pop();
                if (s.top() == "{") {
                    addElement<jobject>();
                } else {
                    addElement<jlist>();
                }
                break;
            default:
                s.pop();
                fsm(value);
                switch (current) {
                    case string_value:
                        insert(value);
                        break;
                    case int_value:
                        insert<int>(value, st2i);
                        break;
                    case float_value:
                        insert<float>(value, st2f);
                        break;
                    case bool_value:
                        insert<bool>(value, st2b);
                        break;
                    default:
                        std::cout << "Bad state\n"; 
                }
        }
        s.push(cursor);
    }
}

The idea was to have the lexer break at each deliminator and place all the generated tokens into a vector. This vector called scan could then be looped through. At each iteration of this loop, parser would be run. In general this reads the top of the stack s and determines whether a bracket/brace is opening or closing or a terminal value has been reached. If a bracket/brace is opening, a new jobject or jlist is generated and placed onto a new stack s_value, if a terminal value is reached fsm (finite state machine) runs and determines the type of value and places it on top of s_value, should a comma or closing bracket be reached the appropriate values are moved off the stack and the elements from s_value are inserted into their appropriate containers.

The biggest meatball in this spaghetti is how elements in the JSON tree are called.

std::cout << std::get<bool>(std::get<jobject>(std::get<jobject>(std::get<jlist>(root)[6])["input"])["bool"]); 

The nested std::get calls seem just plain wrong and I'm not sure if they can be incorporated into the operator [].

For completeness this is the JSON file parsed, so the above call would output 1.

[
{
    "libraries":[
        "terminal",
        "binary"
    ]
,
    "functions":[
        "terminal-basic",
        "binary-basic"
    ]
}
,
{
    "name":"addition",
    "type":"binary-basic",
    "function":"add_float",
    "input":{
        "float" : 2.0f
        },
    "output":"float",
    "max-number":2
}
,
{
    "name":"exponent",
    "type":"binary-basic",
    "function":"exponent_float",
    "input":{
        "float":2.0f
        },
    "output":"float",
    "max-number":2
}
,
{
    "name":"exponent",
    "type":"binary-basic",
    "function":"exponent_float",
    "input":{
        "float":2.0f,
        "int":1
        },
    "output":"float",
    "max-number":1
}
,
{
    "name":"constant_1",
    "type":"terminal-basic",
    "function":"non_random_constant",
    "value":0.5f,
    "input":{ },
    "output":"float",
    "max-number":3
}
,
{
    "name":"constant_2",
    "type":"terminal-basic",
    "function":"non_random_constant",
    "value":2.0f,
    "input":{ },
    "output":"float",
    "max-number":3
}
,
{
    "name":"constant_3",
    "type":"terminal-basic",
    "function":"non_random_constant",
    "value":true,
    "input":{
        "bool":true
    },
    "output":"bool",
    "max-number":1
}
]

How can I improve on what I have?

\$\endgroup\$
6
  • \$\begingroup\$ You might want to look into a lexical analiser generated such as flex. en.wikipedia.org/wiki/Flex_(lexical_analyser_generator) \$\endgroup\$
    – pacmaninbw
    Commented Nov 24, 2019 at 23:04
  • \$\begingroup\$ @pacmaninbw I'll check it out but I'd really like to try do this by hand. \$\endgroup\$ Commented Nov 24, 2019 at 23:15
  • \$\begingroup\$ Just FYI, after an answer has been added, there really should be no edits to the question. Since the answer didn't cover that portion of the code I won't roll back the edit. \$\endgroup\$
    – pacmaninbw
    Commented Nov 25, 2019 at 16:18
  • \$\begingroup\$ OK it has been rolled back. \$\endgroup\$
    – pacmaninbw
    Commented Nov 25, 2019 at 16:33
  • \$\begingroup\$ Since no one mentioned it: Don't declare and define a destructor if it is empty. = default; is the right approach, but only if it appears in the class declaration. If it is in a separate file there is no difference to defining it with {}, which one shouldn't do. \$\endgroup\$
    – Rakete1111
    Commented Nov 26, 2019 at 11:43

2 Answers 2

4
\$\begingroup\$

Overview

My main issue is that you convert the whole stream into tokens first. Then you parse the tokens. This can get very expensive. Normally you would parse (and push) enough tokens to understand the next part to interpret. Then pop the bit you can to convert the next part.

I dislike the way you have two different types of token state:

    stack_conversion = { { "[", list_open }, { "]", list_close }, { "{", object_open }, { "}", object_close }, { ":", colon }, { ",", comma }, { "buffer", buffer } };

    ....
    enum state { int_value, float_value, bool_value, string_value, default_value, bad_state};

I would have a single list of all tokens (there is not that many in JSON).

    {, }, [, ], :, ',', null, true, false, number(int), number(float), string

Probably a better way to write this is with lex and yacc (or really there more modern equivalents) flex and bison. A lot of research has gone into these tools to achieve exactly this and you can specify a json parser in about 20 lines of code.


This does not do what you think.

template<typename ... T>
bool JSONParser::multiComparision(const char scope, T ... args) {
    return (scope == (args || ...));
}

This expands

JSONParser::multiComparision('a', '1', '2', '3');

=>
   return ('a' == ('1' || '2' || '3'));

I don't think that is what you want.

\$\endgroup\$
0
1
\$\begingroup\$

For testing purposes I created a main(). I used Visual Studio 2019 for the testing, and used C++17. Note that the posted code did not compile in C++11.

#include <iostream>
#include <string>
#include <cstdlib>
#include "JSONParser.h"

int main(int argc, char* argv[])
{
    std::string jsonFile;

    if (argc > 1)
    {
        jsonFile = argv[1];
    }
    else
    {
        std::cout << "Please enter a JSON file name." << std::endl;
        std::cin >> jsonFile;
    }

    JSONParser jsonParser;
    try
    {
        jsonParser.parseFile(jsonFile);
    }
    catch (std::runtime_error ex)
    {
        std::cerr << ex.what() << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}

I discovered several possible issues during testing, some are not listed here because I ran out of time to debug.

Error Checking

There is no test for if the input file containing the JSON was found or not, this can cause the program to terminate abnormally. Since the user has to indicate what file to open somewhere, there needs to be tests to see if the file can be opened and if the file can be read from. When there is no input file the function std::vector<std::string> JSONParser::lexer(std::string fileName) returns an empty list of strings that causes the program to terminate in the processing of the empty list rather than in the input itself.

One possible alternative implementation of lexor is :

std::vector<std::string> JSONParser::lexer(std::string fileName) {
    char piece;
    std::string capture = "";
    std::string conversion;
    std::vector<std::string> capture_list;

    std::ifstream configfile(fileName);

    piece = configfile.get();
    if (configfile.bad() || configfile.fail())
    {
        std::string emsg("Can't read json from file: ");
        emsg += fileName;
        throw std::runtime_error(emsg);
    }

    while (configfile.good()) {
        if (checkDeliminator(piece)) {
            conversion = piece;
            if (capture != "") {
                capture_list.push_back(capture);
                capture_list.push_back(conversion);
                capture = "";
            }
            else {
                capture_list.push_back(conversion);
            }
        }
        else {
            capture += piece;
        }
        piece = configfile.get();
    }

    configfile.close();

    return capture_list;
}

Note the change of the input for lexor: The ifstream configfile is only used within the function lexor so it is better to instantiate the configfile variable within the body of lexor. This also permits better error messages by passing the file name.

Possible Syntax Errors or Typos

There are several if statements in void JSONParser::fsm(std::string value) that may not return the proper results, instead of using the logical operators && and || the bit wise operators | and & were used. This may cause a program using the parser to fail when it should pass, or pass when it should fail. It should also be noted that the function name fsm is not clear to anyone that needs to maintain the code.

Algorithm

Fast lexical analyzers (such as those generated by lex or flex) are generally implemented as state machines using tables. Parsers generally accept a single token at a time from the lexical analyzer. Parser generators such as YACC or Bison generate push down automata which as state machines using tables coupled with a stack. To improve performance it might be better to implement the lexer and the parser this way.

If you want to experiment with these compiler development tools you can find flex here and bison here.

\$\endgroup\$

Not the answer you're looking for? Browse other questions tagged or ask your own question.