ComplianceAsCode/content

View on GitHub
utils/fix_rules.py

Summary

Maintainability
F
1 wk
Test Coverage
#!/usr/bin/python3

from __future__ import print_function

import sys
import os
import jinja2
import argparse
import json
import re

from ssg import yaml, cce, products
from ssg.shims import input_func
from ssg.utils import read_file_list
import ssg
import ssg.products
import ssg.rules
import ssg.rule_yaml


# Absolute path of the parent of the directory that contains this script;
# used as the default for the --root command-line option.
SSG_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# Top-level rule.yml sections whose sub-keys are checked for duplicates and
# for ordering.
TO_SORT = ['identifiers', 'references']


# Registry of sub-commands filled in by the @command decorator
# (command name -> function).
_COMMANDS = dict()


def command(name, description):
    # Decorator factory: registers the decorated function in _COMMANDS under
    # `name` and attaches `description` to it as an attribute. The function
    # itself is returned unchanged so it stays directly callable.
    def register(func):
        _COMMANDS[name] = func
        func.description = description
        return func

    return register


def has_empty_identifier(rule_path, rule, rule_lines):
    # True when the rule has an 'identifiers' section that is either empty
    # (None) or contains at least one value that is blank once stringified.
    # rule_path and rule_lines are unused; they are part of the common check
    # signature.
    if 'identifiers' not in rule:
        return False

    identifiers = rule['identifiers']
    if identifiers is None:
        return True

    return any(str(value).strip() == "" for value in identifiers.values())


def has_no_cce(yaml_file, product_yaml=None):
    # Loads the rule at yaml_file (macro-expanded with product_yaml) and
    # returns True unless its 'identifiers' section contains the key
    # "cce@<product>", where <product> is product_yaml["product"].
    expanded_rule = yaml.open_and_macro_expand(yaml_file, product_yaml)
    product = product_yaml["product"]

    if 'identifiers' not in expanded_rule or expanded_rule['identifiers'] is None:
        return True

    return all(ident != "cce@" + product for ident in expanded_rule['identifiers'])


def has_empty_references(rule_path, rule, rule_lines):
    # True when the rule has a 'references' section that is either empty
    # (None) or contains at least one value that is blank once stringified.
    # rule_path and rule_lines are unused; they are part of the common check
    # signature.
    if 'references' not in rule:
        return False

    references = rule['references']
    if references is None:
        return True

    return any(str(value).strip() == "" for value in references.values())


def has_prefix_cce(rule_path, rule, rule_lines):
    # True when any identifier whose key starts with 'cce' has a value that
    # starts with "CCE" (any case) and whose remainder -- after dropping the
    # first three or four characters -- is accepted by
    # cce.is_cce_format_valid once "CCE-" is put in front of it.
    #
    # Every cce identifier is inspected (matching what fix_prefix_cce
    # rewrites) rather than only the first one, and values are stringified
    # first so that integer values do not raise on slicing.
    if 'identifiers' not in rule or rule['identifiers'] is None:
        return False

    for i_type, i_value in rule['identifiers'].items():
        if i_type[0:3] != 'cce':
            continue

        value = str(i_value)
        if value[0:3].upper() != 'CCE':
            continue

        if (cce.is_cce_format_valid("CCE-" + value[3:])
                or cce.is_cce_format_valid("CCE-" + value[4:])):
            return True

    return False


def has_invalid_cce(rule_path, rule, rule_lines):
    # True when at least one identifier whose key starts with 'cce' has a
    # value that cce.is_cce_value_valid rejects once "CCE-" is put in front
    # of it.
    if 'identifiers' not in rule or rule['identifiers'] is None:
        return False

    return any(
        not cce.is_cce_value_valid("CCE-" + str(i_value))
        for i_type, i_value in rule['identifiers'].items()
        if i_type[0:3] == 'cce'
    )


def has_int_identifier(rule_path, rule, rule_lines):
    # True when the 'identifiers' section holds at least one value whose
    # type is not exactly str (e.g. a number parsed from unquoted YAML).
    if 'identifiers' not in rule or rule['identifiers'] is None:
        return False

    return any(type(value) != str for value in rule['identifiers'].values())


def has_int_reference(rule_path, rule, rule_lines):
    # True when the 'references' section holds at least one value whose
    # type is not exactly str (e.g. a number parsed from unquoted YAML).
    if 'references' not in rule or rule['references'] is None:
        return False

    return any(type(value) != str for value in rule['references'].values())


def has_duplicated_subkeys(rule_path, rule, rule_lines):
    # Check adapter with the common (rule_path, rule, rule_lines) signature:
    # delegates to ssg.rule_yaml.has_duplicated_subkeys for the sections
    # listed in TO_SORT. The parsed rule itself is not used.
    duplicated = ssg.rule_yaml.has_duplicated_subkeys(rule_path, rule_lines, TO_SORT)
    return duplicated


def _human_sort(line):
    # Based on: https://blog.codinghorror.com/sorting-for-humans-natural-sort-order/
    def convert(text): return int(text) if text.isdigit() else text
    return [convert(text) for text in re.split(r'(\d+)', line)]


def has_unordered_sections(rule_path, rule, rule_lines):
    # True when sorting the sub-keys of the TO_SORT sections (natural order,
    # see _human_sort) would change the text of the rule file. Rules with
    # neither 'references' nor 'identifiers' are never reported.
    if 'references' not in rule and 'identifiers' not in rule:
        return False

    sorted_lines = ssg.rule_yaml.sort_section_keys(rule_path, rule_lines, TO_SORT,
                                                   sort_func=_human_sort)

    # Compare the joined text rather than the line lists to avoid issues
    # with references being different.
    original_text = "\n".join(rule_lines)
    sorted_text = "\n".join(sorted_lines)
    return original_text != sorted_text


def rule_data_generator(args):
    # Iterates over all known rules in the build system (according to the
    # JSON file at args.json, produced by rule_dir_json.py) and attempts to
    # load the resulting YAML files. Each rule is expanded against the first
    # product listed for it.
    #
    # Yields, for every rule that parses correctly, the tuple
    #   (rule_path, rule, rule_lines, product_path, local_env_yaml)
    # Rules that fail with a jinja2 UndefinedError are reported on stderr
    # and skipped.
    #
    # Note: this is a generator rather than a function returning a list of
    # results.

    # Per-product caches so that each product.yml is loaded only once, and
    # so that every yielded rule is paired with the path of *its own*
    # product even when the product was loaded in an earlier iteration.
    product_yamls = dict()
    product_paths = dict()

    with open(args.json) as rule_dirs_file:
        rule_dirs = json.load(rule_dirs_file)

    for rule_id, rule_obj in rule_dirs.items():
        if 'products' not in rule_obj or not rule_obj['products']:
            # Show the offending entry before the assertion below fails.
            print(rule_id, rule_obj)
        assert rule_obj['products']
        product = rule_obj['products'][0]

        if product not in product_yamls:
            product_path = ssg.products.product_yaml_path(args.root, product)
            product_yaml = ssg.products.load_product_yaml(product_path)
            properties_directory = os.path.join(args.root, "product_properties")
            product_yaml.read_properties_from_directory(properties_directory)
            product_yamls[product] = product_yaml
            product_paths[product] = product_path
        product_path = product_paths[product]

        local_env_yaml = dict(cmake_build_type='Debug')
        local_env_yaml.update(product_yamls[product])
        local_env_yaml['rule_id'] = rule_id

        rule_path = ssg.rules.get_rule_dir_yaml(rule_obj['dir'])
        try:
            rule = yaml.open_and_macro_expand(rule_path, local_env_yaml)
            rule_lines = read_file_list(rule_path)
            yield rule_path, rule, rule_lines, product_path, local_env_yaml
        except jinja2.exceptions.UndefinedError as ue:
            msg = "Failed to parse file {0} (with product.yml: {1}). Skipping. {2}"
            msg = msg.format(rule_path, product_path, ue)
            print(msg, file=sys.stderr)


def find_rules_generator(args, func):
    # Yields (rule_path, product_path, local_env_yaml) for every rule from
    # rule_data_generator for which func(rule_path, rule, rule_lines) is
    # truthy.
    for rule_path, rule, rule_lines, product_path, local_env_yaml in rule_data_generator(args):
        if not func(rule_path, rule, rule_lines):
            continue
        yield (rule_path, product_path, local_env_yaml)


def find_rules(args, func):
    # Eager variant of find_rules_generator: collects all matches in a list.
    matches = find_rules_generator(args, func)
    return list(matches)


def print_file(file_contents):
    # Prints every line prefixed with its zero-based index ("<index>: <line>").
    for line_num, line in enumerate(file_contents):
        print("%d: %s" % (line_num, line))


def find_section_lines(file_contents, sec):
    # Hack to locate a global (column-0) key `sec` in a YAML-like list of
    # lines. A section spans from the line starting with "<sec>:" through
    # every following line that is empty or starts with a space, i.e. up to
    # (but not including) the next global key. For example:
    #
    # 0: not_it:
    # 1:     - value
    # 2: this_one:
    # 3:      - 2
    # 4:      - 5
    # 5:
    # 6: nor_this:
    #
    # for the section "this_one", the result [(2, 5)] will be returned.
    # Both bounds are inclusive. A file may hold several such sections; each
    # is identified and returned. The line that terminates a section is not
    # itself examined as a possible section start.
    header = sec + ":"
    total = len(file_contents)
    found = []

    idx = 0
    while idx < total:
        if file_contents[idx].startswith(header):
            first = idx
            idx += 1
            # Skip over the section body: blank lines and space-indented lines.
            while idx < total and file_contents[idx][:1] in ('', ' '):
                idx += 1
            found.append((first, idx - 1))
        idx += 1

    return found


def remove_lines(file_contents, lines):
    # Returns a new list holding every line of file_contents whose index is
    # not in `lines`; the input list is left untouched.
    return [line for line_num, line in enumerate(file_contents) if line_num not in lines]


def remove_section_keys(file_contents, yaml_contents, section, removed_keys):
    # Removes a series of keys from a section and returns the resulting
    # lines as a new list.
    #
    # If the section is empty in the parsed YAML, or the number of keys to
    # remove equals the number of keys in the section, the whole section
    # (header included) is removed. Otherwise only the lines whose stripped
    # text starts with "<key>:" for one of removed_keys are dropped; every
    # occurrence of such a key inside the section goes.
    #
    # Raises RuntimeError unless the section occurs exactly once in
    # file_contents.
    sec_ranges = find_section_lines(file_contents, section)
    if len(sec_ranges) != 1:
        raise RuntimeError("could not find exactly one '%s' section (found %d)"
                           % (section, len(sec_ranges)))

    begin, end = sec_ranges[0]
    section_yaml = yaml_contents[section]

    if section_yaml is None or len(section_yaml.keys()) == len(removed_keys):
        r_lines = set(range(begin, end+1))
        print("Removing entire section since all keys are empty")
    else:
        key_prefixes = tuple(key + ":" for key in removed_keys)
        # Don't include section header
        r_lines = set(
            line_num for line_num in range(begin+1, end+1)
            if file_contents[line_num].strip().startswith(key_prefixes)
        )

    return remove_lines(file_contents, r_lines)


def rewrite_value_int_str(line):
    # Takes a "key: value" line and returns it with the value wrapped in
    # double quotes so that YAML reads it as a string. Everything up to the
    # first colon (indentation included) is kept as the key; the value is
    # stripped of surrounding whitespace. Raises ValueError if the line has
    # no colon.
    colon = line.index(':')
    prefix = line[:colon]
    quoted = '"' + line[colon + 1:].strip() + '"'
    return prefix + ": " + quoted


def rewrite_keyless_section(file_contents, yaml_contents, section, content):
    # Replaces the header line of `section` with "<section>: <content>" and
    # returns the result as a new list; file_contents is not modified.
    # yaml_contents is unused.
    #
    # Raises RuntimeError unless the section occurs exactly once in
    # file_contents.
    sec_ranges = find_section_lines(file_contents, section)
    if len(sec_ranges) != 1:
        raise RuntimeError("could not find exactly one '%s' section (found %d)"
                           % (section, len(sec_ranges)))

    # NOTE(review): find_section_lines returns (begin, end) pairs, so this
    # length check never fires; it does not actually detect a section that
    # spans several lines. Kept as-is to preserve behavior -- confirm the
    # intended check before tightening it.
    if len(sec_ranges[0]) != 2:
        raise RuntimeError("Section has more than one line")

    new_contents = file_contents[:]
    new_contents[sec_ranges[0][0]] = "{section}: {content}".format(section=section, content=content)

    return new_contents


def rewrite_value_remove_prefix(line):
    # Takes a "key: value" line and returns it with a leading "CCE" prefix
    # removed from the value. Dropping three characters is tried first, then
    # four; the first remainder that cce.is_cce_format_valid accepts (with
    # "CCE-" put in front) wins. If neither is accepted the value is kept.
    colon = line.index(':')
    key = line[:colon]
    value = line[colon + 1:].strip()

    for prefix_len in (3, 4):
        remainder = value[prefix_len:]
        if cce.is_cce_format_valid("CCE-" + remainder):
            return key + ": " + remainder

    return key + ": " + value


def add_to_the_section(file_contents, yaml_contents, section, new_keys):
    # Returns a copy of file_contents with "key: value" lines for new_keys
    # appended to `section`. The new lines are placed right after the last
    # non-blank line of the section and reuse that line's indentation, so
    # trailing blank lines (or the lack of them) do not affect the result.
    # yaml_contents is unused.
    #
    # Raises RuntimeError unless the section occurs exactly once and already
    # holds at least one entry to take the indentation from.
    sec_ranges = find_section_lines(file_contents, section)
    if len(sec_ranges) != 1:
        raise RuntimeError("could not find one section: %s"
                           % section)

    begin, end = sec_ranges[0]

    # The section range may end with blank lines; walk back to the last
    # line that actually holds content.
    last = end
    while last > begin and not file_contents[last].strip():
        last -= 1

    if last <= begin:
        raise RuntimeError("We need at least one identifier there already")

    leading_whitespace = re.match(r"^\s*", file_contents[last]).group()
    to_insert = [leading_whitespace + key + ": " + value for key, value in new_keys.items()]

    insert_at = last + 1
    return file_contents[:insert_at] + to_insert + file_contents[insert_at:]


def sort_section(file_contents, yaml_contents, section):
    # Returns file_contents with the keys of `section` sorted by
    # ssg.rule_yaml.sort_section_keys.
    # NOTE(review): yaml_contents is passed as the first argument, where the
    # other callers in this file pass the rule path or None -- confirm.
    return ssg.rule_yaml.sort_section_keys(yaml_contents, file_contents, section)


def rewrite_section_value(file_contents, yaml_contents, section, keys, transform):
    # For a given section, replaces every line whose stripped text starts
    # with "<key>:" (for any key in `keys`) by transform(line), where the
    # line is passed with its original indentation. All occurrences of a key
    # inside the section are rewritten. Returns a new list; file_contents is
    # not modified. yaml_contents is unused.
    #
    # Raises RuntimeError unless the section occurs exactly once in
    # file_contents.
    sec_ranges = find_section_lines(file_contents, section)
    if len(sec_ranges) != 1:
        raise RuntimeError("could not find exactly one '%s' section (found %d)"
                           % (section, len(sec_ranges)))

    begin, end = sec_ranges[0]
    key_prefixes = tuple(key + ":" for key in keys)
    new_contents = file_contents[:]

    # Don't include section header
    for line_num in range(begin+1, end+1):
        if file_contents[line_num].strip().startswith(key_prefixes):
            new_contents[line_num] = transform(file_contents[line_num])

    return new_contents


def rewrite_section_value_int_str(file_contents, yaml_contents, section, int_keys):
    # Quotes the values of int_keys inside `section` so they become strings
    # (rewrite_section_value with rewrite_value_int_str as the transform).
    transform = rewrite_value_int_str
    return rewrite_section_value(file_contents, yaml_contents, section, int_keys, transform)


def fix_empty_identifier(file_contents, yaml_contents):
    # Removes from the 'identifiers' section every key whose value is blank
    # once stringified; returns the new list of lines.
    section = 'identifiers'
    entries = yaml_contents[section]

    empty_identifiers = []
    if entries is not None:
        empty_identifiers = [
            i_type for i_type, i_value in entries.items()
            if str(i_value).strip() == ""
        ]

    return remove_section_keys(file_contents, yaml_contents, section, empty_identifiers)


def fix_empty_reference(file_contents, yaml_contents):
    # Removes from the 'references' section every key whose value is blank
    # once stringified; returns the new list of lines.
    section = 'references'
    entries = yaml_contents[section]

    empty_references = []
    if entries is not None:
        empty_references = [
            r_type for r_type, r_value in entries.items()
            if str(r_value).strip() == ""
        ]

    return remove_section_keys(file_contents, yaml_contents, section, empty_references)


def fix_prefix_cce(file_contents, yaml_contents):
    # Strips the "CCE" prefix from the value of every 'cce*' identifier that
    # carries one and whose remainder is a well-formed CCE; returns the new
    # list of lines.
    section = 'identifiers'

    def is_prefixed(i_value):
        # Prefix present, and dropping 3 or 4 characters leaves a valid CCE.
        has_prefix = i_value[0:3].upper() == 'CCE'
        remainder_valid = cce.is_cce_format_valid("CCE-" + str(i_value[3:]))
        remainder_valid |= cce.is_cce_format_valid("CCE-" + str(i_value[4:]))
        return has_prefix and remainder_valid

    prefixed_identifiers = []
    entries = yaml_contents[section]
    if entries is not None:
        prefixed_identifiers = [
            i_type for i_type, i_value in entries.items()
            if i_type[0:3] == 'cce' and is_prefixed(i_value)
        ]

    return rewrite_section_value(file_contents, yaml_contents, section, prefixed_identifiers,
                                 rewrite_value_remove_prefix)


def fix_invalid_cce(file_contents, yaml_contents):
    # Removes from the 'identifiers' section every 'cce*' key whose value is
    # rejected by cce.is_cce_value_valid (with "CCE-" put in front); returns
    # the new list of lines.
    section = 'identifiers'
    entries = yaml_contents[section]

    invalid_identifiers = []
    if entries is not None:
        invalid_identifiers = [
            i_type for i_type, i_value in entries.items()
            if i_type[0:3] == 'cce' and not cce.is_cce_value_valid("CCE-" + str(i_value))
        ]

    return remove_section_keys(file_contents, yaml_contents, section, invalid_identifiers)


def has_product_cce(yaml_contents, product):
    # True when the 'identifiers' section has a key of the form
    # "cce...@<product>" for the given product. An empty or missing-value
    # section yields False.
    identifiers = yaml_contents['identifiers']
    if not identifiers:
        return False

    return any(
        i_type[0:3] == 'cce' and "@" in i_type and product == i_type.split("@", 1)[1]
        for i_type in identifiers.keys()
    )


def add_product_cce(file_contents, yaml_contents, product, cce):
    # Adds the identifier "cce@<product>: <cce>" to the 'identifiers' section
    # and re-sorts that section; returns the new list of lines. When the rule
    # has no 'identifiers' section the input is returned untouched.
    section = 'identifiers'

    if section in yaml_contents:
        new_key = "cce@{product}".format(product=product)
        with_cce = add_to_the_section(file_contents, yaml_contents, section, {new_key: cce})
        return sort_section(with_cce, yaml_contents, section)

    return file_contents


def fix_int_identifier(file_contents, yaml_contents):
    # Quotes the value of every identifier whose parsed type is not exactly
    # str; returns the new list of lines.
    section = 'identifiers'

    int_identifiers = [
        i_type for i_type, i_value in yaml_contents[section].items()
        if type(i_value) != str
    ]

    return rewrite_section_value_int_str(file_contents, yaml_contents, section, int_identifiers)


def fix_int_reference(file_contents, yaml_contents):
    # Quotes the value of every reference whose parsed type is not exactly
    # str; returns the new list of lines.
    section = 'references'

    int_references = [
        r_type for r_type, r_value in yaml_contents[section].items()
        if type(r_value) != str
    ]

    return rewrite_section_value_int_str(file_contents, yaml_contents, section, int_references)


def sort_rule_subkeys(file_contents, yaml_contents):
    # Fix callback: returns file_contents with the sub-keys of the TO_SORT
    # sections in natural order (see _human_sort). yaml_contents is unused
    # and no rule path is passed on.
    sorted_contents = ssg.rule_yaml.sort_section_keys(
        None, file_contents, TO_SORT, sort_func=_human_sort)
    return sorted_contents


def _fixed_file_contents(path, file_contents, product_yaml, func):
    # Applies the fix callback `func(file_contents, yaml_contents)` to the
    # rule at `path` and returns the lines it produces. A trailing empty
    # string (left by splitting a newline-terminated file) is dropped from
    # file_contents before the callback sees it. The rule is parsed with
    # product_yaml as the macro substitution dictionary.
    #
    # Any exception from the callback is re-raised as RuntimeError with the
    # file path in the message.
    if file_contents[-1] == '':
        file_contents = file_contents[:-1]

    yaml_contents = yaml.open_and_macro_expand(path, product_yaml)

    try:
        return func(file_contents, yaml_contents)
    except Exception as exc:
        raise RuntimeError(
            "Refusing to fix file: {path}: {error}".format(path=path, error=str(exc)))


def fix_file(path, product_yaml, func):
    # Applies the fix callback `func` to the rule file at `path` and writes
    # the result back. Returns True when the file was rewritten, False when
    # the callback left the contents unchanged (nothing is written then).
    with open(path, 'r') as rule_file:
        file_contents = rule_file.read().split("\n")

    new_file_contents = _fixed_file_contents(path, file_contents, product_yaml, func)

    # _fixed_file_contents works on the lines without the trailing empty
    # string that split() leaves for a newline-terminated file; compare
    # like with like, otherwise every such file would look changed.
    original_contents = file_contents
    if original_contents[-1] == '':
        original_contents = original_contents[:-1]

    if original_contents == new_file_contents:
        return False

    with open(path, 'w') as f:
        for line in new_file_contents:
            print(line, file=f)
    return True


def fix_file_prompt(path, product_yaml, func, args):
    # Applies the fix callback `func` to the rule file at `path` and, after
    # confirmation, writes the result back. Returns True when the file was
    # rewritten and False otherwise.
    #
    # Nothing is asked or written when the callback changes nothing. With
    # args.assume_yes the file is written without asking; with args.dry_run
    # (and no assume_yes) nothing is asked and nothing is written; otherwise
    # the before/after contents are shown and the user is prompted.
    with open(path, 'r') as rule_file:
        file_contents = rule_file.read().split("\n")

    new_file_contents = _fixed_file_contents(path, file_contents, product_yaml, func)

    # _fixed_file_contents works on the lines without the trailing empty
    # string that split() leaves for a newline-terminated file; compare
    # like with like, otherwise every such file would look changed.
    original_contents = file_contents
    if original_contents[-1] == '':
        original_contents = original_contents[:-1]

    if original_contents == new_file_contents:
        return False

    need_input = not args.assume_yes and not args.dry_run

    response = 'n'
    if need_input:
        print("====BEGIN BEFORE====")
        print_file(file_contents)
        print("====END BEFORE====")

        print("====BEGIN AFTER====")
        print_file(new_file_contents)
        print("====END AFTER====")

        response = input_func("Confirm writing output to %s: (y/n): " % path)

    if not (args.assume_yes or response.strip().lower() == 'y'):
        return False

    with open(path, 'w') as f:
        for line in new_file_contents:
            print(line, file=f)
    return True


def add_cce(args, product_yaml):
    # Entry point of the "add-cce" sub-command: instantiates the CCE pool
    # selected by args.cce_pool and hands over to _add_cce for the rules
    # named in args.rule.
    pool_factory = cce.CCE_POOLS[args.cce_pool]
    rules_directory = os.path.join(args.root, args.subdirectory)
    return _add_cce(rules_directory, pool_factory(), args.rule, product_yaml, args)


def _add_cce(directory, cce_pool, rules, product_yaml, args):
    # For every rule named in `rules` that has no CCE for the product in
    # product_yaml, takes a random CCE from cce_pool, adds it to the rule
    # file as "cce@<product>" and, if the file was changed, removes the CCE
    # from the pool. `directory` is unused.
    #
    # Raises RuntimeError naming the rule file when a file cannot be fixed.
    product = product_yaml["product"]
    rule_suffixes = tuple("/{r}/rule.yml".format(r=r) for r in rules)

    def is_relevant_rule(rule_path, rule, rule_lines):
        # One of the requested rules, still lacking a CCE for this product.
        return rule_path.endswith(rule_suffixes) and has_no_cce(rule_path, product_yaml)

    results = find_rules(args, is_relevant_rule)

    for result in results:
        rule_path = result[0]

        new_cce = cce_pool.random_cce()

        # Bind the CCE as a default so the callback keeps this iteration's value.
        def fix_callback(file_contents, yaml_contents, new_cce=new_cce):
            return add_product_cce(file_contents, yaml_contents, product, new_cce)

        try:
            changes = fix_file(rule_path, product_yaml, fix_callback)
        except RuntimeError as exc:
            msg = (
                "Error adding CCE into {rule_path}: {exc}"
                .format(rule_path=rule_path, exc=str(exc)))
            raise RuntimeError(msg)

        if changes:
            cce_pool.remove_cce_from_file(new_cce)


@command("empty_identifiers", "check and fix rules with empty identifiers")
def fix_empty_identifiers(args, product_yaml):
    # Reports rules with empty identifiers and, unless args.dry_run is set,
    # offers to fix each of them. Each rule is fixed with the substitution
    # dictionary it was found with (third element of the find_rules result),
    # as in the other commands; the product_yaml argument is not used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_empty_identifier)
    print("Number of rules with empty identifiers: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more empty identifiers")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_empty_identifier, args)

    sys.exit(int(len(results) > 0))


@command("empty_references", "check and fix rules with empty references")
def fix_empty_references(args, product_yaml):
    # Reports rules with empty references and, unless args.dry_run is set,
    # offers to fix each of them with the substitution dictionary the rule
    # was found with; the product_yaml argument is not used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_empty_references)
    print("Number of rules with empty references: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more empty references")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_empty_reference, args)

    sys.exit(int(len(results) > 0))


@command("prefixed_identifiers", "check and fix rules with prefixed (CCE-) identifiers")
def find_prefix_cce(args, product_yaml=None):
    # Reports rules whose CCE identifiers carry a "CCE" prefix and, unless
    # args.dry_run is set, offers to fix each of them with the substitution
    # dictionary the rule was found with.
    #
    # product_yaml is accepted (and ignored) because the dispatcher calls
    # every command as func(args, subst_dict).
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_prefix_cce)
    print("Number of rules with prefixed CCEs: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more CCE with CCE- prefix")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_prefix_cce, args)

    sys.exit(int(len(results) > 0))


@command("invalid_identifiers", "check and fix rules with invalid identifiers")
def find_invalid_cce(args, product_yamls):
    # Reports rules with invalid CCE identifiers and, unless args.dry_run is
    # set, offers to fix each of them with the substitution dictionary the
    # rule was found with; the product_yamls argument is not used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_invalid_cce)
    print("Number of rules with invalid CCEs: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more invalid CCEs")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_invalid_cce, args)

    sys.exit(int(len(results) > 0))


@command("int_identifiers", "check and fix rules with pseudo-integer identifiers")
def find_int_identifiers(args, product_yaml):
    # Reports rules whose identifiers hold non-string values and, unless
    # args.dry_run is set, offers to fix each of them with the substitution
    # dictionary the rule was found with; the product_yaml argument is not
    # used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_int_identifier)
    print("Number of rules with integer identifiers: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more integer identifiers")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_int_identifier, args)

    sys.exit(int(len(results) > 0))


@command("int_references", "check and fix rules with pseudo-integer references")
def find_int_references(args, product_yaml):
    # Reports rules whose references hold non-string values and, unless
    # args.dry_run is set, offers to fix each of them with the substitution
    # dictionary the rule was found with; the product_yaml argument is not
    # used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_int_reference)
    print("Number of rules with integer references: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more integer references")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, fix_int_reference, args)

    sys.exit(int(len(results) > 0))


@command("duplicate_subkeys", "check for duplicated references and identifiers")
def duplicate_subkeys(args, product_yaml):
    # Reports rules with duplicated sub-keys in the TO_SORT sections. This
    # command only checks; it never modifies files. The product_yaml
    # argument is not used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_duplicated_subkeys)
    print("Number of rules with duplicated subkeys: %d" % len(results))

    for rule_path, _, _ in results:
        print(rule_path + " has one or more duplicated subkeys")

    sys.exit(int(len(results) > 0))


@command("sort_subkeys", "sort references and identifiers")
def sort_subkeys(args, product_yaml):
    # Reports rules whose references/identifiers are not in natural order
    # and, unless args.dry_run is set, offers to sort each of them using the
    # substitution dictionary the rule was found with; the product_yaml
    # argument is not used.
    # Exits with status 1 when any such rule was found, 0 otherwise.
    results = find_rules(args, has_unordered_sections)
    print("Number of modified rules: %d" % len(results))

    for rule_path, _, rule_env_yaml in results:
        if args.dry_run:
            print(rule_path + " has one or more unsorted references")
            continue

        fix_file_prompt(rule_path, rule_env_yaml, sort_rule_subkeys, args)

    sys.exit(int(len(results) > 0))


@command("test_all", "Perform all checks on all rules")
def test_all(args, product_yaml):
    # Runs every check on every rule in a single pass and prints one line
    # per (rule, failed check). Nothing is modified; the product_yaml
    # argument is not used.
    # Exits with status 1 when any check failed, 0 otherwise.
    result = 0
    # (check function, description used in the report)
    checks = [
        (has_empty_identifier, "empty identifiers"),
        (has_invalid_cce, "invalid CCEs"),
        (has_int_identifier, "integer identifiers"),
        (has_empty_references, "empty references"),
        (has_int_reference, "integer references"),
        (has_duplicated_subkeys, "duplicated subkeys"),
        (has_unordered_sections, "unsorted references")
    ]
    for rule_path, rule, rule_lines, _, _ in rule_data_generator(args):
        for func, msg in checks:
            if func(rule_path, rule, rule_lines):
                print("Rule '%s' has %s" % (rule_path, msg))
                result = 1
    sys.exit(result)


def create_parser_from_functions(subparsers):
    # Adds one sub-parser per function registered with @command, using the
    # registered name and description, and makes the function the parser's
    # `func` default.
    for name in _COMMANDS:
        handler = _COMMANDS[name]
        command_parser = subparsers.add_parser(name, description=handler.description)
        command_parser.set_defaults(func=handler)


def create_other_parsers(subparsers):
    # Adds the "add-cce" sub-command, which takes extra arguments and is
    # therefore not registered through @command.
    pool_names = list(cce.CCE_POOLS.keys())

    add_cce_parser = subparsers.add_parser("add-cce", description="Add CCE to rule files")
    add_cce_parser.add_argument("rule", nargs="+")
    add_cce_parser.add_argument("--subdirectory", default="linux_os")
    add_cce_parser.add_argument("--cce-pool", "-p", default="redhat", choices=pool_names)
    add_cce_parser.set_defaults(func=add_cce)


def parse_args():
    # Builds the command-line parser (global options plus one sub-command
    # per registered command and "add-cce") and returns the parsed
    # arguments. The selected command's function is available as args.func.
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     description="Utility for fixing mistakes in rule files")
    parser.add_argument(
        "-y", "--assume-yes", default=False, action="store_true",
        help="Assume yes and overwrite all files (no prompt)")
    parser.add_argument(
        "-d", "--dry-run", default=False, action="store_true",
        help="Assume no and don't overwrite any files")
    parser.add_argument(
        "-j", "--json", type=str, action="store",
        default="build/rule_dirs.json", help="File to read json "
        "output of rule_dir_json.py from (defaults to "
        "build/rule_dirs.json)")
    parser.add_argument(
        "-r", "--root", default=SSG_ROOT,
        help="Path to root of the project directory")
    parser.add_argument("--product", "-p", help="Path to the main product.yml")
    # A dest is given so that a missing command produces a proper usage
    # error; required subparsers without a dest make some Python 3 versions
    # fail with a TypeError while building the error message.
    subparsers = parser.add_subparsers(title="command", dest="command",
                                       help="What to perform.")
    subparsers.required = True
    create_parser_from_functions(subparsers)
    create_other_parsers(subparsers)
    return parser.parse_args()


def __main__():
    # Script entry point: parses the command line, loads the product given
    # with --product (if any) together with its product_properties, and
    # runs the selected command with the resulting substitution dictionary.
    args = parse_args()

    # Fall back to the parent of this script's directory when --root is empty.
    project_root = args.root or os.path.join(
        os.path.dirname(os.path.abspath(__file__)), os.path.pardir)

    subst_dict = dict()
    if args.product:
        product = products.load_product_yaml(args.product)
        properties_directory = os.path.join(project_root, "product_properties")
        product.read_properties_from_directory(properties_directory)
        subst_dict.update(product)

    args.func(args, subst_dict)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    __main__()