"""

Standalone function for reading avro files and
converting them to multiple (one per avro file)
newspaper files.

This is called by readavrofiles_batch.py and takes the path of the avro
file to read as its first command-line argument (sys.argv[1]).

"""
import pandas as pd
from datetime import datetime
import numpy as np
import sys

# import other modules specific to this project
try:
    import fastavro as avro
except ImportError:
    # fastavro is not installed: install it with pip and import it again.
    import subprocess

    # Run pip through sys.executable so the package is installed for the
    # interpreter that is running this script; a bare "python" on PATH may
    # be a different interpreter or environment (or may not exist at all).
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "fastavro"]
    )  # install fastavro

    import fastavro as avro


# Columns that are copied unchanged from the avro field of the same name,
# in the order they follow the text / ID / dateStr / date columns.
_PASSTHROUGH_FIELDS = (
    "title",
    "source_code",
    "section",
    "company_codes",
    "publisher_name",
    "word_count",
    "subject_codes",
    "person_codes",
    "industry_codes",
    "currency_codes",
    "market_index_codes",
    "company_codes_about",
    "company_codes_association",
    "company_codes_lineage",
    "company_codes_occur",
    "company_codes_relevance",
    "source_name",
    "modification_date",
    "snippet",
    "action",
)

# Earlier versions of this function matched the modification date under a
# key with a leading space.  That spelling is still honoured as a fallback
# so that any input that relied on it keeps working.
_LEGACY_MODIFICATION_DATE_KEY = " modification_date"


def _millis_to_datetime(value):
    """Convert an epoch timestamp in milliseconds to a datetime.

    The sub-second part is discarded (truncated toward zero).  The result is
    a naive datetime in the local timezone, as produced by
    ``datetime.fromtimestamp``.  ``None`` is passed through unchanged.
    """
    if value is None:
        return None
    millis = int(value)
    # Integer arithmetic instead of slicing the last three characters off the
    # string form, which breaks for values shorter than four digits.
    seconds = millis // 1000 if millis >= 0 else -(-millis // 1000)
    return datetime.fromtimestamp(seconds)


def _record_to_row(record):
    """Return the output row (a list of cell values) for one avro record.

    Fields that are absent from ``record`` become ``None`` so that every row
    has the same length and values stay aligned with their record.
    """
    published = record.get("publication_datetime")
    row = [
        record.get("body"),
        record.get("an"),
        None if published is None else str(published),
        _millis_to_datetime(published),
    ]
    for field in _PASSTHROUGH_FIELDS:
        if field == "modification_date" and field not in record:
            row.append(record.get(_LEGACY_MODIFICATION_DATE_KEY))
        else:
            row.append(record.get(field))
    return row


def _records_to_frame(records, textColName="text"):
    """Build the article DataFrame from an iterable of avro record dicts.

    Columns, in order: ``textColName``, "ID", "dateStr", "date", followed by
    the names in ``_PASSTHROUGH_FIELDS``.
    """
    columns = [textColName, "ID", "dateStr", "date", *_PASSTHROUGH_FIELDS]
    rows = [_record_to_row(record) for record in records]

    # Fill a 2-D object array cell by cell: this keeps every value as the
    # Python object that was read (including None and any list-valued
    # field), which stacking per-column arrays cannot guarantee.
    table = np.empty((len(rows), len(columns)), dtype=object)
    for i, row in enumerate(rows):
        for j, cell in enumerate(row):
            table[i, j] = cell

    return pd.DataFrame(table, columns=columns)


def readavro(inputFile=None):
    """Read one avro file of articles into a pandas DataFrame.

    Parameters
    ----------
    inputFile : str or path-like, optional
        Path of the avro file to read.  When omitted, the path is taken from
        the first command-line argument (``sys.argv[1]``), which is how the
        batch script invokes this function.

    Returns
    -------
    tuple
        ``(df, dateColName, idColName)`` where ``df`` has one row per avro
        record, ``dateColName`` is ``"date"`` and ``idColName`` is ``"ID"``.
        The "text" column holds the record's ``body`` field, "ID" its ``an``
        field, "dateStr" the string form of ``publication_datetime`` and
        "date" that value converted from epoch milliseconds to a datetime.
        Fields missing from a record are stored as ``None``.

    Raises
    ------
    IndexError
        If ``inputFile`` is omitted and no command-line argument was given.
    OSError
        If the file cannot be opened.
    """
    textColName = "text"
    dateColName = "date"
    idColName = "ID"

    if inputFile is None:
        inputFile = sys.argv[1]

    with open(inputFile, "rb") as fo:
        # Stream the records straight into rows; the reader must be consumed
        # while the file is still open.
        df = _records_to_frame(avro.reader(fo), textColName)

    return df, dateColName, idColName
