"""

This finds the .avro files under data/raw (copied down from blob storage),
pulls them all together, deduplicates them, and saves them as one csv per
newspaper (source code) in tidy format following this structure:

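text,ID,dateStr,date,title,source_code,section,company_codes,publisher_name,
word_count,subject_codes,person_codes,industry_codes,currency_codes,
market_index_codes,company_codes_about,company_codes_association,
company_codes_lineage,company_codes_occur,company_codes_relevance,
source_name,modification_date,snippet,action,Newspaper

NOTE(review): modification_date, action and Newspaper are listed above but
are not in list_of_cols_to_keep, so process_avro_file drops them (and
Newspaper is never created) - confirm which of the two is intended.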

"""
import os
import fastavro as avro
import glob
import pandas as pd
import configparser
import csv

# ---------------------------------------------------------------------------
# Settings & file paths
# ---------------------------------------------------------------------------
# Parser for the project's config.ini. Assigning ``str`` to optionxform keeps
# option names exactly as written (ConfigParser lower-cases them by default).
# NOTE(review): ``config`` is not referenced again in the visible part of
# this file - confirm whether it is still needed.
config = configparser.ConfigParser()
config.optionxform = str
config.read("config.ini")

# Raw avro field name -> column name used in the tidy output
new_col_names_dict = {
    "body": "text",
    "an": "ID",
    "publication_datetime": "dateStr",
}

# Columns (named *after* the renaming above) that survive processing; any
# other column is dropped. The grouping below is purely a visual aid - the
# resulting list has the same members in the same order as a flat literal.
list_of_cols_to_keep = (
    ["text", "ID", "dateStr", "date", "title"]
    + ["source_code", "section", "company_codes", "publisher_name", "word_count"]
    + [
        "subject_codes",
        "person_codes",
        "industry_codes",
        "currency_codes",
        "market_index_codes",
    ]
    + [
        "company_codes_about",
        "company_codes_association",
        "company_codes_lineage",
        "company_codes_occur",
        "company_codes_relevance",
    ]
    + ["snippet", "source_name"]
)


# ---------------------------------------------------------------------------
# Functions
# ---------------------------------------------------------------------------
def process_avro_file(input_file):
    """
    Cleans a single avro file given its path and returns it as a tidy dataframe

    Parameters
    ----------
    input_file : str or path-like
        Path of the .avro file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record whose ``action`` does not contain "rep", with a
        ``date`` column of datetimes added, columns renamed according to
        ``new_col_names_dict`` and every column not named in
        ``list_of_cols_to_keep`` removed. A file that holds no records
        yields an empty DataFrame.
    """
    # The file handle is only needed while the records are being read
    with open(input_file, "rb") as fp:
        records = list(avro.reader(fp))
    if not records:
        # Without records there is no "action" column to filter on, so hand
        # back an empty frame that pd.concat can still combine with others
        return pd.DataFrame()
    # Populate pandas.DataFrame with records
    df = pd.DataFrame.from_records(records)
    # Drop any article that is a rep rather than an add. na=False treats a
    # missing action as "not a rep"; otherwise NaN ends up in the mask and
    # the ~ inversion raises a TypeError.
    is_rep = df["action"].str.contains("rep", na=False)
    # .copy() so the column assignment below acts on an independent frame
    # rather than on a filtered view of the original
    df = df[~is_rep].copy()
    # publication_datetime holds milliseconds (the original code divided by
    # 1000 to get seconds); convert the whole column in one call with
    # unit="ms" so hour/second information is kept without float division
    df["date"] = pd.to_datetime(df["publication_datetime"], unit="ms")
    # Apply new column names. This must come after the lines above, which
    # still refer to the raw name "publication_datetime".
    df = df.rename(columns=new_col_names_dict)
    # Drop any columns not defined in list_of_cols_to_keep
    cols_to_drop = [x for x in df.columns if x not in list_of_cols_to_keep]
    return df.drop(columns=cols_to_drop)


# ---------------------------------------------------------------------------
# Use Azcopy to grab raw data files from blob storage
# ---------------------------------------------------------------------------
# sas_creds_blob = ''
# input_dir = os.path.join('data', 'raw')
# copy_in_command = ('./azcopy' + ' ' + sas_creds_blob + ' ' +
#                    input_dir + ' --recursive')
# copy_in_command_win = ('azcopy copy' + ' ' + sas_creds_blob + ' ' +
#                        input_dir + ' --recursive')
# print(copy_in_command)
# ---------------------------------------------------------------------------
# Find the avros and say where to store created csvs
# ---------------------------------------------------------------------------
# glob returns paths in whatever order the filesystem lists them; sorting
# fixes the concatenation order so that the "keep first" deduplication below
# gives the same result on every machine
avro_paths = sorted(
    glob.glob(os.path.join("data", "raw", "*_MakingTextCount*", "*.avro"))
)
if not avro_paths:
    # pd.concat would otherwise fail with "No objects to concatenate", which
    # says nothing about where the input was expected to be
    raise ValueError(
        "No .avro files found matching "
        + os.path.join("data", "raw", "*_MakingTextCount*", "*.avro")
    )
# ---------------------------------------------------------------------------
# Process raw data into source per csv files
# ---------------------------------------------------------------------------
# Combine all avro files
df = pd.concat([process_avro_file(x) for x in avro_paths], axis=0)
# Time order and set time as index. mergesort is a stable sort, so rows with
# identical timestamps keep their concatenation order and the duplicate that
# survives "keep first" is well defined.
df = df.set_index("date")
df = df.sort_index(kind="mergesort")
df = df.drop_duplicates(subset=["ID"], keep="first")
# Create combined text and title field - note that snippet is first para
# of main article
# Replacing NaNs with empty strings first stops a NaN in any one of the
# three columns from turning the whole combined text into NaN
cols_to_str_rem_nans = ["title", "snippet", "text"]
for col in cols_to_str_rem_nans:
    df[col] = df[col].fillna("")
df["text"] = df["title"] + " " + df["snippet"] + " " + df["text"]
# Second pass: drop articles whose combined text is identical even though
# their IDs differ
df = df.drop_duplicates(subset=["text"], keep="first")
# Write out to file
unique_source_names = df["source_code"].unique()
for source in unique_source_names:
    # File name is the source code right-padded with "0" to four characters
    # and cut to four characters
    csv_name = source.ljust(4, "0")[:4] + ".csv"
    # Note that when viewed in Excel CSV viewer, this output file
    # may look broken up. But when read back in to pandas,
    # it will read okay.
    df.loc[df["source_code"] == source].to_csv(
        os.path.join("data", "raw", csv_name),
        quoting=csv.QUOTE_NONNUMERIC,
    )
# ---------------------------------------------------------------------------
# Copy processed data back onto blob storage
# ---------------------------------------------------------------------------
# # Note that this will copy the entire directory
# sas_creds_fileshare = ''
# copy_out_command = ('./azcopy * ' + local_csv_directory +
#                     ' ' + sas_creds_fileshare)
# copy_out_command_win = ('azcopy copy * ' + local_csv_directory +
#                         ' ' + sas_creds_fileshare)
# print(copy_out_command)
