Data prep
In [ ]:
Copied!
import pandas as pd
import glob
import pandas as pd
import glob
In [ ]:
Copied!
# Collect every ball-by-ball commentary CSV.
# (The scraped notebook duplicated this cell; run it once.)
comm_paths = glob.glob("data/Commentaries/*.csv")
len(comm_paths)
In [ ]:
Copied!
# Collect every match-report text file.
# (The scraped notebook duplicated this cell; run it once.)
report_paths = glob.glob("data/Reports/*.txt")
len(report_paths)
In [ ]:
Copied!
import os

# Extract the numeric match id from each filename so commentaries and
# reports can be paired up.
#
# NOTE(fix): the original split on "\\", which only works on Windows.
# On POSIX the whole path survived, and for reports
# "data/Reports/report12.txt".replace("report", "") also ate "report"
# out of the directory name, so int() raised ValueError.
# os.path.basename is portable.  (Cell was also duplicated by the scrape.)
comm_fno = []
for path in comm_paths:
    # Commentary files look like "<...>_<match-no>.csv".
    stem = os.path.basename(path).split(".")[0]
    comm_fno.append(int(stem.split("_")[-1]))

report_fno = []
for path in report_paths:
    # Report files look like "report<match-no>.txt".
    stem = os.path.basename(path).split(".")[0]
    report_fno.append(int(stem.replace("report", "")))

# Sets so the intersection below is cheap.
comm_fno = set(comm_fno)
report_fno = set(report_fno)
In [ ]:
Copied!
# Only matches with BOTH a commentary and a report are usable.
# (Duplicated cell from the scrape collapsed to one statement.)
to_read_fno = list(comm_fno.intersection(report_fno))
In [ ]:
Copied!
import os

# Keep only the paths whose match id is in the usable intersection.
# Hoist the ids into a set once: `fno in list` inside the loop was O(n)
# per lookup.  os.path.basename replaces the Windows-only "\\" split
# (see the id-extraction cell above).  The scrape had duplicated this cell.
_wanted_fno = set(to_read_fno)

comm_read_paths = []
for path in comm_paths:
    stem = os.path.basename(path).split(".")[0]
    if int(stem.split("_")[-1]) in _wanted_fno:
        comm_read_paths.append(path)

report_read_paths = []
for path in report_paths:
    stem = os.path.basename(path).split(".")[0]
    if int(stem.replace("report", "")) in _wanted_fno:
        report_read_paths.append(path)
In [ ]:
Copied!
# Sanity check: one commentary path per report path.
len(comm_read_paths) == len(report_read_paths)
In [ ]:
Copied!
from transformers import AutoTokenizer

# flan-t5-large tokenizer, used below to measure sequence token lengths.
# (Duplicated cell from the scrape: load the tokenizer once.)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
In [ ]:
Copied!
def commentary_read(path):
    """Condense one ball-by-ball commentary CSV into a highlight string.

    Selection rules, per commentary line (case-insensitive):
      * lines with no "run": keep only the opening line and the final
        three lines (match intro / wrap-up);
      * run-scoring lines with no bye/wide/extra: keep every dismissal
        ("out" / "wicket"), and ~50% of the rest at random;
      * extras lines that mention a "six": keep ~25% at random.

    Parameters
    ----------
    path : str
        Path to a CSV with a "Data" column of commentary strings.

    Returns
    -------
    str
        The kept lines joined with single spaces.  Random sampling makes
        the result non-deterministic unless `random` is seeded.
    """
    import random  # FIX: `random` was used but never imported -> NameError

    lines = pd.read_csv(path)["Data"].to_list()
    tail_start = len(lines) - 3  # index where the closing lines begin
    kept = []
    for idx, line in enumerate(lines):
        low = line.lower()
        if "run" not in low:
            # Non-scoring narration: only the opener and the last three lines.
            if idx == 0 or idx >= tail_start:
                kept.append(line)
        elif " bye " not in low and " wide " not in low and "extra" not in low:
            if " out " in low:
                kept.append(line)
            elif "wicket" in low:
                kept.append(line)
            else:
                # Ordinary scoring delivery: keep roughly half.
                if random.choice([0, 1]) == 1:
                    kept.append(line)
        elif " six " in low:
            # Extras line mentioning a six: keep roughly a quarter.
            if random.choice([0, 1, 0, 0]) == 1:
                kept.append(line)
    return ' '.join(kept)
In [ ]:
Copied!
def report_read(path):
    """Return the full text of the match report at *path*.

    Opens with an explicit UTF-8 encoding so the result does not depend
    on the platform's default locale encoding.
    """
    with open(path, encoding="utf-8") as f:
        return f.read()
In [ ]:
Copied!
# Condense every commentary and load every report, kept in matching order.
# The scrape had duplicated this cell, which re-read every file twice;
# comprehensions replace the append loops.
final_commentaries = [commentary_read(p) for p in comm_read_paths]
final_reports = [report_read(p) for p in report_read_paths]
In [ ]:
Copied!
# One row per match: condensed commentary paired with its full report.
# (Duplicated cell collapsed to a single construction.)
final_df = pd.DataFrame({
    "Commentary_Highlights": final_commentaries,
    "Match_Report": final_reports,
})
final_df
In [ ]:
Copied!
# Persist the paired dataset; parquet for training, CSV for inspection.
# (The duplicated cell wrote each file twice; write them once.)
final_df.to_parquet("data/match_report_gen.parquet", index=False)
final_df.to_csv("data/match_report_gen.csv", index=False)
In [ ]:
Copied!
# Inspect the assembled dataframe.
final_df
In [ ]:
Copied!
# Token counts for source (commentary) and target (report) sequences,
# used to pick max input/output lengths for the model.
# (Duplicated cell tokenized every row twice; do it once.)
final_df["src_len"] = final_df["Commentary_Highlights"].map(lambda x: len(tokenizer(x)["input_ids"]))
final_df["tgt_len"] = final_df["Match_Report"].map(lambda x: len(tokenizer(x)["input_ids"]))
In [ ]:
Copied!
# Re-inspect with the new length columns.
final_df
In [ ]:
Copied!
# Eyeball the first condensed commentary string.
final_df["Commentary_Highlights"][0]
In [ ]:
Copied!
import seaborn as sns

# Distribution of source token lengths.  sns.distplot was deprecated in
# seaborn 0.11 and removed in 0.14; histplot with stat="density" and a
# KDE overlay is the documented replacement.
sns.histplot(final_df["src_len"], stat="density", kde=True)
In [ ]:
Copied!
# Summary statistics for the length columns.
final_df.describe()
In [ ]:
Copied!
import pandas as pd

# Load the pre-built T5 dataset (columns: input_text / target_text).
# NOTE(review): column names assumed from their use below — confirm
# against the spreadsheet.  (Duplicated cell read the file twice.)
df = pd.read_excel("data_new_T5.xlsx")
In [ ]:
Copied!
# Eyeball the first example of the Excel dataset.
df.iloc[0]
In [ ]:
Copied!
# Token counts for the Excel dataset, mirroring the columns computed for
# final_df above.  (Duplicated cell tokenized every row twice.)
df["src_len"] = df["input_text"].map(lambda x: len(tokenizer(x)["input_ids"]))
df["tgt_len"] = df["target_text"].map(lambda x: len(tokenizer(x)["input_ids"]))
In [ ]:
Copied!
# Summary statistics for the Excel dataset's length columns.
df.describe()
In [ ]:
Copied!