How to create a defaultdict from txt files?

130 views Asked by At

I want to create a defaultdict using data from two txt files.

The first txt file has the following pattern (word idf):

acceler             4.634728988229636 
accept              2.32949254591397 
access              3.0633909220278057 
accid               3.9512437185814275 
acclaim             4.634728988229636 

The second file has the following pattern (textnum word tf)

0097       about        0.07894736842105263 
0097        abus        0.02631578947368421 
0098      acceler       0.02631578947368421 
0098       across       0.02631578947368421 
0099      admonish      0.02631578947368421 
0099       after        0.05263157894736842

The dict I want to create must have the following structure:{textnum : {word : tf*idf}}.

Can you suggest any libraries?

EDIT

That's my new code.

from collections import defaultdict
tf_idf_dict = defaultdict(dict) 

def read_in(path):
    with open(path, "r") as r:
        l = r.readlines()
        return l


def tf_idf_calc(tf_file, idf_file, d):
    tf_line = [[item for item in line.split()] for line in read_in(tf_file)]
    idf_line = [[item for item in line.split()] for line in read_in(idf_file)]
    for line in tf_line:
        for row in idf_line:
            if line[1] == row[0]:
                d[line[0]][line[1]] = float(line[2]) * float(row[1])



    return d




def nested_to_defaultdict(d): #converts the nested dictionary to defaultdict
    if not isinstance(d, dict):
        return d
    return defaultdict(lambda: 0, {key: nested_to_defaultdict(value) for key, value in d.items()})

I have a smaller file which works just fine. But in the large file I get the following error.

line 15, in tf_idf_calc if line[1] == row[0]: IndexError: list index out of range

1

There are 1 answers

10
Dennis On

EDITED

Edited per your comments, this does what you want.

from collections import defaultdict
myDict = defaultdict(dict)

class ReadTxt:
    def __init__(self, path):
          self.path = path

    def __iter__(self):
        with open(self.path,mode='r') as reader:
            for line in reader:
                    yield line

three_col_file = ReadTxt("text2.txt")
two_col_file = ReadTxt("text1.txt")  

def builder():
    for a in three_col_file:
        textnum, word, tf = a.split()
        for b in two_col_file:
            wrd, idf = b.split()
            if wrd == word:
                subDict = myDict[textnum]
                subDict[word] = float(tf) * float(idf)
            
builder()

print(dict(myDict))

OUTPUT:

{'0098': {'acceler': 0.12196655232183251}}