FASTA is a widely used text format for DNA, RNA, and protein sequence databases. Here is an example from the yeast protein database:
>YAL069W YAL069W SGDID:S000002143, Chr I from 335-649, Genome Release 64-3-1, Dubious ORF, "Dubious open reading frame"
ATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCAT
ACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCAT
CTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCA
GTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGC
CATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTC
CCAAATATTGTATAA
>YAL068W-A YAL068W-A SGDID:S000028594, Chr I from 538-792, Genome Release 64-3-1, Dubious ORF, "Dubious open reading frame"
ATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCAT
TATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTT
AATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTAT
GTCAATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTC
AACAATAATACATAA
Below is code that converts a FASTA file into a Python dictionary.
#!/usr/bin/python3
import sys
# Path of the input FASTA file, read from the first command-line argument.
# NOTE: this runs at module level, so it raises IndexError when the script
# is started (or the module is imported) without an argument.
file_name = str(sys.argv[1])
def remove_empty_lines(input_file):
    """Return the lines of a text file that are not blank.

    Args:
        input_file: Path of the file to read.

    Returns:
        A list with the file's lines in their original order, skipping every
        line that is empty or contains only whitespace. The kept lines are
        returned unchanged, including their trailing newline.
    """
    with open(input_file, 'r') as infile:
        # Iterate the file handle directly so that only the kept lines are
        # held in memory, instead of loading every line first.
        return [line for line in infile if line.strip()]
def Fasta2Dict(file_name):
    """Parse a FASTA file into a dictionary keyed by record identifier.

    A record starts at a header line beginning with ``>`` and runs until the
    next header. Its identifier is the first whitespace-delimited word of the
    header (without the ``>``); its sequence is the concatenation of the
    following lines with surrounding whitespace stripped.

    Args:
        file_name: Path of the FASTA file to read.

    Returns:
        A dict of the form ``{gene_name: {'seq': sequence, 'length': n}}``
        where ``n`` is ``len(sequence)``. The dict is empty when the file
        contains no header line. If an identifier occurs more than once, the
        last record with that identifier wins.
    """
    Gene_dict = {}
    # None means "no header seen yet". A real sentinel is used instead of a
    # magic name so that no legitimate identifier can collide with it and an
    # empty file does not produce a placeholder entry.
    gene_name = None
    # Sequence lines are collected in a list and joined once per record,
    # which avoids repeated string concatenation on long sequences.
    seq_parts = []

    # Blank lines are already filtered out by remove_empty_lines().
    for line in remove_empty_lines(file_name):
        line = line.strip()
        if line.startswith('>'):
            if gene_name is not None:
                seq = ''.join(seq_parts)
                Gene_dict[gene_name] = {'seq': seq, 'length': len(seq)}
            # Split on any whitespace after dropping '>' so that "> id ..."
            # and tab-separated headers still yield the identifier.
            header_fields = line[1:].split(None, 1)
            gene_name = header_fields[0] if header_fields else ''
            seq_parts = []
        elif gene_name is not None:
            seq_parts.append(line)
        # Sequence lines that appear before the first header belong to no
        # record and are ignored.

    if gene_name is not None:
        seq = ''.join(seq_parts)
        Gene_dict[gene_name] = {'seq': seq, 'length': len(seq)}
    return Gene_dict
# Script entry point: parse the FASTA file named by file_name and print the
# resulting dictionary. The guard keeps this from running when the module is
# imported for its functions.
# Hint: for a quick manual test, point file_name at a local file such as
# 'test.fasta'.
if __name__ == "__main__":
    result = Fasta2Dict(file_name)
    print(result)