I have an XML file like this: <?xml version="1.0" encoding="UTF-8"?>
<Automation_Config>
<Path>
<Log>.\SERVER.log</Log>
<Flag_Path>.\Flag</Flag_Path>
<files>.\PO</files>
</Path>
</Automation_Config>
I want to read the XML file, get those elements, and assign them to variables. I tried the following, but I cannot get the Log element's value. import xml.dom.minidom
def main(config_path='D:/Config.xml'):
    """Parse an Automation_Config XML file and return the <Log> text.

    :param config_path: path of the config file to read; defaults to the
        original hard-coded location so existing ``main()`` calls still work.
    :return: text content of the first <Log> element, or ``None`` when the
        document has no <Log> element.
    """
    dom = xml.dom.minidom.parse(config_path)
    print(dom.nodeName)            # '#document'
    print(dom.firstChild.tagName)  # root element name
    log_nodes = dom.getElementsByTagName("Log")
    # getElementsByTagName returns a NodeList of Element objects; printing it
    # directly only shows reprs.  Read the element's text child instead.
    log_value = log_nodes[0].firstChild.data if log_nodes else None
    print(log_value)
    return log_value
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
mzjn 46.5k11 gold badges122 silver badges237 bronze badges asked Aug 5, 2019 at 3:34 3 Use ElementTree: import xml.etree.ElementTree as ET
# Parse the configuration file from disk and list every <Log> element
# found anywhere in the document tree.
tree = ET.parse('Config.xml')
root = tree.getroot()
log_nodes = root.findall('.//Log')
print(log_nodes)
Output: pawel@pawel-XPS-15-9570:~/test$ python parse_xml.py
[<Element 'Log' at 0x7fb3f2eee9f0>]
answered Aug 5, 2019 at 7:00 pawelbylinapawelbylina 1,27710 silver badges15 bronze badges
Below: import xml.etree.ElementTree as ET
# Sample Automation_Config document embedded directly in the script.
xml = '''<?xml version="1.0" encoding="UTF-8"?>
<Automation_Config>
<Path>
<Log>.\SERVER.log</Log>
<Flag_Path>.\Flag</Flag_Path>
<files>.\PO</files>
</Path>
</Automation_Config>'''

# Parse the in-memory string, then report the text of every <Log> element.
root = ET.fromstring(xml)
report_lines = ['{}) Log value: {}'.format(index, node.text)
                for index, node in enumerate(root.findall('.//Log'))]
for line in report_lines:
    print(line)
Output: 0) Log value: .\SERVER.log
answered Aug 5, 2019 at 8:07
baldermanbalderman 21.5k6 gold badges31 silver badges45 bronze badges I have a 60gb+ XML file and, as you can see, I am using a Python script to extract the data and execute 'INSERT' statements to update my database. Being that the file is so large, will I run into speed issues once all is migrated to the database? Or would I need to add more relationships between tables to make the schema more cohesive? To note: the XML file includes all StackOverflow Questions and responses since 2008. import sqlite3
from xml.etree.ElementTree import iterparse
# Streams a Stack Overflow Posts.xml dump row by row and routes each
# posting into 1 of 3 SQLite tables; repeats for 100 million+ postings.
def parse_and_move(filename, path):
    """Stream-parse a Stack Overflow ``Posts.xml`` dump into ``parents2.db``.

    Routing rules, applied to each ``<row>`` element's attributes:

    * ``postsWithAnswers``    -- questions (``PostTypeId`` == 1) that carry
      an ``AcceptedAnswerId`` attribute.
    * ``postsWithOutAnswers`` -- questions without an accepted answer.
    * ``responses``           -- every other posting (answers, etc.).

    :param filename: path of the XML dump to parse.
    :param path: unused; retained so existing callers keep working.
    """
    # Column order for each table, hoisted out of the parsing loop so they
    # are built once instead of once per row (100M+ iterations).
    answered_cols = (
        'Id', 'PostTypeId', 'AcceptedAnswerId', 'CreationDate', 'Score',
        'ViewCount', 'Body', 'OwnerUserId', 'OwnerDisplayName',
        'LastEditorUserId', 'LastEditorDisplayName', 'LastEditDate',
        'LastActivityDate', 'Title', 'Tags', 'AnswerCount', 'CommentCount',
        'FavoriteCount', 'ClosedDate', 'CommunityOwnedDate',
    )
    unanswered_cols = (
        'Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount', 'Body',
        'OwnerUserId', 'OwnerDisplayName', 'LastEditorUserId',
        'LastEditorDisplayName', 'LastEditDate', 'LastActivityDate',
        'Title', 'Tags', 'AnswerCount', 'CommentCount', 'FavoriteCount',
        'ClosedDate', 'CommunityOwnedDate',
    )
    response_cols = (
        'Id', 'PostTypeId', 'ParentId', 'CreationDate', 'Score', 'Body',
        'OwnerUserId', 'OwnerDisplayName', 'LastEditorUserId',
        'LastEditorDisplayName', 'LastEditDate', 'LastActivityDate',
        'CommentCount', 'CommunityOwnedDate',
    )
    # Pre-built INSERT statements, one '?' placeholder per column.
    sql_answered = 'INSERT INTO postsWithAnswers VALUES ({})'.format(
        ','.join('?' * len(answered_cols)))
    sql_unanswered = 'INSERT INTO postsWithOutAnswers VALUES ({})'.format(
        ','.join('?' * len(unanswered_cols)))
    sql_response = 'INSERT INTO responses VALUES ({})'.format(
        ','.join('?' * len(response_cols)))

    con = sqlite3.connect('parents2.db')
    cur = con.cursor()
    # Below are the 3 tables: questions with accepted answers, questions
    # without, and all response posts.  IF NOT EXISTS lets an interrupted
    # migration be restarted without dying on the CREATE TABLE statements.
    cur.executescript('''
    CREATE TABLE IF NOT EXISTS postsWithAnswers(
        Id integer primary key,
        PostTypeId,
        AcceptedAnswerId,
        CreationDate,
        Score,
        ViewCount,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        Title,
        Tags,
        AnswerCount,
        CommentCount,
        FavoriteCount,
        ClosedDate,
        CommunityOwnedDate,
        FOREIGN KEY(AcceptedAnswerId) REFERENCES responses(Id)
    );
    CREATE TABLE IF NOT EXISTS postsWithOutAnswers(
        Id integer primary key,
        PostTypeId,
        CreationDate,
        Score,
        ViewCount,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        Title,
        Tags,
        AnswerCount,
        CommentCount,
        FavoriteCount,
        ClosedDate,
        CommunityOwnedDate
    );
    CREATE TABLE IF NOT EXISTS responses(
        Id integer primary key,
        PostTypeId,
        ParentId,
        CreationDate,
        Score,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        CommentCount,
        CommunityOwnedDate,
        FOREIGN KEY(ParentId) REFERENCES postsWithAnswers(Id)
    );
    ''')

    doc = iterparse(filename, ('start', 'end'))
    # Keep a handle on the root element so finished <row> children can be
    # detached -- otherwise the whole 60GB+ tree accumulates in memory.
    _, root = next(doc)
    i = 0  # number of records inserted so far
    for event, xml_element in doc:
        # Act on 'end' events only: the element is complete and safe to free.
        if event != 'end':
            continue
        attrs = xml_element.attrib
        if attrs:
            if int(attrs['PostTypeId']) == 1:
                if 'AcceptedAnswerId' in attrs:
                    cur.execute(sql_answered,
                                tuple(attrs.get(c) for c in answered_cols))
                else:
                    cur.execute(sql_unanswered,
                                tuple(attrs.get(c) for c in unanswered_cols))
            else:
                # Anything that is not a question is treated as a response.
                cur.execute(sql_response,
                            tuple(attrs.get(c) for c in response_cols))
            i += 1
            # Commit in batches: one commit (fsync) per row would make a
            # 100M-row load crawl.
            if i % 10000 == 0:
                con.commit()
                print(i)
        # Drop the processed subtree so memory stays flat while streaming.
        root.clear()
    con.commit()  # flush the final partial batch
    print(i)
    con.close()
# Guard the entry point so importing this module does not kick off the
# multi-hour migration; only direct script execution does.
if __name__ == '__main__':
    parse_and_move('/Users/BR05URF/Downloads/stackexchange/Posts.xml', 'row/row')
|