Python read xml file stack overflow

I have an XML file like this:

<?xml version="1.0" encoding="UTF-8"?>
<Automation_Config>
    <Path>
        <Log>.\SERVER.log</Log>
        <Flag_Path>.\Flag</Flag_Path>
        <files>.\PO</files>
    </Path>

</Automation_Config>

I want to read the XML file, get the text of each of these elements, and assign each value to a variable.

I tried this, but I cannot get the text of the Log element.

import xml.dom.minidom
def main():
    """Parse D:/Config.xml with minidom and print three things about it.

    Printed, in order: the node name of the parsed document, the tag name
    of its first child, and the result of looking up "Log" elements.
    """
    document = xml.dom.minidom.parse('D:/Config.xml')
    root_element = document.firstChild

    print(document.nodeName)
    print(root_element.tagName)

    # getElementsByTagName() gives back the collection of matching nodes,
    # so this prints that collection, not the text stored inside <Log>.
    log_nodes = document.getElementsByTagName("Log")
    print(log_nodes)


main()

mzjn

46.5k11 gold badges122 silver badges237 bronze badges

asked Aug 5, 2019 at 3:34

3

Use ElementTree:

import xml.etree.ElementTree as ET
# Load Config.xml from the current directory and print the list of <Log>
# elements found at any depth below the root ('.//' searches all levels).
tree = ET.parse('Config.xml')
root = tree.getroot()
log_elements = root.findall('.//Log')
print(log_elements)

Output:

pawel@pawel-XPS-15-9570:~/test$ python parse_xml.py 
[<Element 'Log' at 0x7fb3f2eee9f

answered Aug 5, 2019 at 7:00

pawelbylinapawelbylina

1,27710 silver badges15 bronze badges

Below:

import xml.etree.ElementTree as ET
# Raw string literal: the Windows-style paths contain backslashes followed by
# S, F and P, which are not valid escape sequences in a normal string and
# trigger an "invalid escape sequence" warning. The resulting text is the same.
xml = r'''<?xml version="1.0" encoding="UTF-8"?>
<Automation_Config>
    <Path>
        <Log>.\SERVER.log</Log>
        <Flag_Path>.\Flag</Flag_Path>
        <files>.\PO</files>
    </Path>

</Automation_Config>'''

# Parse the document straight from the in-memory string.
root = ET.fromstring(xml)

# './/Log' matches <Log> elements at any depth; .text is the element's content.
for idx, log_element in enumerate(root.findall('.//Log')):
    print('{}) Log value: {}'.format(idx, log_element.text))

output

0) Log value: .\SERVER.log

answered Aug 5, 2019 at 8:07

baldermanbalderman

21.5k6 gold badges31 silver badges45 bronze badges

I have a 60gb+ XML file and, as you can see, I am using a Python script to extract the data and execute 'INSERT' statements to update my database.

Given that the file is so large, will I run into speed issues once everything has been migrated to the database? Or would I need to add more relationships between the tables to make the schema more cohesive?

To note: the XML file includes all StackOverflow Questions and responses since 2008.

import sqlite3
from xml.etree.ElementTree import iterparse


# Takes the path of my XML file, parses line-by-line, returns a
# dictionary of a single posting that is checked and inserted into
# 1 of 3 tables and repeats 100 million + times for each posting

# Column order of each destination table. Each tuple must list the columns in
# the same order as the matching CREATE TABLE statement in _SCHEMA.
_POSTS_WITH_ANSWERS_COLUMNS = (
    'Id',
    'PostTypeId',
    'AcceptedAnswerId',
    'CreationDate',
    'Score',
    'ViewCount',
    'Body',
    'OwnerUserId',
    'OwnerDisplayName',
    'LastEditorUserId',
    'LastEditorDisplayName',
    'LastEditDate',
    'LastActivityDate',
    'Title',
    'Tags',
    'AnswerCount',
    'CommentCount',
    'FavoriteCount',
    'ClosedDate',
    'CommunityOwnedDate',
)

_POSTS_WITHOUT_ANSWERS_COLUMNS = (
    'Id',
    'PostTypeId',
    'CreationDate',
    'Score',
    'ViewCount',
    'Body',
    'OwnerUserId',
    'OwnerDisplayName',
    'LastEditorUserId',
    'LastEditorDisplayName',
    'LastEditDate',
    'LastActivityDate',
    'Title',
    'Tags',
    'AnswerCount',
    'CommentCount',
    'FavoriteCount',
    'ClosedDate',
    'CommunityOwnedDate',
)

_RESPONSES_COLUMNS = (
    'Id',
    'PostTypeId',
    'ParentId',
    'CreationDate',
    'Score',
    'Body',
    'OwnerUserId',
    'OwnerDisplayName',
    'LastEditorUserId',
    'LastEditorDisplayName',
    'LastEditDate',
    'LastActivityDate',
    'CommentCount',
    'CommunityOwnedDate',
)

_TABLE_COLUMNS = {
    'postsWithAnswers': _POSTS_WITH_ANSWERS_COLUMNS,
    'postsWithOutAnswers': _POSTS_WITHOUT_ANSWERS_COLUMNS,
    'responses': _RESPONSES_COLUMNS,
}

# The three tables:
#   postsWithAnswers    - questions that carry an AcceptedAnswerId
#   postsWithOutAnswers - questions without an AcceptedAnswerId
#   responses           - every other post (see _table_for)
_SCHEMA = '''
    CREATE TABLE postsWithAnswers(
        Id integer primary key,
        PostTypeId,
        AcceptedAnswerId,
        CreationDate,
        Score,
        ViewCount,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        Title,
        Tags,
        AnswerCount,
        CommentCount,
        FavoriteCount,
        ClosedDate,
        CommunityOwnedDate,
        FOREIGN KEY(AcceptedAnswerId) REFERENCES responses(Id)
    );

    CREATE TABLE postsWithOutAnswers(
        Id integer primary key,
        PostTypeId,
        CreationDate,
        Score,
        ViewCount,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        Title,
        Tags,
        AnswerCount,
        CommentCount,
        FavoriteCount,
        ClosedDate,
        CommunityOwnedDate
    );

    CREATE TABLE responses(
        Id integer primary key,
        PostTypeId,
        ParentId,
        CreationDate,
        Score,
        Body,
        OwnerUserId,
        OwnerDisplayName,
        LastEditorUserId,
        LastEditorDisplayName,
        LastEditDate,
        LastActivityDate,
        CommentCount,
        CommunityOwnedDate,
        FOREIGN KEY(ParentId) REFERENCES postsWithAnswers(Id)
    );
'''


def _insert_statement(table, columns):
    """Build a parameterized INSERT for *table* with one '?' per column."""
    return 'INSERT INTO {} ({}) VALUES ({})'.format(
        table, ', '.join(columns), ', '.join('?' * len(columns)))


def _table_for(attributes):
    """Return the name of the table a post with these attributes belongs to."""
    if int(attributes['PostTypeId']) == 1:
        if 'AcceptedAnswerId' in attributes:
            return 'postsWithAnswers'
        return 'postsWithOutAnswers'
    # Every PostTypeId other than 1 is stored as a response.
    return 'responses'


def parse_and_move(filename, path, *, db_path='parents2.db',
                   commit_every=100000):
    """Stream an XML file of posts into three SQLite tables.

    The file is read incrementally with ``iterparse``. Every element below
    the root that has at least one attribute is treated as one post and is
    inserted into ``postsWithAnswers``, ``postsWithOutAnswers`` or
    ``responses`` depending on its ``PostTypeId`` and on whether it has an
    ``AcceptedAnswerId``. Attributes missing from a post are stored as NULL;
    attributes that are not table columns are ignored.

    Rows are committed in batches of ``commit_every`` instead of one commit
    per row, and the running row count is printed after each commit. If an
    error interrupts the run, the rows of the unfinished batch are not
    committed.

    Args:
        filename: Path of the XML file to read.
        path: Unused; kept so existing calls keep working.
        db_path: SQLite database file to create the tables in.
        commit_every: Number of inserted rows per transaction (at least 1).

    Raises:
        ValueError: If ``commit_every`` is smaller than 1.
        KeyError: If an element with attributes has no ``PostTypeId``.
        sqlite3.OperationalError: If the tables already exist in ``db_path``.
    """
    if commit_every < 1:
        raise ValueError('commit_every must be at least 1')

    statements = {
        table: (_insert_statement(table, columns), columns)
        for table, columns in _TABLE_COLUMNS.items()
    }

    doc = iterparse(filename, ('start', 'end'))
    # The first event is the 'start' of the root element. Keep the root so
    # that already-processed children can be dropped from it below.
    _, root = next(doc)

    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()
        cur.executescript(_SCHEMA)

        inserted = 0  # number of records added to the database so far
        for event, xml_element in doc:
            if event == 'end':
                # Detach finished elements from the root; otherwise the whole
                # document accumulates in memory while streaming.
                root.clear()
                continue

            # Attributes are already populated when 'start' is reported.
            attributes = xml_element.attrib
            if not attributes:
                continue

            sql, columns = statements[_table_for(attributes)]
            cur.execute(sql, [attributes.get(column) for column in columns])

            inserted += 1
            if inserted % commit_every == 0:
                con.commit()
                print(inserted)

        # Commit whatever is left over from the last, partial batch.
        if inserted % commit_every:
            con.commit()
            print(inserted)
    finally:
        con.close()




# Run the import only when this file is executed as a script, so that
# importing the module does not start processing the dump.
if __name__ == "__main__":
    parse_and_move('/Users/BR05URF/Downloads/stackexchange/Posts.xml', 'row/row')