Difference between revisions of "Narrative Website Import"

From Gramps
Jump to: navigation, search
(See also)
(6 intermediate revisions by 2 users not shown)
Line 1: Line 1:
{{out of date}}
+
{{languages|Narrative Website Import}}
{{languages}}
 
 
{{man warn|Outdated code|The Script below was written for the Gramps 2.2 NarrativeWeb and needs to be updated to work with a Gramps 3+ or higher versions.}}
 
{{man warn|Outdated code|The Script below was written for the Gramps 2.2 NarrativeWeb and needs to be updated to work with a Gramps 3+ or higher versions.}}
Use a '''Gramps-created [[Gramps_4.1_Wiki_Manual_-_Reports_-_part_7#Narrated_Web_Site|Narrative Website report]]''' to restore your Gramps database.
+
{{stub}}
 +
Use a '''Gramps-created [[Gramps_{{man version}}_Wiki_Manual_-_Reports_-_part_7#Narrated_Web_Site|Narrative Website report]]''' to restore your Gramps database.
  
The program below works by parsing the HTML website (also called "screen scrapping") and places the data into a [[Gramps_4.1_Wiki_Manual_-_Manage_Family_Trees:_CSV_Import_and_Export#Gramps_Spreadsheet_Import.2FExport|comma-separated value spreadsheet]]. You can then import it directly into Gramps.
+
The program below works by parsing the HTML website (also called "screen scrapping") and places the data into a [[Gramps_{{man version}}_Wiki_Manual_-_Manage_Family_Trees:_CSV_Import_and_Export#Gramps_Spreadsheet_Import.2FExport|comma-separated value spreadsheet]]. You can then import it directly into Gramps.
  
 
To run the program from the command line, provide the URL of the surname list, like:
 
To run the program from the command line, provide the URL of the surname list, like:
Line 210: Line 210:
  
 
==See also==
 
==See also==
Read the following discussion about this code at [http://comments.gmane.org/gmane.comp.genealogy.gramps.user/4986 Lost grdb] & [http://gramps.1791082.n4.nabble.com/Re-return-from-NAVWEB-to-GRAMPS-and-NOT-HIDE-td3780312.html#a3780390]
+
Read the following discussion about this code at [https://sourceforge.net/p/gramps/mailman/gramps-users/thread/20071126154619.GA41656%40eris.discordians.net/#msg11668864 Lost grdb(2007)] & [https://sourceforge.net/p/gramps/mailman/message/28021878/]
  
 
[[Category:Documentation]]
 
[[Category:Documentation]]
 
[[Category:Developers/General]]
 
[[Category:Developers/General]]

Revision as of 03:08, 28 October 2020

Gnome-important.png
Outdated code

The script below was written for the Gramps 2.2 NarrativeWeb and needs to be updated to work with Gramps 3 or later versions.

Gramps-notes.png

This article's content is incomplete or a placeholder stub.
Please update or expand this section.


Use a Gramps-created Narrative Website report to restore your Gramps database.

The program below works by parsing the HTML website (also called "screen scraping") and places the data into a comma-separated value spreadsheet. You can then import it directly into Gramps.

To run the program from the command line, provide the URL of the surname list, like:

 python getnarrative.py http://somewebsite.com/myfamily/ > import.csv

Then, in Gramps you should be able to import the file "import.csv" into an empty database.

Tango-Dialog-information.png
Code

Use the following code, getnarrative.py (the original link is gone and returns a 404), as a good starting point. (The script was written for the Gramps 2.2 NarrativeWeb and needs to be updated to work with Gramps 3 or later versions.)
# Python script for sucking a GRAMPS Narrative Website back into
# GRAMPS.

# By Doug Blank <[email protected]>
# License: GPL
# (c) 2007


import os, sys, urllib, re

# Progress counter: loadPerson prints it and increments it once per person
# page.  The output section at the bottom of the script reuses the name as
# the running family number.
count = 0
# Person records keyed by page handle.  The None -> None entry makes a lookup
# of a missing (None) spouse handle yield None; the CSV output loop skips it.
person = {None: None}
# (father, mother) -> list of handles recorded for that family.
family = {}
# Sorted (person handle, spouse handle) tuple -> dict describing the
# relationship ("me", plus "type" and "spouse" when those cells were seen).
family_pair = {}
# (person handle, text of the first cell in the section) -> list of
# (cell class, cell text) pairs from that person's "Events" section.
# Filled by loadPerson; not written to the CSV output in the code shown.
event = {}

def _extractHandle(text):
    """Return the page handle embedded in link text, or None if there is none.

    The handle is whatever sits between the last "/" of ``text`` and the
    first ".html" that follows it.  Text without a "/" or without ".html"
    after the last "/" is not treated as a link.
    """
    if "/" not in text:
        return None
    tail = text.rsplit("/", 1)[1]
    if ".html" not in tail:
        return None
    return tail.split(".html", 1)[0]


def _personRecord(surname, firstname, suffix, children, pairs):
    """Build the dict stored in ``person`` for one individual.

    ``children`` is stored by reference (not copied) so that child handles
    appended to the list later are visible through the record.
    """
    pdata = {"surname": surname,
             "firstname": firstname,
             "children": children,
             "suffix": suffix}
    # A "field" cell is a label; the cell that follows it holds the value.
    # Pairing with zip() means a trailing label without a value is ignored
    # instead of raising IndexError.
    for current, following in zip(pairs, pairs[1:]):
        if current[0] == "field":
            pdata[current[1]] = following[1]
    return pdata


def _recordSection(state, pairs, handle, surname, firstname, children):
    """Store the cells collected under the heading ``state``.

    ``pairs`` is the list of (cell class, cell text) tuples gathered since
    that heading.  Depending on the heading text the data goes into the
    module-level ``family``, ``family_pair``, ``event`` or ``person`` table.
    """
    if state == "Parents":
        father, mother = None, None
        # The cell after a "Father"/"Mother" label holds that parent.
        for current, following in zip(pairs, pairs[1:]):
            if current[1] == "Father":
                father = following[1]
            if current[1] == "Mother":
                mother = following[1]
        if father:
            father = father.replace("</a>", "")
            linked = _extractHandle(father)
            if linked is not None:
                father = linked
        if mother:
            mother = mother.replace("</a>", "")
            linked = _extractHandle(mother)
            if linked is not None:
                mother = linked
        family.setdefault((father, mother), []).append(handle)
    elif state == "Families":
        mdata = {"me": handle}
        mhandle = None
        for (key, value) in pairs:
            if key == "category":
                mdata["type"] = value
            elif key == "field":
                mdata["spouse"] = value
            elif key == "data":
                linked = _extractHandle(value.replace("</a>", ""))
                if linked is not None:
                    mhandle = linked
        # Sorting makes the key independent of which partner's page is read.
        family_pair[tuple(sorted([handle, mhandle]))] = mdata
    elif state == "Events":
        if pairs:  # an empty section has no first cell to key on
            event[(handle, pairs[0][1])] = pairs
    elif state.strip() == (firstname + " " + surname).strip():
        person[handle] = _personRecord(surname, firstname, "",
                                       children, pairs)
    elif state in ("Pedigree", "Ancestors", "Narrative"):
        pass  # nothing is extracted from these sections
    else:
        # The heading did not match the name exactly: whatever is left after
        # removing surname and first name is kept as the suffix.
        suffix = state.replace(surname, "").replace(firstname, "").strip()
        person[handle] = _personRecord(surname, firstname, suffix,
                                       children, pairs)


def loadPerson(url, surname, firstname):
    """Fetch one person page and record its data in the module-level tables.

    url -- page path appended to the global ``gurl``; the part after its
           last "/" (minus ".html") is used as the person's handle.
    surname, firstname -- the person's name as found on the surname page;
           used to recognise the heading of the person's own section.

    The page is read line by line.  ``<td class="field|data|category">``
    cells are collected until the next ``<hN>`` heading line, at which point
    the cells gathered under the previous heading are stored.  The cells
    after the last heading on the page are never stored.  Progress is
    printed to stderr.
    """
    global count
    junk, handle = url.rsplit("/", 1)
    handle = handle.replace(".html", "")
    print >> sys.stderr, "   ", count, surname, ", ", firstname
    count += 1
    pfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = pfp.read()
    finally:
        pfp.close()
    state = None  # text of the heading currently being collected
    pairs = []
    children = []
    for line in contents.split("\n"):
        cells = re.findall("""<td class="(.*?)">(.*?)</td>""", line)
        for key, data in cells:
            # Cells of any other class (e.g. "box") are ignored.
            if key in ("field", "data", "category"):
                pairs.append((key, data))
        if state == "Families" and line.startswith("<a href"):  # child?
            link = re.match("""<a href="(.*?)">(.*?)</a>.*""", line)
            if link:
                target = link.group(1)
                if "/ppl/" in target:
                    chandle = target.rsplit("/", 1)[1]
                    children.append(chandle.replace(".html", ""))
        elif "<h" in line:
            heading = re.match("<h.>(.*?)</h.>", line)
            if heading:
                if state is not None:
                    _recordSection(state, pairs, handle,
                                   surname, firstname, children)
                pairs = []
                state = heading.group(1)

def loadSurname(url, surname):
    """Fetch a surname page and load every person page it links to.

    url -- page path appended to the global ``gurl``.
    surname -- surname shared by the people listed on that page; passed on
               to loadPerson together with each link's text as first name.

    Only links that end in ".html" and contain "/ppl/" are followed.
    """
    sfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = sfp.read()
    finally:
        sfp.close()
    for line in contents.split("\n"):
        links = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
        for href, firstname in links:
            if href.endswith(".html") and "/ppl/" in href:
                # Split only at the first "/ppl/" so a path that contains
                # the marker more than once does not break the unpacking.
                purl = href.split("/ppl/", 1)[1]
                loadPerson("/ppl/" + purl, surname, firstname)


gurl = sys.argv[1] # URL of surnames
fp = urllib.urlopen(gurl) 
contents = fp.read() # read in website
for line in contents.split("\n"):
    list = re.findall("""<a href="(.*?)">(.*?)</a>""", line)
    for surnameURL in list:
        url, surname = surnameURL
        if url.endswith(".html") and url.startswith("srn"):
            print >> sys.stderr, "Processing surname", surname, "..."
            loadSurname(*surnameURL)

print "person,firstname,lastname,suffix,gender"
for h in person:
    if h:
        print '"%s","%s","%s","%s","%s"' % (h, person[h]["firstname"], 
                                            person[h]["surname"], 
                                            person[h]["suffix"], 
                                            person[h]["Gender"])

for fam in family_pair:
    data = family_pair[fam]
    h1, h2 = fam
    p1, p2 = None, None
    if h1 in person:
        p1 = person[h1]
    if h2 in person:
        p2 = person[h2]
    if p1 and p2:
        if p1["Gender"] == "male" and p2["Gender"] == "female":
            if (h1, h2) in family:
                family[(h1,h2)].append(data["me"])
            else:
                family[(h1,h2)] = [data["me"]]
        else:
            if (h2, h1) in family:
                family[(h2,h1)].append(data["me"])
            else:
                family[(h2,h1)] = [data["me"]]

print
print "marriage,parent1,parent2"
count = 1
marriage = {}
for pair in family:
    marriage[pair] = "F%04d" % count
    print '"%s","%s","%s"' % (marriage[pair], pair[0], pair[1])
    count += 1

print
print "family,child"
for pair in family:
    kids = family[pair]
    kids = set(kids)
    for kid in kids:
        if (kid != pair[0]) or (kid != pair[1]):
            print '"%s","%s"' % (marriage[pair], kid)


See also

Read the following discussion about this code at Lost grdb(2007) & [1]