Skip to content

Commit

Permalink
Merge pull request #24 from akb89/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
akb89 authored Aug 15, 2018
2 parents ae92520 + 48dce7e commit 9587ca5
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 27 deletions.
1 change: 1 addition & 0 deletions config/development.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const config = {
splitsDir: '/Users/AKB/Dropbox/FrameNetData/fndata-1.7',
importLexUnits: true,
importFullTexts: true,
importHierarchy: true,
frameChunkSize: 150,
lexUnitChunkSize: 200,
};
Expand Down
1 change: 1 addition & 0 deletions config/production.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const config = {
splitsDir: '/Users/AKB/Dropbox/FrameNetData/fndata-1.7',
importLexUnits: true,
importFullTexts: true,
importHierarchy: true,
frameChunkSize: 150,
lexUnitChunkSize: 200,
};
Expand Down
8 changes: 4 additions & 4 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "noframenet",
"version": "4.0.5",
"version": "4.1.0",
"description": "A set of scripts to import FrameNet XML data to a MongoDB database",
"keywords": [
"FrameNet",
Expand All @@ -17,7 +17,7 @@
"check": "NODE_ENV=production node ./scripts/check.js",
"checkdev": "NODE_ENV=development node ./scripts/check.js",
"import": "npm run clean && NODE_ENV=production node --max-old-space-size=8192 ./scripts/extract.js",
"importdev": "npm run cleandev && NODE_ENV=development node --max-old-space-size=8192 ./scripts/extract.js",
"importdev": "npm run cleandev && NODE_ENV=development node --max-old-space-size=8192 --trace-warnings ./scripts/extract.js",
"postimport": "npm run fix",
"postimportdev": "npm run fixdev",
"fix": "NODE_ENV=production node ./scripts/fix.js",
Expand All @@ -39,8 +39,8 @@
"ascii-progress": "^1.0.5",
"bluebird": "^3.5.0",
"jsonix": "^2.4.1",
"mongoose": "5.0.0",
"noframenet-core": "^5.0.4",
"mongoose": "^5.0.0",
"noframenet-core": "^5.4.1",
"winston": "^2.4.0"
},
"devDependencies": {
Expand Down
59 changes: 36 additions & 23 deletions scripts/extract.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ const Corpus = require('noframenet-core').Corpus;
const Document = require('noframenet-core').Document;
const Frame = require('noframenet-core').Frame;
const FrameElement = require('noframenet-core').FrameElement;
const FEHierarchy = require('noframenet-core').FEHierarchy;
const FrameHierarchy = require('noframenet-core').FrameHierarchy;
const FERelation = require('noframenet-core').FERelation;
const FrameRelation = require('noframenet-core').FrameRelation;
const FrameRelationType = require('noframenet-core').FrameRelationType;
Expand All @@ -16,6 +18,7 @@ const Sentence = require('noframenet-core').Sentence;
const ValenceUnit = require('noframenet-core').ValenceUnit;
const config = require('./../config');
const driver = require('./../db/mongoose');
const hierarchiesExtractor = require('./extraction/hierarchies');
const framesExtractor = require('./extraction/frames');
const fullTextsExtractor = require('./extraction/fullTexts');
const lexUnitsExtractor = require('./extraction/lexUnits');
Expand Down Expand Up @@ -91,29 +94,19 @@ function saveFramesDataToDatabase(framesMap, fesMap, lexUnitsMap, lexemes) {
]);
}

async function extractLexUnits(lexUnitDir, lexUnitChunkSize, annoSetsMap,
labels, patternsMap, sentencesMap,
valenceUnitsMap) {
await lexUnitsExtractor.extractLexUnits(lexUnitDir, lexUnitChunkSize,
annoSetsMap, labels, patternsMap,
sentencesMap, valenceUnitsMap);
logger.info('Done extracting lexUnits');
}

async function extractFullTexts(fullTextDir, annoSetsMap, corporaMap,
documentsMap, labels, patternsMap,
sentencesMap, valenceUnitsMap) {
await fullTextsExtractor.extractFullTexts(fullTextDir, annoSetsMap,
corporaMap, documentsMap, labels,
patternsMap, sentencesMap,
valenceUnitsMap);
logger.info('Done extracting fullTexts');
/**
 * Insert the extracted frame and FE hierarchies into their collections.
 *
 * Both inserts are started immediately and run concurrently; each one is
 * issued with `ordered: false`.
 *
 * @param {Map} frameHierarchyMap - its values are passed to the
 *   FrameHierarchy collection.
 * @param {Map} feHierarchyMap - its values are passed to the FEHierarchy
 *   collection.
 * @returns {Promise<Array>} settles once both inserts have settled
 *   (Promise.all semantics: rejects as soon as one insert rejects).
 */
function saveHierarchiesToDatabase(frameHierarchyMap, feHierarchyMap) {
  // A fresh options object is built per call, exactly as two separate
  // literals would be.
  const insertValues = (model, hierarchyMap) => model.collection.insertMany(
    [...hierarchyMap.values()],
    { ordered: false },
  );
  return Promise.all([
    insertValues(FrameHierarchy, frameHierarchyMap),
    insertValues(FEHierarchy, feHierarchyMap),
  ]);
}

async function importFrameNetData(dbUri, lexUnitDir, lexUnitChunkSize,
frameDir, frameChunkSize, fullTextDir,
relationsFilePath, semTypesFilePath,
importLexUnits, importFullTexts) {
importLexUnits, importFullTexts,
importHierarchy) {
await driver.connectToDatabase(dbUri);

// Maps are for unique documents
Expand All @@ -122,6 +115,8 @@ async function importFrameNetData(dbUri, lexUnitDir, lexUnitChunkSize,
const documentsMap = new Map();
const framesMap = new Map();
const fesMap = new Map();
const frameHierarchyMap = new Map();
const feHierarchyMap = new Map();
const lexUnitsMap = new Map();
const patternsMap = new Map();
const sentencesMap = new Map();
Expand Down Expand Up @@ -157,14 +152,28 @@ async function importFrameNetData(dbUri, lexUnitDir, lexUnitChunkSize,
await saveRelationsAndSemTypesToDatabase(feRelations, frameRelations,
frameRelationTypes, semTypes);

if (importHierarchy) {
hierarchiesExtractor.extractHierarchies(feRelations, frameRelations,
frameRelationTypes, framesMap,
fesMap, frameHierarchyMap,
feHierarchyMap);
await saveHierarchiesToDatabase(frameHierarchyMap, feHierarchyMap);
logger.info('Done extracting hierarchy');
}

if (importLexUnits) {
await extractLexUnits(lexUnitDir, lexUnitChunkSize, annoSetsMap,
labels, patternsMap, sentencesMap, valenceUnitsMap);
await lexUnitsExtractor.extractLexUnits(lexUnitDir, lexUnitChunkSize,
annoSetsMap, labels, patternsMap,
sentencesMap, valenceUnitsMap);
logger.info('Done extracting lexUnits');
}

if (importFullTexts) {
await extractFullTexts(fullTextDir, annoSetsMap, corporaMap, documentsMap,
labels, patternsMap, sentencesMap, valenceUnitsMap);
await fullTextsExtractor.extractFullTexts(fullTextDir, annoSetsMap,
corporaMap, documentsMap, labels,
patternsMap, sentencesMap,
valenceUnitsMap);
logger.info('Done extracting fullTexts');
}

logger.info('Saving data to database. This can take several minutes...');
Expand All @@ -176,6 +185,8 @@ async function importFrameNetData(dbUri, lexUnitDir, lexUnitChunkSize,
logger.verbose(` documentsMap.size = ${documentsMap.size}`);
logger.verbose(` fesMap.size = ${fesMap.size}`);
logger.verbose(` framesMap.size = ${framesMap.size}`);
logger.verbose(` frameHierarchyMap.size = ${frameHierarchyMap.size}`);
logger.verbose(` feHierarchyMap.size = ${feHierarchyMap.size}`);
logger.verbose(` labels.length = ${labels.length}`);
logger.verbose(` lexemes.length = ${lexemes.length}`);
logger.verbose(` lexUnitsMap.size = ${lexUnitsMap.size}`);
Expand All @@ -190,6 +201,7 @@ if (require.main === module) {
const dbUri = config.dbUri;
const importLexUnits = config.importLexUnits;
const importFullTexts = config.importFullTexts;
const importHierarchy = config.importHierarchy;
const lexUnitDir = path.join(config.splitsDir, 'lu');
const lexUnitChunkSize = config.lexUnitChunkSize;
const frameDir = path.join(config.frameNetDir, 'frame');
Expand All @@ -199,6 +211,7 @@ if (require.main === module) {
const semTypesFilePath = path.join(config.frameNetDir, 'semTypes.xml');
importFrameNetData(dbUri, lexUnitDir, lexUnitChunkSize, frameDir,
frameChunkSize, fullTextDir, relationsFilePath,
semTypesFilePath, importLexUnits, importFullTexts)
semTypesFilePath, importLexUnits, importFullTexts,
importHierarchy)
.then(() => logger.info(`FrameNet data import completed in ${process.hrtime(startTime)[0]}s`));
}
152 changes: 152 additions & 0 deletions scripts/extraction/hierarchies.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
const FEHierarchy = require('noframenet-core').FEHierarchy;
const FrameHierarchy = require('noframenet-core').FrameHierarchy;
const FRTYPE_NAMES_FOR_FRAME_HIERARCHY = require('../../utils/constants').FRTYPE_NAMES_FOR_FRAME_HIERARCHY;
const FRTYPE_NAMES_FOR_FE_HIERARCHY = require('../../utils/constants').FRTYPE_NAMES_FOR_FE_HIERARCHY;
const config = require('./../../config');

const logger = config.logger;

/**
 * Recursively collect the relatives reachable from `itemName` as a nested
 * list of `{ name, [type]: [...] }` entries.
 *
 * @param {*} itemName - key whose direct relatives are looked up.
 * @param {Map} childrenParentsMap - maps a name to the collection of its
 *   direct relatives (parents or children, depending on the caller).
 * @param {string} type - property under which the nested relatives of each
 *   entry are stored (callers in this file pass 'parents' or 'children').
 * @param {Set} visitedSet - names already emitted. It is mutated in place and
 *   shared across the whole recursion, so a name appears at most once in the
 *   returned tree and cycles terminate.
 * @returns {Array<Object>} one entry per direct relative not yet visited;
 *   empty when `itemName` has no entry in the map.
 */
function getRelatives(itemName, childrenParentsMap, type, visitedSet) {
  if (!childrenParentsMap.has(itemName)) {
    return [];
  }
  const relatives = [];
  for (const relativeName of Array.from(childrenParentsMap.get(itemName))) {
    if (!visitedSet.has(relativeName)) {
      // Mark before descending so deeper levels cannot emit this name again.
      visitedSet.add(relativeName);
      const nested = getRelatives(relativeName, childrenParentsMap, type,
                                  visitedSet);
      relatives.push({ name: relativeName, [type]: nested });
    }
  }
  return relatives;
}

/**
 * Add one FEHierarchy entry per distinct FE name found in `fesMap`.
 *
 * Names already present in `feHierarchyMap` are left untouched, so only the
 * first FE carrying a given name produces an entry.
 *
 * @param {Map} feHierarchyMap - output map (FE name -> FEHierarchy), mutated.
 * @param {Map} fesMap - frame elements; only the `name` of each value is read.
 * @param {Map} parentsMap - FE name -> names of its direct parents.
 * @param {Map} childrenMap - FE name -> names of its direct children.
 * @returns {Map} the same `feHierarchyMap` instance.
 */
function fillFEhierarchyMap(feHierarchyMap, fesMap, parentsMap, childrenMap) {
  fesMap.forEach((fe) => {
    const { name } = fe;
    if (feHierarchyMap.has(name)) {
      return;
    }
    // One visited set serves both traversals; parents are resolved first, so
    // a name found among the parents is not repeated among the children.
    const visited = new Set([name]);
    const parents = getRelatives(name, parentsMap, 'parents', visited);
    const children = getRelatives(name, childrenMap, 'children', visited);
    feHierarchyMap.set(name, new FEHierarchy({ name, parents, children }));
  });
  return feHierarchyMap;
}

/**
 * Add one FrameHierarchy entry per frame name found in `framesMap`.
 *
 * Names already present in `frameHierarchyMap` are skipped.
 *
 * @param {Map} frameHierarchyMap - output map (frame name -> FrameHierarchy),
 *   mutated in place.
 * @param {Map} framesMap - frames; only the `name` of each value is read.
 * @param {Map} parentsMap - frame name -> names of its direct parents.
 * @param {Map} childrenMap - frame name -> names of its direct children.
 * @returns {Map} the same `frameHierarchyMap` instance.
 */
function fillFrameHierarchyMap(frameHierarchyMap, framesMap, parentsMap,
                               childrenMap) {
  framesMap.forEach((frame) => {
    const { name } = frame;
    if (frameHierarchyMap.has(name)) {
      return;
    }
    // The visited set is shared by both traversals and seeded with the frame
    // itself; parents are walked before children.
    const visited = new Set([name]);
    const parents = getRelatives(name, parentsMap, 'parents', visited);
    const children = getRelatives(name, childrenMap, 'children', visited);
    frameHierarchyMap.set(name, new FrameHierarchy({ name, parents, children }));
  });
  return frameHierarchyMap;
}

/**
 * Build name-based parent and child lookup maps from FE relations.
 *
 * Only relations whose `frameRelation` is in `frIDset` are considered, and
 * relations linking two FEs that share the same name are ignored.
 *
 * @param {Array<Object>} feRelations - relations exposing `frameRelation`,
 *   `supFE` and `subFE`.
 * @param {Set} frIDset - frame relation ids to keep.
 * @param {Map} fesMap - looked up with `supFE` / `subFE`; each value must
 *   expose `name` (a missing entry throws a TypeError).
 * @returns {{parentsMap: Map, childrenMap: Map}} `parentsMap` maps a sub FE
 *   name to the Set of its super FE names; `childrenMap` is the reverse.
 */
function getFEparentsChildrenMaps(feRelations, frIDset, fesMap) {
  const parentsMap = new Map();
  const childrenMap = new Map();
  // Adds `relativeName` to the Set stored under `name`, creating it on demand.
  const addRelative = (relativesByName, name, relativeName) => {
    if (relativesByName.has(name)) {
      relativesByName.get(name).add(relativeName);
    } else {
      relativesByName.set(name, new Set([relativeName]));
    }
  };
  feRelations.forEach((feRelation) => {
    if (!frIDset.has(feRelation.frameRelation)) {
      return;
    }
    const supFEname = fesMap.get(feRelation.supFE).name;
    const subFEname = fesMap.get(feRelation.subFE).name;
    if (subFEname === supFEname) {
      // The maps are keyed by name, so a same-name relation carries no link.
      return;
    }
    addRelative(childrenMap, supFEname, subFEname);
    addRelative(parentsMap, subFEname, supFEname);
  });
  return { parentsMap, childrenMap };
}

/**
 * Build name-based parent and child lookup maps from frame relations.
 *
 * Only relations whose `type` is in `frTypesIDset` are considered.
 *
 * @param {Array<Object>} frameRelations - relations exposing `type`,
 *   `supFrame` and `subFrame`.
 * @param {Set} frTypesIDset - frame relation type ids to keep.
 * @param {Map} framesMap - looked up with `supFrame` / `subFrame`; each value
 *   must expose `name` (a missing entry throws a TypeError).
 * @returns {{parentsMap: Map, childrenMap: Map}} `parentsMap` maps a sub
 *   frame name to the Set of its super frame names; `childrenMap` is the
 *   reverse.
 */
function getFrameParentsChildrenMaps(frameRelations, frTypesIDset, framesMap) {
  const parentsMap = new Map();
  const childrenMap = new Map();
  // Adds `relativeName` to the Set stored under `name`, creating it on demand.
  const addRelative = (relativesByName, name, relativeName) => {
    if (relativesByName.has(name)) {
      relativesByName.get(name).add(relativeName);
    } else {
      relativesByName.set(name, new Set([relativeName]));
    }
  };
  frameRelations
    .filter(frameRelation => frTypesIDset.has(frameRelation.type))
    .forEach((frameRelation) => {
      const supFrameName = framesMap.get(frameRelation.supFrame).name;
      const subFrameName = framesMap.get(frameRelation.subFrame).name;
      addRelative(childrenMap, supFrameName, subFrameName);
      addRelative(parentsMap, subFrameName, supFrameName);
    });
  return { parentsMap, childrenMap };
}

/**
 * Collect the ids of the frame relations whose type is one of the given
 * relation type ids.
 *
 * @param {Array<Object>} frameRelations - relations exposing `_id` and `type`.
 * @param {Set} frTypesIDset - relation type ids to keep.
 * @returns {Set} the `_id` of every matching frame relation.
 */
function getFRidSet(frameRelations, frTypesIDset) {
  const matchingIDs = frameRelations
    .filter(frameRelation => frTypesIDset.has(frameRelation.type))
    .map(frameRelation => frameRelation._id);
  return new Set(matchingIDs);
}

/**
 * Collect the ids of the frame relation types whose name is listed in
 * `frtypeNames`.
 *
 * @param {Array<Object>} frameRelationTypes - relation types exposing `_id`
 *   and `name`.
 * @param {Array<string>} frtypeNames - relation type names to keep.
 * @returns {Set} the `_id` of every matching relation type.
 */
function getFRtypesIDset(frameRelationTypes, frtypeNames) {
  const typeIDs = new Set();
  frameRelationTypes.forEach((relationType) => {
    if (frtypeNames.includes(relationType.name)) {
      typeIDs.add(relationType._id);
    }
  });
  return typeIDs;
}

/**
 * Compute the FE hierarchy and store it in `feHierarchyMap`.
 *
 * Steps: resolve the ids of the relation types named in
 * FRTYPE_NAMES_FOR_FE_HIERARCHY, keep the frame relations of those types,
 * derive the FE parent/child maps from the FE relations attached to them,
 * then fill the output map.
 *
 * @param {Array<Object>} feRelations - FE relations.
 * @param {Array<Object>} frameRelations - frame relations.
 * @param {Array<Object>} frameRelationTypes - frame relation types.
 * @param {Map} fesMap - frame elements.
 * @param {Map} feHierarchyMap - output map, mutated in place.
 * @returns {undefined}
 */
function extractFEhierarchy(feRelations, frameRelations, frameRelationTypes,
                            fesMap, feHierarchyMap) {
  const hierarchyTypeIDs = getFRtypesIDset(frameRelationTypes,
                                           FRTYPE_NAMES_FOR_FE_HIERARCHY);
  const hierarchyRelationIDs = getFRidSet(frameRelations, hierarchyTypeIDs);
  const relativesMaps = getFEparentsChildrenMaps(feRelations,
                                                 hierarchyRelationIDs, fesMap);
  fillFEhierarchyMap(feHierarchyMap, fesMap, relativesMaps.parentsMap,
                     relativesMaps.childrenMap);
}

/**
 * Compute the frame hierarchy and store it in `frameHierarchyMap`.
 *
 * Steps: resolve the ids of the relation types named in
 * FRTYPE_NAMES_FOR_FRAME_HIERARCHY, derive the frame parent/child maps from
 * the frame relations of those types, then fill the output map.
 *
 * @param {Array<Object>} frameRelations - frame relations.
 * @param {Array<Object>} frameRelationTypes - frame relation types.
 * @param {Map} framesMap - frames.
 * @param {Map} frameHierarchyMap - output map, mutated in place.
 * @returns {undefined}
 */
function extractFrameHierarchy(frameRelations, frameRelationTypes, framesMap,
                               frameHierarchyMap) {
  const hierarchyTypeIDs = getFRtypesIDset(frameRelationTypes,
                                           FRTYPE_NAMES_FOR_FRAME_HIERARCHY);
  const relativesMaps = getFrameParentsChildrenMaps(frameRelations,
                                                    hierarchyTypeIDs,
                                                    framesMap);
  fillFrameHierarchyMap(frameHierarchyMap, framesMap, relativesMaps.parentsMap,
                        relativesMaps.childrenMap);
}

/**
 * Entry point: compute the frame hierarchy, then the FE hierarchy, logging
 * the start and end of each phase.
 *
 * The function is synchronous and returns nothing; results are written into
 * the two output maps.
 *
 * @param {Array<Object>} feRelations - FE relations.
 * @param {Array<Object>} frameRelations - frame relations.
 * @param {Array<Object>} frameRelationTypes - frame relation types.
 * @param {Map} framesMap - frames.
 * @param {Map} fesMap - frame elements.
 * @param {Map} frameHierarchyMap - output map for frames, mutated in place.
 * @param {Map} feHierarchyMap - output map for FEs, mutated in place.
 * @returns {undefined}
 */
function extractHierarchies(feRelations, frameRelations, frameRelationTypes,
                            framesMap, fesMap, frameHierarchyMap,
                            feHierarchyMap) {
  logger.info('Extracting hierarchies...');

  // Phase 1: frames.
  logger.info('Extracting frames hierarchy...');
  extractFrameHierarchy(
    frameRelations, frameRelationTypes, framesMap, frameHierarchyMap,
  );
  logger.info('Done extracting frames hierarchy');

  // Phase 2: frame elements.
  logger.info('Extracting FE hierarchy...');
  extractFEhierarchy(
    feRelations, frameRelations, frameRelationTypes, fesMap, feHierarchyMap,
  );
  logger.info('Done extracting FE hierarchy');
}

// Public API of this module: only the top-level entry point is exported.
module.exports = {
extractHierarchies,
};
7 changes: 7 additions & 0 deletions utils/constants.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Names of the frame relation types used to build the frame hierarchy.
const FRTYPE_NAMES_FOR_FRAME_HIERARCHY = ['Inheritance'];
// Names of the frame relation types used to build the FE hierarchy.
const FRTYPE_NAMES_FOR_FE_HIERARCHY = ['Inheritance'];

// Both lists are exported so that extraction scripts can require them.
module.exports = {
FRTYPE_NAMES_FOR_FRAME_HIERARCHY,
FRTYPE_NAMES_FOR_FE_HIERARCHY,
};

0 comments on commit 9587ca5

Please sign in to comment.