Skip to content

Commit

Permalink
[INLONG-11000][SDK] Add XML formatted data source for Transform (#11001)
Browse files Browse the repository at this point in the history
  • Loading branch information
Zkplo authored Oct 9, 2024
1 parent 6ce73d2 commit 923d423
Show file tree
Hide file tree
Showing 7 changed files with 510 additions and 0 deletions.
4 changes: 4 additions & 0 deletions inlong-sdk/transform-sdk/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@
<artifactId>parquet-hadoop</artifactId>
<version>${parquet.version}</version>
</dependency>
<dependency>
<groupId>org.dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
</dependencies>

<build>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.inlong.sdk.transform.pojo.KvSourceInfo;
import org.apache.inlong.sdk.transform.pojo.ParquetSourceInfo;
import org.apache.inlong.sdk.transform.pojo.PbSourceInfo;
import org.apache.inlong.sdk.transform.pojo.XmlSourceInfo;
import org.apache.inlong.sdk.transform.pojo.YamlSourceInfo;

public class SourceDecoderFactory {
Expand All @@ -40,6 +41,10 @@ public static JsonSourceDecoder createJsonDecoder(JsonSourceInfo sourceInfo) {
return new JsonSourceDecoder(sourceInfo);
}

/**
 * Creates a decoder for XML formatted source data.
 *
 * @param sourceInfo the XML source description passed to the decoder's constructor
 * @return a newly constructed {@link XmlSourceDecoder}
 */
public static XmlSourceDecoder createXmlDecoder(XmlSourceInfo sourceInfo) {
    XmlSourceDecoder xmlDecoder = new XmlSourceDecoder(sourceInfo);
    return xmlDecoder;
}

/**
 * Creates a decoder for the source described by the given {@link PbSourceInfo}.
 *
 * @param sourceInfo the source description passed to the decoder's constructor
 * @return a newly constructed {@link PbSourceDecoder}
 */
public static PbSourceDecoder createPbDecoder(PbSourceInfo sourceInfo) {
    PbSourceDecoder pbDecoder = new PbSourceDecoder(sourceInfo);
    return pbDecoder;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.decode;

import lombok.Data;

/**
 * XmlNode
 *
 * <p>A name/value pair used as one node of a decoded XML tree. The value is typed as
 * {@link Object}, so the class itself places no restriction on what a node holds.
 * Getters, setters, {@code equals}, {@code hashCode} and {@code toString} are generated
 * by Lombok's {@link Data} annotation.
 */
@Data
public class XmlNode {

    /** Name of this node. */
    private String name;

    /** Content of this node; may be {@code null}. */
    private Object value;

    /**
     * Creates a node holding the given name and value.
     *
     * @param name the node name
     * @param value the node content
     */
    public XmlNode(String name, Object value) {
        this.name = name;
        this.value = value;
    }

    /**
     * Creates an empty node whose name and value are both {@code null}.
     */
    public XmlNode() {
        this(null, null);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.decode;

import java.util.List;
import java.util.Map;

/**
 * XmlSourceData
 *
 * <p>{@link SourceData} view over a decoded XML tree made of {@link XmlNode}s. Fields are
 * addressed with dot-separated paths whose first segment selects the starting point:
 * {@link #ROOT_KEY} starts at the document root, {@link #CHILD_KEY} starts at the row
 * selected by {@code rowNum}.
 */
public class XmlSourceData implements SourceData {

    /** First path segment that resolves a field from the document root. */
    public static final String ROOT_KEY = "$root";

    /** First path segment that resolves a field from the current row. */
    public static final String CHILD_KEY = "$child";

    private final XmlNode root;

    private final XmlNode childRoot;

    /**
     * Creates the view.
     *
     * @param root the root node of the decoded document
     * @param childRoot the node located by the rows path; only when its value is itself an
     *        {@link XmlNode} is that inner node kept as the row container. In every other
     *        case (including {@code null}) an empty node is used instead, which yields a
     *        single row and makes every {@code $child} lookup return an empty string.
     */
    public XmlSourceData(XmlNode root, XmlNode childRoot) {
        this.root = root;
        XmlNode rows = new XmlNode();
        if (childRoot != null && childRoot.getValue() instanceof XmlNode) {
            rows = (XmlNode) childRoot.getValue();
        }
        this.childRoot = rows;
    }

    /**
     * Returns the number of rows: the size of the row container's list value, or 1 when
     * the container does not hold a list.
     */
    @Override
    public int getRowCount() {
        Object value = childRoot.getValue();
        return value instanceof List ? ((List<?>) value).size() : 1;
    }

    /**
     * Resolves a dot-separated field path to its string value.
     *
     * <p>The walk unwraps containers one step at a time without consuming a path segment:
     * an {@link XmlNode} is replaced by its value, a {@link Map} is looked up with the
     * current segment (any {@code (n)} suffix stripped), and a {@link List} is indexed with
     * the {@code (n)} suffix of the current segment (index 0 when there is no suffix).
     * A segment is consumed only when the current value is a leaf, or when the previous
     * lookup produced {@code null}; in the latter case the walk falls back to the value
     * it had before that lookup.
     *
     * @param rowNum index of the row used when the path starts with {@link #CHILD_KEY}
     * @param fieldName the dot-separated path
     * @return the {@code toString()} of the resolved value, or an empty string when the
     *         path resolves to {@code null} or any exception occurs during the walk
     */
    @Override
    @SuppressWarnings("unchecked")
    public String getField(int rowNum, String fieldName) {
        try {
            String[] segments = fieldName.split("\\.");
            Object cur = null;
            Object last = null;
            if (ROOT_KEY.equals(segments[0])) {
                cur = root;
            } else if (CHILD_KEY.equals(segments[0])) {
                cur = ((List<XmlNode>) childRoot.getValue()).get(rowNum);
            }
            int i = 1;
            while (i < segments.length) {
                if (cur == null) {
                    // The previous lookup missed: restore the value it started from and
                    // retry with the next segment.
                    cur = last;
                    i++;
                    continue;
                }
                last = cur;
                if (cur instanceof List) {
                    // The index must come from the segment being processed, not from the
                    // first segment of the path.
                    cur = ((List<XmlNode>) cur).get(parseIndex(segments[i])).getValue();
                } else if (cur instanceof Map) {
                    cur = ((Map<String, XmlNode>) cur).get(stripIndex(segments[i]));
                } else if (cur instanceof XmlNode) {
                    cur = ((XmlNode) cur).getValue();
                } else {
                    // Leaf value: nothing left to unwrap, move on to the next segment.
                    i++;
                }
            }
            if (cur == null) {
                return "";
            }
            return cur.toString();
        } catch (Exception e) {
            // Deliberate best-effort: malformed paths, bad indexes and type mismatches all
            // resolve to an empty field instead of failing the row.
            return "";
        }
    }

    /** Returns the number inside the segment's {@code (n)} suffix, or 0 when there is no suffix. */
    private static int parseIndex(String segment) {
        int open = segment.indexOf('(');
        if (open == -1) {
            return 0;
        }
        return Integer.parseInt(segment.substring(open + 1, segment.indexOf(')')));
    }

    /** Returns the segment with everything from the first {@code '('} onwards removed. */
    private static String stripIndex(String segment) {
        int open = segment.indexOf('(');
        return open == -1 ? segment : segment.substring(0, open);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.decode;

import org.apache.inlong.sdk.transform.pojo.XmlSourceInfo;
import org.apache.inlong.sdk.transform.process.Context;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * XmlSourceDecoder
 *
 * <p>Parses an XML document with dom4j, converts it into a tree of {@link XmlNode}s and
 * wraps the result in an {@link XmlSourceData}. An optional dot-separated rows path from
 * the {@link XmlSourceInfo} selects the node that is handed over as the row container.
 */
@Slf4j
public class XmlSourceDecoder implements SourceDecoder<String> {

    protected XmlSourceInfo sourceInfo;
    private Charset srcCharset = Charset.defaultCharset();
    private String rowsNodePath;
    private List<String> childNodes;

    /**
     * Creates the decoder.
     *
     * @param sourceInfo supplies the charset name (the platform default charset is used
     *        when it is blank) and the rows path (left unset when blank)
     */
    public XmlSourceDecoder(XmlSourceInfo sourceInfo) {
        this.sourceInfo = sourceInfo;
        String charsetName = sourceInfo.getCharset();
        if (StringUtils.isNotBlank(charsetName)) {
            this.srcCharset = Charset.forName(charsetName);
        }
        this.rowsNodePath = sourceInfo.getRowsNodePath();
        if (StringUtils.isNotBlank(this.rowsNodePath)) {
            this.childNodes = new ArrayList<>(Arrays.asList(this.rowsNodePath.split("\\.")));
        }
    }

    /**
     * Decodes the bytes with the configured charset and delegates to
     * {@link #decode(String, Context)}.
     */
    @Override
    public SourceData decode(byte[] srcBytes, Context context) {
        return this.decode(new String(srcBytes, srcCharset), context);
    }

    /**
     * Parses the XML text and builds the source data.
     *
     * @param srcString the XML document
     * @param context not used by this method
     * @return the decoded data, or {@code null} when parsing or the rows-path walk throws
     *         (the exception is logged)
     */
    @Override
    public SourceData decode(String srcString, Context context) {
        try {
            // NOTE(review): whether parseText resolves DTDs / external entities depends on
            // the dom4j version in use - confirm before feeding untrusted XML.
            Document document = DocumentHelper.parseText(srcString);
            Element rootElement = document.getRootElement();
            XmlNode rootNode = parser(rootElement).get(rootElement.getName());
            Object current = rootNode.getValue();
            XmlNode rowsNode = null;
            if (childNodes != null) {
                for (String segment : childNodes) {
                    if (current instanceof Map) {
                        rowsNode = ((Map<String, XmlNode>) current).get(segment);
                    } else if (current instanceof List) {
                        int open = segment.indexOf('(');
                        int close = segment.indexOf(')');
                        int position = Integer.parseInt(segment.substring(open + 1, close));
                        rowsNode = ((List<XmlNode>) current).get(position);
                    }
                    // NOTE(review): when current is neither a Map nor a List the previously
                    // selected node is kept, so this segment is effectively skipped.
                    current = rowsNode.getValue();
                }
            }
            return new XmlSourceData(rootNode, rowsNode);
        } catch (Exception e) {
            log.error("Data parsing failed", e);
            return null;
        }
    }

    /**
     * Recursively converts an element into a single-entry map from the element name to its
     * {@link XmlNode}.
     *
     * <p>The node value is the element text for a text-only element. Otherwise the child
     * elements are converted and merged by name, with same-named siblings collected into
     * one node whose value is a list. If the merge leaves exactly one entry, that node
     * itself becomes the value; otherwise the value is the merged map.
     *
     * @param root the element to convert
     * @return a map with one entry keyed by the element name
     */
    public static Map<String, XmlNode> parser(Element root) {
        String elementName = root.getName();
        Map<String, XmlNode> result = new HashMap<>();
        if (root.isTextOnly()) {
            result.put(elementName, new XmlNode(elementName, root.getText()));
            return result;
        }

        Map<String, XmlNode> merged = new HashMap<>();
        for (Object childObj : root.elements()) {
            Map<String, XmlNode> parsedChild = parser((Element) childObj);
            for (Map.Entry<String, XmlNode> entry : parsedChild.entrySet()) {
                String key = entry.getKey();
                XmlNode incoming = entry.getValue();
                XmlNode existing = merged.get(key);
                if (existing == null) {
                    merged.put(key, incoming);
                } else if (existing.getValue() instanceof List) {
                    ((List<XmlNode>) existing.getValue()).add(incoming);
                } else {
                    // Second occurrence of this name: start a list of siblings.
                    ArrayList<XmlNode> siblings = new ArrayList<>();
                    siblings.add(existing);
                    siblings.add(incoming);
                    merged.put(key, new XmlNode(key, siblings));
                }
            }
        }

        XmlNode rootNode;
        if (merged.size() == 1) {
            rootNode = new XmlNode(elementName, merged.values().iterator().next());
        } else {
            rootNode = new XmlNode(elementName, merged);
        }
        result.put(elementName, rootNode);
        return result;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.inlong.sdk.transform.pojo;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;

/**
 * XmlSourceInfo
 *
 * <p>Source description for XML data: the charset handled by {@link SourceInfo} plus a
 * rows node path. Unknown JSON properties are ignored during deserialization.
 */
@JsonIgnoreProperties(ignoreUnknown = true)
public class XmlSourceInfo extends SourceInfo {

    /** Path of the node that contains the rows. */
    private String rowsNodePath;

    /**
     * Creates the source description; used by Jackson as the JSON creator.
     *
     * @param charset value of the {@code charset} property, passed to the superclass
     * @param rowsNodePath value of the {@code rowsNodePath} property
     */
    @JsonCreator
    public XmlSourceInfo(
            @JsonProperty("charset") String charset,
            @JsonProperty("rowsNodePath") String rowsNodePath) {
        super(charset);
        this.rowsNodePath = rowsNodePath;
    }

    /**
     * Replaces the rows node path.
     *
     * @param rowsNodePath the new rows node path
     */
    public void setRowsNodePath(String rowsNodePath) {
        this.rowsNodePath = rowsNodePath;
    }

    /**
     * Returns the rows node path, serialized as the {@code rowsNodePath} property.
     *
     * @return the rows node path
     */
    @JsonProperty("rowsNodePath")
    public String getRowsNodePath() {
        return this.rowsNodePath;
    }
}
Loading

0 comments on commit 923d423

Please sign in to comment.