Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions data-upload/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Data uploading tool
Use this command-line tool to upload JSON files to a MongoDB server.

You can run this tool either in IntelliJ or from the command line.

If you are using the command line, first build this tool by running `./gradlew build`.
Find the distribution in the `build/distributions` directory, untar the compressed file, and
you should find the executables in the `bin` directory.

This tool takes two arguments: use `-d` to specify the directory containing the JSON files and `-e` to specify the MongoDB endpoint.
11 changes: 8 additions & 3 deletions data-upload/build.gradle
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
plugins {
id 'java'
id 'application'
}

group 'org.example'
version '1.0-SNAPSHOT'
group 'org.techVault.webScrapping'
version '0.0.1-SNAPSHOT'

application {
mainClass = 'uploader'
}

repositories {
mavenCentral()
Expand All @@ -22,4 +27,4 @@ dependencies {

test {
useJUnitPlatform()
}
}
2 changes: 1 addition & 1 deletion data-upload/gradlew
100644 → 100755

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

193 changes: 104 additions & 89 deletions data-upload/gradlew.bat

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 26 additions & 24 deletions data-upload/src/main/java/uploader.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,35 @@

import java.io.File;
import java.io.FileReader;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.stream.Collectors;


public class uploader {
public static final String DBNAME = "techVault";
public static final ImmutableList<String> COMPANIES =
ImmutableList.of("Linkedin", "Yelp", "Yahoo", "Twilio", "Stack", "AWS");
ImmutableList.of("airbnb", "aws", "babble", "confluent", "criteo", "deepmind", "ebay", "facebook", "linkedin", "medium", "netflix", "nvidia", "quora", "slack", "stackoverflow", "twilio", "uber", "yahoo", "yelp");

private static List<String> readFileToJsonString(File file) {
JSONParser parser = new JSONParser();
List<String> list = new ArrayList<>();
try {
Object obj = parser.parse(new FileReader(file));
JSONObject jsonObject = (JSONObject) obj;
JSONArray blogs = (JSONArray) jsonObject.get("Linkedin");
for(Object blog: blogs.toArray()){
JSONObject jsonObj = (JSONObject)blog;
list.add(jsonObj.toJSONString());
for(String s : COMPANIES) {
if(jsonObject.containsKey(s)) {
JSONArray blogs = (JSONArray) jsonObject.get(s);
for (Object blog : blogs.toArray()) {
JSONObject jsonObj = (JSONObject) blog;
jsonObj.put("company", s);
final String uuid = UUID.randomUUID().toString().replace("-", "");
jsonObj.put("uuid", uuid);
list.add(jsonObj.toJSONString());
}
break;
}
}
} catch (Exception e) {
e.printStackTrace();
Expand All @@ -43,21 +51,17 @@ private static List<String> readFileToJsonString(File file) {

public static void main(String[] args) {
Options options = new Options();
Option f = new Option("f", "file", true, "input file path");
f.setRequired(true);
options.addOption(f);
Option directoryOption = new Option("d", "directory", true, "json file directory");
directoryOption.setRequired(true);
options.addOption(directoryOption);

Option u = new Option("u", "user", true, "user name for mongodb");
u.setRequired(true);
options.addOption(u);

Option p = new Option("p", "password", true, "password");
p.setRequired(true);
options.addOption(p);
Option endpointOption = new Option("e", "endpoint", true, "mongoDB endpoint, e.g., mongodb+srv://<user>:<password>@cluster0.0eph1.mongodb.net/<DB name>?retryWrites=true&w=majority");
endpointOption.setRequired(true);
options.addOption(endpointOption);

CommandLineParser parser = new DefaultParser();
HelpFormatter formatter = new HelpFormatter();
CommandLine cmd;
CommandLine cmd = null;

try {
cmd = parser.parse(options, args);
Expand All @@ -68,20 +72,18 @@ public static void main(String[] args) {
}
Preconditions.checkNotNull(cmd);

String folderName = cmd.getOptionValue("folder");
String username = cmd.getOptionValue("user");
String password = cmd.getOptionValue("password");
String directory = cmd.getOptionValue("directory");
String endpoint = cmd.getOptionValue("endpoint");

File folder = new File(folderName);
File folder = new File(directory);
File[] listOfFiles = folder.listFiles();
Preconditions.checkNotNull(listOfFiles);

String endpoint = String.format("mongodb+srv://%s:%s@cluster0.0eph1.mongodb.net/%s?retryWrites=true&w=majority", username, password, DBNAME);
MongoClient mongoClient = MongoClients.create(endpoint);
MongoDatabase database = mongoClient.getDatabase(DBNAME);
MongoCollection<Document> collection = database.getCollection("collection");
MongoCollection<Document> collection = database.getCollection("blogs");

for(File file : listOfFiles) {
for (File file : listOfFiles) {
List<Document> docs = readFileToJsonString(file).stream().map(Document::parse).collect(Collectors.toList());
collection.insertMany(docs);
}
Expand Down