diff --git a/.github/workflows/build-go-tools.yml b/.github/workflows/build-go-tools.yml new file mode 100644 index 00000000..dd8a2051 --- /dev/null +++ b/.github/workflows/build-go-tools.yml @@ -0,0 +1,118 @@ +# Ref: https://github.com/dmwm/dbs2go/blob/master/.github/workflows/build.yml +name: Build + +on: + push: + tags: + - 'go-*.*.*' + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: ^1.15 + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Build + run: | + mkdir cmsmon-tools + cd src/go/MONIT + go build -o monit monit.go + go build -o alert alert.go + go build -o annotationManager annotationManager.go + go build -o datasources datasources.go + go build -o ggus_alerting ggus_alerting.go + go build -o ggus_parser ggus_parser.go + go build -o intelligence intelligence.go + go build -o ssb_alerting ssb_alerting.go + go build -o es_exporter es_exporter.go + mv monit alert annotationManager datasources ggus_alerting \ + ggus_parser intelligence ssb_alerting es_exporter ../../../cmsmon-tools + cd ../NATS + go build -o dbs_vm dbs_vm.go + go build -o nats-pub nats-pub.go + go build -o nats-sub nats-sub.go + mv dbs_vm nats-pub nats-sub ../../../cmsmon-tools + cd ../../../ + tar cfz cmsmon-tools.tar.gz cmsmon-tools + + - name: Create Release + id: create_release + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + tag_name: ${{ github.ref }} + release_name: Release ${{ github.ref }} + draft: false + prerelease: false + + - name: Upload binaries + id: upload-cmsmon-tools + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ steps.create_release.outputs.upload_url }} + asset_path: ./cmsmon-tools.tar.gz + asset_name: cmsmon-tools.tar.gz + asset_content_type: application/octet-stream + + # --- Build and push docker images --- + - name: Get git tag + id: get_tag + run: echo ::set-output name=tag::${GITHUB_REF/refs\/tags\//} + + - name: Build cmsmon-int image + run: | + echo Image tag: ${{ steps.get_tag.outputs.tag }} + curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/cmsmon-intelligence/Dockerfile + sed -i -e "s,ENV CMSMON_TAG=.*,ENV CMSMON_TAG=${{steps.get_tag.outputs.tag}},g" Dockerfile + docker build . --tag docker.pkg.github.com/dmwm/cmsmon-int/cmsmon-int + docker tag docker.pkg.github.com/dmwm/cmsmon-int/cmsmon-int registry.cern.ch/cmsmonitoring/cmsmon-int + + - name: Login to registry.cern.ch + uses: docker/login-action@v1.6.0 + with: + registry: registry.cern.ch + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + + - name: Publish cmsmon-int image to registry.cern.ch + uses: docker/build-push-action@v1 + with: + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + registry: registry.cern.ch + repository: cmsmonitoring/cmsmon-int + tag_with_ref: true + + - name: Build cmsmon-alerts image + run: | + echo Image tag: ${{ steps.get_tag.outputs.tag }} + curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/cmsmon-alerts/Dockerfile + sed -i -e "s,ENV CMSMON_TAG=.*,ENV CMSMON_TAG=${{steps.get_tag.outputs.tag}},g" Dockerfile + docker build . 
--tag docker.pkg.github.com/dmwm/cmsmon-alerts/cmsmon-alerts + docker tag docker.pkg.github.com/dmwm/cmsmon-alerts/cmsmon-alerts registry.cern.ch/cmsmonitoring/cmsmon-alerts + + - name: Login to registry.cern.ch + uses: docker/login-action@v1.6.0 + with: + registry: registry.cern.ch + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + + - name: Publish cmsmon-alerts image to registry.cern.ch + uses: docker/build-push-action@v1 + with: + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + registry: registry.cern.ch + repository: cmsmonitoring/cmsmon-alerts + tag_with_ref: true diff --git a/.github/workflows/build-sqoop.yml b/.github/workflows/build-sqoop.yml new file mode 100644 index 00000000..9e5bbc6a --- /dev/null +++ b/.github/workflows/build-sqoop.yml @@ -0,0 +1,49 @@ +# Ref: https://github.com/dmwm/dbs2go/blob/master/.github/workflows/build.yml + +name: Build + +on: + push: + tags: + - 'sqoop-*.*.*' + +jobs: + build: + name: Build + runs-on: ubuntu-latest + steps: + - name: Set up Go + uses: actions/setup-go@v2 + with: + go-version: ^1.15 + + - name: Check out code into the Go module directory + uses: actions/checkout@v2 + + - name: Get git tag + id: get_tag + run: echo ::set-output name=tag::${GITHUB_REF/refs\/tags\//} + + - name: Build sqoop image + run: | + echo Image tag: ${{ steps.get_tag.outputs.tag }} + curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/sqoop/Dockerfile + sed -i -e "s,ENV CMSMON_TAG=.*,ENV CMSMON_TAG=${{steps.get_tag.outputs.tag}},g" Dockerfile + docker build . --tag docker.pkg.github.com/dmwm/sqoop/sqoop + docker tag docker.pkg.github.com/dmwm/sqoop/sqoop registry.cern.ch/cmsmonitoring/sqoop + + - name: Login to registry.cern.ch + uses: docker/login-action@v1.6.0 + with: + registry: registry.cern.ch + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + + - name: Publish sqoop image to registry.cern.ch + uses: docker/build-push-action@v1 + with: + username: ${{ secrets.CERN_LOGIN }} + password: ${{ secrets.CERN_TOKEN }} + registry: registry.cern.ch + repository: cmsmonitoring/sqoop + tag_with_ref: true diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 003621ae..00000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Build - -on: - push: - tags: - - '*.*.*' - -jobs: - - build: - name: Build - runs-on: ubuntu-latest - steps: - - - name: Set up Go - uses: actions/setup-go@v2 - with: - go-version: ^1.15 - - - name: Check out code into the Go module directory - uses: actions/checkout@v2 - -# - name: Get dependencies -# run: | -# go get github.com/vkuznet/x509proxy -# go get github.com/buger/jsonparser -# go get github.com/pkg/profile -# go get github.com/stretchr/testify -# go get github.com/go-stomp/stomp -# go get github.com/elastic/go-elasticsearch -# go get github.com/nats-io/nats.go -# go get github.com/dmwm/cmsauth -# go get github.com/sirupsen/logrus -# go get github.com/prometheus/client_golang/prometheus -# go get github.com/prometheus/client_golang/prometheus/promhttp -# go get github.com/prometheus/common/log -# go get github.com/prometheus/common/version -# go get github.com/shirou/gopsutil/cpu -# go get github.com/shirou/gopsutil/mem -# go get github.com/shirou/gopsutil/load -# go get github.com/shirou/gopsutil/process - - - name: Build - run: | - mkdir cmsmon-tools - cd src/go/MONIT - go build -o monit monit.go - go build -o alert alert.go - go build -o annotationManager 
annotationManager.go - go build -o datasources datasources.go - go build -o ggus_alerting ggus_alerting.go - go build -o ggus_parser ggus_parser.go - go build -o intelligence intelligence.go - go build -o ssb_alerting ssb_alerting.go - go build -o es_exporter es_exporter.go - mv monit alert annotationManager datasources ggus_alerting \ - ggus_parser intelligence ssb_alerting es_exporter ../../../cmsmon-tools - cd ../NATS - go build -o dbs_vm dbs_vm.go - go build -o nats-pub nats-pub.go - go build -o nats-sub nats-sub.go - mv dbs_vm nats-pub nats-sub ../../../cmsmon-tools - cd ../../../ - tar cfz cmsmon-tools.tar.gz cmsmon-tools - - - name: Create Release - id: create_release - uses: actions/create-release@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - tag_name: ${{ github.ref }} - release_name: Release ${{ github.ref }} - draft: false - prerelease: false - - - name: Upload binaries - id: upload-cmsmon-tools - uses: actions/upload-release-asset@v1 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - with: - upload_url: ${{ steps.create_release.outputs.upload_url }} - asset_path: ./cmsmon-tools.tar.gz - asset_name: cmsmon-tools.tar.gz - asset_content_type: application/octet-stream - - - name: Get the Ref - id: get-ref - uses: ankitvgupta/ref-to-tag-action@master - with: - ref: ${{ github.ref }} - head_ref: ${{ github.head_ref }} - - - name: Login to Registry - uses: docker/login-action@v1.6.0 - with: - registry: docker.pkg.github.com - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Login to DockerHub - uses: docker/login-action@v1 - with: - username: ${{ secrets.DOCKER_HUB_USERNAME }} - password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }} - - - name: Build cmsmon-intelligence image - run: | - curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/cmsmon-intelligence/Dockerfile - sed -i -e "s,ENV TAG=.*,ENV TAG=${{steps.get-ref.outputs.tag}},g" Dockerfile - docker build . --tag docker.pkg.github.com/dmwm/cmsmon-intelligence/cmsmon-intelligence - - - name: Build and push - uses: docker/build-push-action@v2 - with: - context: . - file: ./Dockerfile - load: true - tags: cmssw/cmsmon-intelligence:${{steps.get-ref.outputs.tag}} - - run: docker push cmssw/cmsmon-intelligence:${{steps.get-ref.outputs.tag}} - - - name: Build cmsmon-alerts image - run: | - curl -ksLO https://raw.githubusercontent.com/dmwm/CMSKubernetes/master/docker/cmsmon-alerts/Dockerfile - sed -i -e "s,ENV TAG=.*,ENV TAG=${{steps.get-ref.outputs.tag}},g" Dockerfile - docker build . --tag docker.pkg.github.com/dmwm/cmsmon-alerts/cmsmon-alerts - - - name: Build and push - uses: docker/build-push-action@v2 - with: - context: . 
- file: ./Dockerfile - load: true - tags: cmssw/cmsmon-alerts:${{steps.get-ref.outputs.tag}} - - run: docker push cmssw/cmsmon-alerts:${{steps.get-ref.outputs.tag}}
diff --git a/.github/workflows/test.yml b/.github/workflows/syntaxcheck.yml similarity index 89% rename from .github/workflows/test.yml rename to .github/workflows/syntaxcheck.yml index e1415fd8..a292656a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/syntaxcheck.yml @@ -1,21 +1,25 @@ -name: Continuous integration testing +name: Syntax check -on: [pull_request, push] +on: + push: + paths: + - '**.json' + - '**.schema' + - '**.yaml' + + pull_request: jobs: - test: - + syntaxcheck: runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - + - name: check valid json run: | echo "Checking valid json..."
diff --git a/README.md b/README.md index b00c3271..3766b18e 100644 --- a/README.md +++ b/README.md @@ -6,3 +6,15 @@ infrastructure. - Available [here](https://cmsmonit-docs.web.cern.ch/) - source [code](https://gitlab.cern.ch/cmsmonitoring/cmsmonit-docs) + + +### Git Workflows + +- On tag `go-*.*.*` + - Builds Go executables and releases them as `cmsmon-tools` + - Builds the `cmsmonitoring/cmsmon-int` Docker image and pushes it to registry.cern.ch + - Builds the `cmsmonitoring/cmsmon-alerts` Docker image and pushes it to registry.cern.ch +- On tag `sqoop-*.*.*` + - Builds the `cmsmonitoring/sqoop` Docker image and pushes it to registry.cern.ch +- Syntax check on a special condition + - Validates JSON and YAML files, and runs only when such files are changed
diff --git a/sqoop/cronjobs.txt b/sqoop/cronjobs.txt new file mode 100644 index 00000000..2e2f7c1a --- /dev/null +++ b/sqoop/cronjobs.txt @@ -0,0 +1,36 @@ +# +# Sqoop jobs taken from AWG and CERN DB IT group to dump CMS databases +# + +# jobmon dump +#13 04 * * * cd /data/sqoop; ./run.sh ./scripts/cms-jm.sh +#13 04 * * * cd /data/sqoop; ./run.sh ./scripts/jm-cms-data-pop.sh + +# cms-jm-data-popularity dumps +#23 04 * * * cd /data/sqoop; ./run.sh ./scripts/cmssw-popularity.sh + +# cmssw popularity +#42 04 * * * cd /data/sqoop/cmssw-popularity; ./run.sh ./sqoop-load.sh + +# PhEDEX dumps +22 03 * * * cd /data/sqoop; ./run.sh ./scripts/phedex-blk-replicas-snapshot.sh +43 03 * * * cd /data/sqoop; ./run.sh ./scripts/phedex-file-catalog.sh + +# Rucio dumps +30 06 * * * cd /data/sqoop; ./run.sh ./scripts/rucio_replicas.sh +00 01 * * * cd /data/sqoop; ./run.sh ./scripts/rucio_dids.sh +30 02 * * * cd /data/sqoop; ./run.sh ./scripts/rucio_contents.sh + +# DBS dumps +27 03 * * * cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-datasets.sh +32 03 * * * cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-blocks.sh +37 03 * * * cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-files.sh +03 22 * * MON cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-full-copy-PHYS01.sh +03 22 * * TUE cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-full-copy-PHYS02.sh +03 22 * * WED cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-full-copy-PHYS03.sh +12 22 * * THU cd /data/sqoop; ./run.sh ./scripts/cms-dbs3-full-copy.sh + +# ASO dump +42 03 * * * cd /data/sqoop; ./run.sh ./scripts/cms-aso.sh + +40 14 * * * cd /data; /data/sqoop/run.sh /data/monit -query="stats" -token /etc/cmsdb/token -hdfs=/etc/cmsdb/hdfs.json -creds=/etc/cmsdb/cms-es-size.json -verbose 1 -inject 2>&1 1>& monit.log
diff --git a/sqoop/daemon.sh b/sqoop/daemon.sh new file mode 100755 index 00000000..5b3d44af --- /dev/null +++ b/sqoop/daemon.sh @@ -0,0
+1,33 @@ +#!/bin/bash +# clean-up log daemon + +wdir=$1 +mtime=$2 +interval=$3 + +if [ "$wdir" == "" ]; then + wdir=/data/sqoop/log # default directory to look +fi +if [ "$mtime" == "" ]; then + mtime=7 # default modification time to find +fi +if [ "$interval" == "" ]; then + interval=3600 # default sleep interval +fi +echo "daemon: $wdir with interval=$interval, mtime=$mtime" + +# start crond if it is not run +if [ -z "`ps auxww | grep crond | grep -v grep`" ]; then + crond -n & +fi + +# run daemon +while true; do + files=`find $wdir -mtime +$mtime` + for f in $files; do + if [ -f $f ] && [ ! -d $f ]; then + echo "delete: $f" + fi + done + sleep $interval +done diff --git a/sqoop/queries/CMS-JM-DataPop-history-data.sql b/sqoop/queries/CMS-JM-DataPop-history-data.sql new file mode 100755 index 00000000..62b6ed4e --- /dev/null +++ b/sqoop/queries/CMS-JM-DataPop-history-data.sql @@ -0,0 +1,41 @@ +select + JOBID as "SchedulerJobId", + FILENAME as "FileName", + ISPARENT as "IsParentFile", + PROTOCOL as "ProtocolUsed", + FILEEXITFLAG as "SuccessFlag", + FILETYPE as "FileType", + LUMIRANGE as "LumiRanges", + STRIPPEDFILES as "StrippedFiles", + + BLOCKID as "BlockId", + STRIPPEDBLOCKS as "StrippedBlocks", + BLOCKNAME as "BlockName", + INPUTCOLLECTION as "InputCollection", + APPLICATION as "Application", + TASKTYPE as "Type", + SUBMISSIONTOOL as "SubmissionTool", + INPUTSE as "InputSE", + TARGETCE as "TargetCE", + SITENAME as "SiteName", + SCHEDULERNAME as "SchedulerName", + + JOBMONITORID as "JobMonitorId", + TASKJOBID as "TaskJobId", + TASKID as "TaskId", + TASKMONITORID as "TaskMonitorId", + JOBEXECEXITCODE as "JobExecExitCode", + JOBEXECEXITTIMESTAMP as "JobExecExitTimeStamp", + STARTEDRUNNINGTIMESTAMP as "StartedRunningTimeStamp", + FINISHEDTIMESTAMP as "FinishedTimeStamp", + WALLCLOCKCPUTIME as "WrapWC", + CPUTIME as "WrapCPU", + Null as "ExeCPU", + USERID as "UserId", + USERNAME as "GridName" + +from + CMS_POPULARITY_SYSTEM.RAW_FILE +where + RAW_FILE."FINISHEDTIMESTAMP" >= to_date(:startdate,'YY-MM-DD HH24:MI:SS') and + RAW_FILE."FINISHEDTIMESTAMP" < to_date(:enddate,'YY-MM-DD HH24:MI:SS') diff --git a/sqoop/queries/CMS-JM-DataPop-new-query.sql b/sqoop/queries/CMS-JM-DataPop-new-query.sql new file mode 100644 index 00000000..8c65dc59 --- /dev/null +++ b/sqoop/queries/CMS-JM-DataPop-new-query.sql @@ -0,0 +1,66 @@ +select + job_file."JobId", + job_file."FileName", + job_file."IsParent" as "IsParentFile", + job_file."ProtocolUsed", + job_file."SuccessFlag", + job_file."FileType", + job_file."LumiRanges", + job_file."StrippedFiles", + + job_block."BlockId", + job_block."StrippedBlocks", + data_block."BlockName", + input_collection."InputCollection", + application."Application", + task_type."Type", + submission_tool."SubmissionTool", + job."InputSE", + job."TargetCE", + site."VOName" as "SiteName", + scheduler."SchedulerName", + + job."JobMonitorId", + job."TaskJobId", + job."SchedulerJobId" as "SchedulerJobIdV2", + task."TaskId", + task."TaskMonitorId", + job."JobExecExitCode", + job."JobExecExitTimeStamp", + job."StartedRunningTimeStamp", + job."FinishedTimeStamp", + job."WrapWC", + job."WrapCPU", + job."ExeCPU", + users."UserId", + users."GridName" + + from CMS_DASHBOARD.job, + CMS_DASHBOARD.job_file, + CMS_DASHBOARD.job_block, + CMS_DASHBOARD.data_block, + CMS_DASHBOARD.input_collection, + CMS_DASHBOARD.application, + CMS_DASHBOARD.task_type, + CMS_DASHBOARD.submission_tool, + CMS_DASHBOARD.task, + CMS_DASHBOARD.site, + CMS_DASHBOARD.scheduler, + CMS_DASHBOARD.users + +where + job."TaskId" = 
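+-- The predicates in this WHERE clause join job rows to their task, file, block,
+-- site, scheduler and user dimensions; :startdate and :enddate are bind variables
+-- expected in 'YY-MM-DD HH24:MI:SS' form (see the to_date calls at the bottom).
+-- A hypothetical one-day binding would be, e.g.
+--   :startdate = '21-06-01 00:00:00', :enddate = '21-06-02 00:00:00'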
task."TaskId" and + task."TaskTypeId" = task_type."TaskTypeId" and + TASK."InputCollectionId" = input_collection."InputCollectionId" and + job."SiteId" = site."SiteId" and + job."SchedulerId" = scheduler."SchedulerId" and + task."UserId" = users."UserId" and + task."SubmissionToolId" = submission_tool."SubmissionToolId" and + task."ApplicationId" = application."ApplicationId" and + job."JobId" = job_block."JobId" and + job_block."BlockId" = data_block."BlockId" and + job."JobId" = job_file."JobId" and + job."FinishedTimeStamp" >= to_date(:startdate,'YY-MM-DD HH24:MI:SS') and + job."FinishedTimeStamp" < to_date(:enddate,'YY-MM-DD HH24:MI:SS') + +order by job_block."JobId" diff --git a/sqoop/queries/CMS-JM-newQuery-selectAll.sql b/sqoop/queries/CMS-JM-newQuery-selectAll.sql new file mode 100755 index 00000000..e3812603 --- /dev/null +++ b/sqoop/queries/CMS-JM-newQuery-selectAll.sql @@ -0,0 +1,160 @@ +-- cms job monitoring (new query) +select +job."SchedulerJobId", +job."JobId", +job."JobMonitorId", +decode(JOB."DboardJobEndId",'S',(decode(JOB."DboardGridEndId",'D','success','U','success','failed')),'failed') state, +"FinishedTimeStamp" - "StartedRunningTimeStamp" as "duration", +job."TaskJobId", +job."LocalBatchJobId", +job."VOJobId", +job."NextJobId", +job."RbId", +job."EventRange", +job."SubNodeIp", +job."LongCEId", +job."ShortCEId", +job."SiteId", +job."WNIp" as "JobWNIp", +job."DboardGridEndId", +job."DboardStatusEnterTimeStamp", +job."DboardFirstInfoTimeStamp", +job."DboardLatestInfoTimeStamp", +job."GridStatusId", +job."GridStatusReasonId", +job."GridStatusTimeStamp", +job."GridStatusSourceId", +job."GridEndStatusId", +job."GridEndStatusReasonId", +job."GridEndStatusTimeStamp", +job."GridFinishedTimeStamp", +job."ExecutableFinishedTimeStamp", +job."JobExecExitCode", +job."JobExecExitReasonId", +job."JobExecExitTimeStamp", +job."JobApplExitCode", +job."JobApplExitReasonId", +job."CreatedTimeStamp", +job."SubmittedTimeStamp", +job."ScheduledTimeStamp", +job."StartedRunningTimeStamp", +job."FinishedTimeStamp", +job."SchedulerId", +job."JobProcessingDetailsId", +job."SubAttemptStartTimeStamp", +job."SubAttemptCount", +job."UpdateStmtTimeStamp", +job."TimeOutFlag", +job."DboardGridEndStatusReasonId", +job."ExeTime", +job."NEvProc" as "NEventsProcessed", +job."NEvReq", +job."WrapCPU", +job."WrapWC", +job."ExeCPU", +job."StOutWC", +job."JobType" as "oldType", +job."StageOutSE", +job."Memory", +job."PilotFlag", +job."InputSE", +job."ParentPilotId", +job."ResubmitterFlag", +job."WNHostName", +job."AccessType", +job."JobLog", +job."TargetCE", +job."CoreCount", +job."NCores", +job."PeakRss", +task."TaskId", +task."TaskMonitorId" as "TaskName", +task."TaskCreatedTimeStamp", +task."TaskTypeId", +task."NTaskSteps", +task."TaskStatusId", +task."JdlCoreId", +task."NEventsPerJob", +task."ApplicationId", +task."ApplExecId", +task."InputCollectionId", +task."DefaultSchedulerId", +task."SubmissionToolId", +task."SubmissionUIId", +task."JobProcessingTypeId", +task."TargetCE" as "TaskTargetCE", +task."SubmissionType", +task."SubToolVerId", +task_type."Type" as "TaskType", +task_type."ValidityFlag", +task_type."GenericType", +task_type."NewGenericType" as "type", +task_type."NewType" as "jobtype", +node."IpValue" as "WNIp", +users."UserId", +users."CertId", +users."RoleId", +users."VOId", +users."UnixName", +users."GridCertificateSubject", +users."GridName", +users."SaveGridName", +users."Country" as "userCountry", +site."SiteName", +site."DisplayName", +site."SiteState", +site."SiteUniqueId", 
+site."SiteWWW", +site."SiteEmail", +site."SiteLocation", +site."InteractiveInterfaceFlag", +site."Country" as "siteCountry", +site."Tier", +site."SamName", +site."VOName", +site."GridMapSize", +site."SiteDBId", +site."CPU", +site."LocalStore", +site."DiskStore", +site."TapeStore", +site."WanStore", +site."NationalBandwidth", +site."OpnBandwidth", +site."JobSlots", +site."LocalMonURL", +site."Federation", +application."ApplicationVersion", +application."Application", +application."ValidityFlag" as "appValitityFlag", +submission_tool."SubmissionTool", +scheduler."SchedulerName", +input_collection."InputCollection", +input_collection."RequestTimeStamp", +input_collection."ProcessingStartedTimeStamp", +input_collection."MergingStartedTimeStamp", +input_collection."FirstAnalysisAccessTimeStamp", +input_collection."LatestAnalysisAccessTimeStamp", +input_collection."RequestedEvents", +input_collection."ProcessedEvents", +input_collection."MergedEvents", +input_collection."ProdmonDatasetId", +input_collection."Status" +from CMS_DASHBOARD.job, CMS_DASHBOARD.task, CMS_DASHBOARD.task_type, CMS_DASHBOARD.node, + CMS_DASHBOARD.users, CMS_DASHBOARD.site, CMS_DASHBOARD.input_collection, + CMS_DASHBOARD.application, CMS_DASHBOARD.submission_tool, CMS_DASHBOARD.scheduler +where + job."TaskId" = task."TaskId" + and task."TaskTypeId" = task_type."TaskTypeId" + and task."UserId" = users."UserId" + and job."SiteId" = site."SiteId" + and job."SchedulerId" = scheduler."SchedulerId" + and NODE."NodeId" = JOB."WNIp" + and task."InputCollectionId" = input_collection."InputCollectionId" + and task."SubmissionToolId" = submission_tool."SubmissionToolId" + and task."ApplicationId" = application."ApplicationId" + and "DboardStatusId" = 'T' + and "DboardJobEndId" in ('S','F') + and "FinishedTimeStamp" >= to_date('14-07-04 00:00:00','YY-MM-DD HH24:MI:SS') + and "FinishedTimeStamp" < to_date('14-07-05 00:00:00','YY-MM-DD HH24:MI:SS') + and job."TimeOutFlag" != 1; \ No newline at end of file diff --git a/sqoop/queries/CMSSW-popularity.sql b/sqoop/queries/CMSSW-popularity.sql new file mode 100644 index 00000000..408a5098 --- /dev/null +++ b/sqoop/queries/CMSSW-popularity.sql @@ -0,0 +1 @@ +select * from CMS_CMSSW_POPULARITY.T_RAW_CMSSW where END_DATE>=:start and END_DATE<:date; diff --git a/sqoop/queries/dbs3-full.sql b/sqoop/queries/dbs3-full.sql new file mode 100644 index 00000000..44ce73da --- /dev/null +++ b/sqoop/queries/dbs3-full.sql @@ -0,0 +1,56 @@ +/project/awg/cms/CMS_DBS3_PROD_GLOBAL +1482188400 -> `date +'%R' -d "2016-12-20"` 00:00 + + + +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.DATASETS D + +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.blocks B + +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.files F + +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.physics_groups G +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.acquisition_eras AE +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.processing_eras PE +SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.dataset_access_types A + + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/datasets -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.DATASETS D where ( D.creation_date < 1482188400 and D.LAST_MODIFICATION_DATE < 1482188400 ) AND + \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 
--username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/blocks -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.blocks b where ( b.creation_date < 1482188400 and b.LAST_MODIFICATION_DATE < 1482188400 ) and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/files -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.files f where ( f.creation_date < 1482188400 and f.LAST_MODIFICATION_DATE < 1482188400 ) and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/physics_groups -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.physics_groups where \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/acquisition_eras -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.acquisition_eras ae where ( ae.creation_date < 1482188400 ) and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/processing_eras -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.processing_eras pe where ( pe.creation_date < 1482188400 ) and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' + +sqoop import --direct --connect jdbc:oracle:thin:@cmsr-drac10-scan:10121/cmsr.cern.ch --fetch-size 10000 --username hadoop_data_reader --password impala1234 --target-dir /project/awg/cms/CMS_DBS3_PROD_GLOBAL/dataset_access_types -m 1 \ +--query \ +"SELECT * FROM CMS_DBS3_PROD_GLOBAL_OWNER.dataset_access_types at where \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' diff --git a/sqoop/queries/dbs3.sql b/sqoop/queries/dbs3.sql new file mode 100644 index 00000000..7523ff6a --- /dev/null +++ b/sqoop/queries/dbs3.sql @@ -0,0 +1,62 @@ +select + + files.FILE_ID, + files.LOGICAL_FILE_NAME, + files.IS_FILE_VALID, + files.FILE_TYPE_ID, + files.CHECK_SUM, + files.EVENT_COUNT, + files.FILE_SIZE, + files.BRANCH_HASH_ID, + files.ADLER32, + files.MD5, + files.AUTO_CROSS_SECTION, + + blocks.BLOCK_ID, + blocks.BLOCK_NAME, + blocks.DATASET_ID, + blocks.OPEN_FOR_WRITING, + blocks.ORIGIN_SITE_NAME, + blocks.BLOCK_SIZE, + blocks.FILE_COUNT, + blocks.CREATION_DATE, + blocks.CREATE_BY, + blocks.LAST_MODIFICATION_DATE, + blocks.LAST_MODIFIED_BY, + + datasets.DATASET_ID, + datasets.DATASET, + datasets.IS_DATASET_VALID, + datasets.PRIMARY_DS_ID, + datasets.PROCESSED_DS_ID, + datasets.DATA_TIER_ID, + datasets.DATASET_ACCESS_TYPE_ID, + datasets.ACQUISITION_ERA_ID, + datasets.PROCESSING_ERA_ID, + datasets.PHYSICS_GROUP_ID, + datasets.XTCROSSSECTION, + datasets.PREP_ID, + datasets.CREATION_DATE, + datasets.CREATE_BY, + 
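+-- $startTS and $endTS in the WHERE clause below are placeholders, presumably filled
+-- in by a calling script with epoch seconds (the scripts/ directory builds such
+-- values as START_DATE_S=`date +'%s' -d "$START_DATE"`), so the query picks files
+-- created or modified inside the given window.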
datasets.LAST_MODIFICATION_DATE, + datasets.LAST_MODIFIED_BY, + + files.CREATION_DATE, + files.CREATE_BY, + files.LAST_MODIFICATION_DATE, + files.LAST_MODIFIED_BY, + + +from + CMS_DBS3_PROD_GLOBAL_OWNER.FILES files, + CMS_DBS3_PROD_GLOBAL_OWNER.BLOCKS blocks, + CMS_DBS3_PROD_GLOBAL_OWNER.DATASETS datasets + +where + files.BLOCK_ID = blocks.BLOCK_ID and + files.DATASET_ID = datasets.DATASET_ID and + ( + ( files.CREATION_DATE >= $startTS and files.CREATION_DATE < $endTS ) + or + ( files.LAST_MODIFICATION_DATE >= $startTS and files.LAST_MODIFICATION_DATE < $endTS ) + ) \ No newline at end of file diff --git a/sqoop/run.sh b/sqoop/run.sh new file mode 100755 index 00000000..e16dceee --- /dev/null +++ b/sqoop/run.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Kerberos +keytab=/etc/cmsdb/keytab +principal=`klist -k "$keytab" | tail -1 | awk '{print $2}'` +echo "principal=$principal" +kinit $principal -k -t "$keytab" +if [ $? == 1 ]; then + echo "Unable to perform kinit" + exit 1 +fi +klist -k "$keytab" + +# execute given script +export PATH=$PATH:/usr/hdp/hadoop/bin:/data:/data/sqoop +$@ +if [ $? -ne 0 ]; then + expire=`date -d '+2 hour' --rfc-3339=ns | tr ' ' 'T'` + msg="Sqoop job failure" + DATE=`date` + host=`hostname` + job=`echo $@` + amhost="http://cms-monitoring.cern.ch:30093" + amtool alert add sqoop_failure alertname='sqoop job failure' job="$job" host=$host severity=high tag=k8s alert=amtool kind=cluster service=sqoop --end=$expire --annotation=summary='$msg' --annotation=date='$DATE' --alertmanager.url=$amhost + amhost="http://cms-monitoring-ha1.cern.ch:30093" + amtool alert add sqoop_failure alertname='sqoop job failure' job="$job" host=$host severity=high tag=k8s alert=amtool kind=cluster service=sqoop --end=$expire --annotation=summary='$msg' --annotation=date='$DATE' --alertmanager.url=$amhost + amhost="http://cms-monitoring-ha2.cern.ch:30093" + amtool alert add sqoop_failure alertname='sqoop job failure' job="$job" host=$host severity=high tag=k8s alert=amtool kind=cluster service=sqoop --end=$expire --annotation=summary='$msg' --annotation=date='$DATE' --alertmanager.url=$amhost +fi diff --git a/sqoop/scripts/cms-aso.sh b/sqoop/scripts/cms-aso.sh new file mode 100755 index 00000000..e4cdfc31 --- /dev/null +++ b/sqoop/scripts/cms-aso.sh @@ -0,0 +1,52 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + + +BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_ASO/filetransfersdb} +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%F' -d "$START_DATE + 1 day"` + +START_DATE_S=`date +'%s' -d "$START_DATE"` +END_DATE_S=`date +'%s' -d "$END_DATE"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE +MERGED_FOLDER=$BASE_PATH/merged +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." 
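+# cms-aso.sh is normally driven through run.sh (see cronjobs.txt), e.g. with a
+# hypothetical date argument:
+#   cd /data/sqoop && ./run.sh ./scripts/cms-aso.sh 2021-06-01
+# The daily dump is written to $BASE_PATH/diff/date=<day> and, when the import
+# succeeds, appended to the rolling copy under $BASE_PATH/merged further below.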
>> $LOG_FILE.cron + +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"SELECT * FROM cms_analysis_reqmgr.filetransfersdb F where ( F.tm_last_update >= ${START_DATE_S} ) and ( F.tm_last_update < ${END_DATE_S} ) AND \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-aso-filetransfersdb $START_DATE + sendMail $LOG_FILE.stderr cms-aso-filetransfersdb $START_DATE +else + hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000 +fi + diff --git a/sqoop/scripts/cms-dbs3-blocks.sh b/sqoop/scripts/cms-dbs3-blocks.sh new file mode 100755 index 00000000..674c0e42 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-blocks.sh @@ -0,0 +1,51 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/dbs3verify/CMS_DBS3_PROD_GLOBAL/blocks} +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%F' -d "$START_DATE + 1 day"` + +START_DATE_S=`date +'%s' -d "$START_DATE"` +END_DATE_S=`date +'%s' -d "$END_DATE"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE +MERGED_FOLDER=$BASE_PATH/merged +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#exit; + +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"SELECT D.DATASET_ID, B.BLOCK_NAME, B.OPEN_FOR_WRITING, B.BLOCK_SIZE, B.FILE_COUNT FROM CMS_DBS3_PROD_GLOBAL_OWNER.BLOCKS B JOIN CMS_DBS3_PROD_GLOBAL_OWNER.DATASETS D ON D.DATASET_ID = B.DATASET_ID where ( B.creation_date >= ${START_DATE_S} or B.LAST_MODIFICATION_DATE >= ${START_DATE_S} ) and ( B.creation_date < ${END_DATE_S} and B.LAST_MODIFICATION_DATE < ${END_DATE_S} ) AND \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! 
$TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-dbs3-blocks $START_DATE + sendMail $LOG_FILE.stderr cms-dbs3-blocks $START_DATE +else + hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000 +fi diff --git a/sqoop/scripts/cms-dbs3-datasets.sh b/sqoop/scripts/cms-dbs3-datasets.sh new file mode 100755 index 00000000..c9c766c9 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-datasets.sh @@ -0,0 +1,51 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/dbs3verify/CMS_DBS3_PROD_GLOBAL/datasets} +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%F' -d "$START_DATE + 1 day"` + +START_DATE_S=`date +'%s' -d "$START_DATE"` +END_DATE_S=`date +'%s' -d "$END_DATE"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE +MERGED_FOLDER=$BASE_PATH/merged +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#exit; + +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"SELECT D.DATASET_ID, D.DATASET FROM CMS_DBS3_PROD_GLOBAL_OWNER.DATASETS D where ( creation_date >= ${START_DATE_S} or LAST_MODIFICATION_DATE >= ${START_DATE_S} ) and ( creation_date < ${END_DATE_S} and LAST_MODIFICATION_DATE < ${END_DATE_S} ) and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-dbs3-datasets $START_DATE + sendMail $LOG_FILE.stderr cms-dbs3-datasets $START_DATE +else + hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000 +fi diff --git a/sqoop/scripts/cms-dbs3-files.sh b/sqoop/scripts/cms-dbs3-files.sh new file mode 100755 index 00000000..4807c789 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-files.sh @@ -0,0 +1,51 @@ +#!/bin/bash +. 
$(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/dbs3verify/CMS_DBS3_PROD_GLOBAL/files} +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%F' -d "$START_DATE + 1 day"` + +START_DATE_S=`date +'%s' -d "$START_DATE"` +END_DATE_S=`date +'%s' -d "$END_DATE"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + + +OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE +MERGED_FOLDER=$BASE_PATH/merged +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron + +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"SELECT B.BLOCK_ID, F.LOGICAL_FILE_NAME, F.FILE_SIZE, F.ADLER32 FROM CMS_DBS3_PROD_GLOBAL_OWNER.BLOCKS B JOIN CMS_DBS3_PROD_GLOBAL_OWNER.FILES F ON F.BLOCK_ID = B.BLOCK_ID where ( F.creation_date >= ${START_DATE_S} or F.LAST_MODIFICATION_DATE >= ${START_DATE_S} ) and ( F.creation_date < ${END_DATE_S} and F.LAST_MODIFICATION_DATE < ${END_DATE_S} ) AND \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-dbs3-files $START_DATE + sendMail $LOG_FILE.stderr cms-dbs3-files $START_DATE +else + hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000 +fi diff --git a/sqoop/scripts/cms-dbs3-full-copy-PHYS01.sh b/sqoop/scripts/cms-dbs3-full-copy-PHYS01.sh new file mode 100755 index 00000000..6268a35c --- /dev/null +++ b/sqoop/scripts/cms-dbs3-full-copy-PHYS01.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +. 
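+# Weekly full copy of the DBS PHYS01 instance. The helpers used below - clean,
+# import_tables, import_counts and deploy - come from sqoop_utils.sh, which is not
+# part of this change; from their usage here they appear to clear the staging area,
+# sqoop every table listed in $TABLES, record row counts for validation and publish
+# the result under $BASE_PATH.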
$(dirname $0)/sqoop_utils.sh +setJava +##$CONFIG### + + +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) + +export BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_DBS3_PROD_PHYS01} +export SCHEMA="CMS_DBS3_PROD_PHYS01_OWNER" + +export TABLES="RELEASE_VERSIONS PROCESSING_ERAS PROCESSED_DATASETS PRIMARY_DS_TYPES PRIMARY_DATASETS PHYSICS_GROUPS PARAMETER_SET_HASHES OUTPUT_MODULE_CONFIGS \ + MIGRATION_REQUESTS MIGRATION_BLOCKS FILE_OUTPUT_MOD_CONFIGS FILE_DATA_TYPES DBS_VERSIONS DATA_TIERS DATASET_RUNS DATASET_OUTPUT_MOD_CONFIGS DATASET_ACCESS_TYPES \ + BRANCH_HASHES ASSOCIATED_FILES APPLICATION_EXECUTABLES ACQUISITION_ERAS FILE_PARENTS DATASET_PARENTS BLOCK_PARENTS BLOCKS DATASETS FILE_LUMIS FILES" + +############# + + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F'` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` + +export START_DATE_S=`date +'%s' -d "$START_DATE"` + +export LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +clean + +import_tables "$TABLES" + +import_counts "$TABLES" + +deploy + + + diff --git a/sqoop/scripts/cms-dbs3-full-copy-PHYS02.sh b/sqoop/scripts/cms-dbs3-full-copy-PHYS02.sh new file mode 100755 index 00000000..b10830e4 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-full-copy-PHYS02.sh @@ -0,0 +1,48 @@ +#!/bin/bash + + +. $(dirname $0)/sqoop_utils.sh +setJava +##$CONFIG### + +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) + +BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_DBS3_PROD_PHYS02} +SCHEMA="CMS_DBS3_PROD_PHYS02_OWNER" + +TABLES="RELEASE_VERSIONS PROCESSING_ERAS PROCESSED_DATASETS PRIMARY_DS_TYPES PRIMARY_DATASETS PHYSICS_GROUPS PARAMETER_SET_HASHES OUTPUT_MODULE_CONFIGS \ + MIGRATION_REQUESTS MIGRATION_BLOCKS FILE_OUTPUT_MOD_CONFIGS FILE_DATA_TYPES DBS_VERSIONS DATA_TIERS DATASET_RUNS DATASET_OUTPUT_MOD_CONFIGS DATASET_ACCESS_TYPES \ + BRANCH_HASHES ASSOCIATED_FILES APPLICATION_EXECUTABLES ACQUISITION_ERAS FILE_PARENTS DATASET_PARENTS BLOCK_PARENTS BLOCKS DATASETS FILE_LUMIS FILES" + +############# + + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F'` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` + +export START_DATE_S=`date +'%s' -d "$START_DATE"` + +export LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + + +clean + +import_tables "$TABLES" + +import_counts "$TABLES" + + +deploy + diff --git a/sqoop/scripts/cms-dbs3-full-copy-PHYS03.sh b/sqoop/scripts/cms-dbs3-full-copy-PHYS03.sh new file mode 100755 index 00000000..751f9e26 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-full-copy-PHYS03.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +. 
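+# Same weekly full-copy pattern as the PHYS01/PHYS02 scripts above, except that this
+# variant removes a stale $BASE_PATH/new area with an explicit hdfs -rm check instead
+# of calling the clean helper.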
$(dirname $0)/sqoop_utils.sh +setJava + +##$CONFIG### + + +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) +BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_DBS3_PROD_PHYS03} +SCHEMA="CMS_DBS3_PROD_PHYS03_OWNER" + +TABLES="RELEASE_VERSIONS PROCESSING_ERAS PROCESSED_DATASETS PRIMARY_DS_TYPES PRIMARY_DATASETS PHYSICS_GROUPS PARAMETER_SET_HASHES OUTPUT_MODULE_CONFIGS \ + MIGRATION_REQUESTS MIGRATION_BLOCKS FILE_OUTPUT_MOD_CONFIGS FILE_DATA_TYPES DBS_VERSIONS DATA_TIERS DATASET_RUNS DATASET_OUTPUT_MOD_CONFIGS DATASET_ACCESS_TYPES \ + BRANCH_HASHES ASSOCIATED_FILES APPLICATION_EXECUTABLES ACQUISITION_ERAS FILE_PARENTS DATASET_PARENTS BLOCK_PARENTS BLOCKS DATASETS FILE_LUMIS FILES" + +############# + + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F'` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` + +export START_DATE_S=`date +'%s' -d "$START_DATE"` + +export LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +if hdfs dfs -test -e "$BASE_PATH/new" +then + hdfs dfs -rm -r $BASE_PATH/new + echo "Removing old $BASE_PATH/new" >> $LOG_FILE.cron +fi + +import_tables "$TABLES" + +import_counts "$TABLES" + +deploy + + diff --git a/sqoop/scripts/cms-dbs3-full-copy.sh b/sqoop/scripts/cms-dbs3-full-copy.sh new file mode 100755 index 00000000..400c90b8 --- /dev/null +++ b/sqoop/scripts/cms-dbs3-full-copy.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +. $(dirname $0)/sqoop_utils.sh +setJava + +##$CONFIG### + + +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) +BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_DBS3_PROD_GLOBAL} +SCHEMA="CMS_DBS3_PROD_GLOBAL_OWNER" + +TABLES="RELEASE_VERSIONS PROCESSING_ERAS PROCESSED_DATASETS PRIMARY_DS_TYPES PRIMARY_DATASETS PHYSICS_GROUPS PARAMETER_SET_HASHES OUTPUT_MODULE_CONFIGS \ + MIGRATION_REQUESTS MIGRATION_BLOCKS FILE_OUTPUT_MOD_CONFIGS FILE_DATA_TYPES DATA_TIERS DATASET_RUNS DATASET_OUTPUT_MOD_CONFIGS DATASET_ACCESS_TYPES \ + BRANCH_HASHES ASSOCIATED_FILES APPLICATION_EXECUTABLES ACQUISITION_ERAS FILE_PARENTS DATASET_PARENTS BLOCK_PARENTS BLOCKS DATASETS FILE_LUMIS FILES" + +############# + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F'` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` + +export START_DATE_S=`date +'%s' -d "$START_DATE"` + +export LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + + +clean + +import_tables "$TABLES" + +import_counts "$TABLES" + +deploy + diff --git a/sqoop/scripts/cms-jm.sh b/sqoop/scripts/cms-jm.sh new file mode 100755 index 00000000..65a29d6e --- /dev/null +++ b/sqoop/scripts/cms-jm.sh @@ -0,0 +1,48 @@ +#!/bin/bash +. 
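+# Dumps one day of the dashboard job-monitoring data as snappy-compressed avro files
+# partitioned as year=/month=/day= under $BASE_PATH; the --query string below is
+# essentially the statement kept in queries/CMS-JM-newQuery-selectAll.sql, flattened
+# onto one line with shell-escaped quoting.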
$(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/job-monitoring/avro-snappy} +JDBC_URL=$(sed '1q;d' lcgr_cstring) +USERNAME=$(sed '2q;d' lcgr_cstring) +PASSWORD=$(sed '3q;d' lcgr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%Y-%m-%d' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%Y-%m-%d' -d "$START_DATE + 1 day"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/year=$year/month=$month/day=$day +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#continue + +#change to @lcgr-dg-s once it's working +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"select job."\"SchedulerJobId"\", job."\"JobId"\", job."\"JobMonitorId"\", decode(JOB."\"DboardJobEndId"\",'S',(decode(JOB."\"DboardGridEndId"\",'D','success','U','success','failed')),'failed') state, extract( day from ("\"FinishedTimeStamp"\" - "\"StartedRunningTimeStamp"\") )*24*60*60 + extract( hour from ("\"FinishedTimeStamp"\" - "\"StartedRunningTimeStamp"\") )*60*60 + extract( minute from ("\"FinishedTimeStamp"\" - "\"StartedRunningTimeStamp"\") )*60+ extract( second from ("\"FinishedTimeStamp"\" - "\"StartedRunningTimeStamp"\")) as duration, job."\"TaskJobId"\", job."\"LocalBatchJobId"\", job."\"VOJobId"\", job."\"NextJobId"\", job."\"RbId"\", job."\"EventRange"\", job."\"SubNodeIp"\", job."\"LongCEId"\", job."\"ShortCEId"\", job."\"SiteId"\", job."\"WNIp"\" as "\"JobWNIp"\", job."\"DboardGridEndId"\", job."\"DboardStatusEnterTimeStamp"\", job."\"DboardFirstInfoTimeStamp"\", job."\"DboardLatestInfoTimeStamp"\", job."\"GridStatusId"\", job."\"GridStatusReasonId"\", job."\"GridStatusTimeStamp"\", job."\"GridStatusSourceId"\", job."\"GridEndStatusId"\", job."\"GridEndStatusReasonId"\", job."\"GridEndStatusTimeStamp"\", job."\"GridFinishedTimeStamp"\", job."\"ExecutableFinishedTimeStamp"\", job."\"JobExecExitCode"\", job."\"JobExecExitReasonId"\", job."\"JobExecExitTimeStamp"\", job."\"JobApplExitCode"\", job."\"JobApplExitReasonId"\", job."\"CreatedTimeStamp"\", job."\"SubmittedTimeStamp"\", job."\"ScheduledTimeStamp"\", job."\"StartedRunningTimeStamp"\", job."\"FinishedTimeStamp"\", job."\"SchedulerId"\", job."\"JobProcessingDetailsId"\", job."\"SubAttemptStartTimeStamp"\", job."\"SubAttemptCount"\", job."\"UpdateStmtTimeStamp"\", job."\"TimeOutFlag"\", job."\"DboardGridEndStatusReasonId"\", job."\"ExeTime"\", job."\"NEvProc"\" as "\"NEventsProcessed"\", job."\"NEvReq"\", job."\"WrapCPU"\", job."\"WrapWC"\", job."\"ExeCPU"\", job."\"StOutWC"\", job."\"JobType"\" as "\"oldType"\", job."\"StageOutSE"\", job."\"Memory"\", job."\"PilotFlag"\", job."\"InputSE"\", job."\"ParentPilotId"\", job."\"ResubmitterFlag"\", job."\"WNHostName"\", job."\"AccessType"\", job."\"JobLog"\", job."\"TargetCE"\", job."\"CoreCount"\", job."\"NCores"\", job."\"PeakRss"\", task."\"TaskId"\", task."\"TaskMonitorId"\" as "\"TaskName"\", task."\"TaskCreatedTimeStamp"\", task."\"TaskTypeId"\", task."\"NTaskSteps"\", task."\"TaskStatusId"\", task."\"JdlCoreId"\", task."\"NEventsPerJob"\", task."\"ApplicationId"\", task."\"ApplExecId"\", task."\"InputCollectionId"\", 
task."\"DefaultSchedulerId"\", task."\"SubmissionToolId"\", task."\"SubmissionUIId"\", task."\"JobProcessingTypeId"\", task."\"TargetCE"\" as "\"TaskTargetCE"\", task."\"SubmissionType"\", task."\"SubToolVerId"\", task_type."\"Type"\" as "\"TaskType"\", task_type."\"ValidityFlag"\", task_type."\"GenericType"\", task_type."\"NewGenericType"\" as "\"type"\", task_type."\"NewType"\" as "\"jobtype"\", node."\"IpValue"\" as "\"WNIp"\", users."\"UserId"\", users."\"CertId"\", users."\"RoleId"\", users."\"VOId"\", users."\"UnixName"\", users."\"GridCertificateSubject"\", users."\"GridName"\", users."\"SaveGridName"\", users."\"Country"\" as "\"userCountry"\", site."\"SiteName"\", site."\"DisplayName"\", site."\"SiteState"\", site."\"SiteUniqueId"\", site."\"SiteWWW"\", site."\"SiteEmail"\", site."\"SiteLocation"\", site."\"InteractiveInterfaceFlag"\", site."\"Country"\" as "\"siteCountry"\", site."\"Tier"\", site."\"SamName"\", site."\"VOName"\", site."\"GridMapSize"\", site."\"SiteDBId"\", site."\"CPU"\", site."\"LocalStore"\", site."\"DiskStore"\", site."\"TapeStore"\", site."\"WanStore"\", site."\"NationalBandwidth"\", site."\"OpnBandwidth"\", site."\"JobSlots"\", site."\"LocalMonURL"\", site."\"Federation"\", application."\"ApplicationVersion"\", application."\"Application"\", application."\"ValidityFlag"\" as "\"appValitityFlag"\", submission_tool."\"SubmissionTool"\", scheduler."\"SchedulerName"\", input_collection."\"InputCollection"\", input_collection."\"RequestTimeStamp"\", input_collection."\"ProcessingStartedTimeStamp"\", input_collection."\"MergingStartedTimeStamp"\", input_collection."\"FirstAnalysisAccessTimeStamp"\", input_collection."\"LatestAnalysisAccessTimeStamp"\", input_collection."\"RequestedEvents"\", input_collection."\"ProcessedEvents"\", input_collection."\"MergedEvents"\", input_collection."\"ProdmonDatasetId"\", input_collection."\"Status"\" from CMS_DASHBOARD.job, CMS_DASHBOARD.task, CMS_DASHBOARD.task_type, CMS_DASHBOARD.node, CMS_DASHBOARD.users, CMS_DASHBOARD.site, CMS_DASHBOARD.input_collection, CMS_DASHBOARD.application, CMS_DASHBOARD.submission_tool, CMS_DASHBOARD.scheduler where job."\"TaskId"\" = task."\"TaskId"\" and task."\"TaskTypeId"\" = task_type."\"TaskTypeId"\" and task."\"UserId"\" = users."\"UserId"\" and job."\"SiteId"\" = site."\"SiteId"\" and job."\"SchedulerId"\" = scheduler."\"SchedulerId"\" and NODE."\"NodeId"\" = JOB."\"WNIp"\" and task."\"InputCollectionId"\" = input_collection."\"InputCollectionId"\" and task."\"SubmissionToolId"\" = submission_tool."\"SubmissionToolId"\" and task."\"ApplicationId"\" = application."\"ApplicationId"\" and "\"DboardStatusId"\" = 'T' and "\"DboardJobEndId"\" in ('S','F') and "\"FinishedTimeStamp"\" >= to_date('$START_DATE','YYYY-MM-DD') and "\"FinishedTimeStamp"\" < to_date('$END_DATE','YYYY-MM-DD') and job."\"TimeOutFlag"\" != 1 and \$CONDITIONS" \ +--as-avrodatafile --compression-codec snappy \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr +#--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ + +if ! 
grep 'INFO mapreduce.ImportJobBase: Transferred' $LOG_FILE.stderr 1>/dev/null +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout atlas-job-monitoring $START_DATE + sendMail $LOG_FILE.stderr atlas-job-monitoring $START_DATE +fi + +#hdfs dfs -put /tmp/$me.stdout $OUTPUT_FOLDER/sqoop.stdout && rm /tmp/$me.stdout +#hdfs dfs -put /tmp/$me.stderr $OUTPUT_FOLDER/sqoop.stderr && rm /tmp/$me.stderr +#rm /tmp/$$.stdout /tmp/$$.stderr diff --git a/sqoop/scripts/cms-rucio-dump.sh b/sqoop/scripts/cms-rucio-dump.sh new file mode 100755 index 00000000..400c90b8 --- /dev/null +++ b/sqoop/scripts/cms-rucio-dump.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +. $(dirname $0)/sqoop_utils.sh +setJava + +##$CONFIG### + + +export JDBC_URL=$(sed '1q;d' cmsr_cstring) +export USERNAME=$(sed '2q;d' cmsr_cstring) +export PASSWORD=$(sed '3q;d' cmsr_cstring) +BASE_PATH=${BASE_PATH:-/project/awg/cms/CMS_DBS3_PROD_GLOBAL} +SCHEMA="CMS_DBS3_PROD_GLOBAL_OWNER" + +TABLES="RELEASE_VERSIONS PROCESSING_ERAS PROCESSED_DATASETS PRIMARY_DS_TYPES PRIMARY_DATASETS PHYSICS_GROUPS PARAMETER_SET_HASHES OUTPUT_MODULE_CONFIGS \ + MIGRATION_REQUESTS MIGRATION_BLOCKS FILE_OUTPUT_MOD_CONFIGS FILE_DATA_TYPES DATA_TIERS DATASET_RUNS DATASET_OUTPUT_MOD_CONFIGS DATASET_ACCESS_TYPES \ + BRANCH_HASHES ASSOCIATED_FILES APPLICATION_EXECUTABLES ACQUISITION_ERAS FILE_PARENTS DATASET_PARENTS BLOCK_PARENTS BLOCKS DATASETS FILE_LUMIS FILES" + +############# + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F'` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` + +export START_DATE_S=`date +'%s' -d "$START_DATE"` + +export LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + + +clean + +import_tables "$TABLES" + +import_counts "$TABLES" + +deploy + diff --git a/sqoop/scripts/cms-rucio.sh b/sqoop/scripts/cms-rucio.sh new file mode 100755 index 00000000..629cd7cb --- /dev/null +++ b/sqoop/scripts/cms-rucio.sh @@ -0,0 +1,59 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/rucio} +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%F' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%F' -d "$START_DATE + 1 day"` + +START_DATE_S=`date +'%s' -d "$START_DATE"` +END_DATE_S=`date +'%s' -d "$END_DATE"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + + +OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE +MERGED_FOLDER=$BASE_PATH/merged +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." 
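+# NB: the awk calls below print a[1], i.e. the part of the matched line before '=';
+# if /etc/secrets/rucio holds key=value pairs, a[2] is presumably the value that was
+# intended, and sqoop's --password-file normally expects a path to a file holding
+# the password rather than the password itself.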
>> $LOG_FILE.cron + +RUCIO_USERNAME=`cat /etc/secrets/rucio | grep -i username | awk '{split($1,a,"="); print a[1]}'` +RUCIO_PASSWORD=`cat /etc/secrets/rucio | grep -i password | awk '{split($1,a,"="); print a[1]}'` + +TZ=UTC sqoop import -Dmapreduce.job.user.classpath.first=true \ +-Doraoop.chunk.method=PARTITION -Doraoop.timestamp.string=false \ +-Dmapred.child.java.opts="-Djava.security.egd=file:/dev/../dev/urandom" \ +--connect jdbc:oracle:thin:@adcr-s.cern.ch:10121/adcr_rucio_2.cern.ch \ +--username $RUCIO_USERNAME --password-file $RUCIO_PASSWORD \ +--num-mappers 100 --fetch-size 10000 \ +--table ATLAS_RUCIO.REPLICAS --as-avrodatafile -z --direct --target-dir \ +/user/rucio01/dumps/`date +%Y-%m-%d`/replicas \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-rucio $START_DATE + sendMail $LOG_FILE.stderr cms-rucio $START_DATE +else + hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000 +fi diff --git a/sqoop/scripts/cmssw-popularity.sh b/sqoop/scripts/cmssw-popularity.sh new file mode 100755 index 00000000..4eef76f7 --- /dev/null +++ b/sqoop/scripts/cmssw-popularity.sh @@ -0,0 +1,47 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH=${BASE_PATH:-/project/awg/cms/cmssw-popularity/avro-snappy} +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%Y-%m-%d' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%Y-%m-%d' -d "$START_DATE + 1 day"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/year=$year/month=$month/day=$day +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#continue + +#DG (once it's working) cmsr-drac10-scan.cern.ch:10121/CMSR_DRAC10.cern.ch +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 \ +--query "select * from CMS_CMSSW_POPULARITY.T_RAW_CMSSW where END_DATE >= to_date('${START_DATE}','YYYY-MM-DD') and END_DATE < to_date('${END_DATE}','YYYY-MM-DD') and \$CONDITIONS" \ +--as-avrodatafile --compression-codec snappy \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +if ! grep 'INFO mapreduce.ImportJobBase: Transferred' $LOG_FILE.stderr && ! 
grep 'Map output records=0' $LOG_FILE.stderr 1>/dev/null +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cmssw-popularity $START_DATE + sendMail $LOG_FILE.stderr cmssw-popularity $START_DATE +fi + +#hdfs dfs -put /tmp/$me.stdout $OUTPUT_FOLDER/sqoop.stdout && rm /tmp/$me.stdout +#hdfs dfs -put /tmp/$me.stderr $OUTPUT_FOLDER/sqoop.stderr && rm /tmp/$me.stderr +#rm /tmp/$$.stdout /tmp/$$.stderr diff --git a/sqoop/scripts/jm-cms-data-pop.sh b/sqoop/scripts/jm-cms-data-pop.sh new file mode 100755 index 00000000..5dcb0564 --- /dev/null +++ b/sqoop/scripts/jm-cms-data-pop.sh @@ -0,0 +1,52 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH="/project/awg/cms/jm-data-popularity/avro-snappy" +JDBC_URL=$(sed '1q;d' lcgr_cstring) +USERNAME=$(sed '2q;d' lcgr_cstring) +PASSWORD=$(sed '3q;d' lcgr_cstring) + +me=`basename $0`_$$ + +if [ -n "$1" ] +then + START_DATE=$1 +else + START_DATE=`date +'%Y-%m-%d' -d "1 day ago"` +fi + +year=`date +'%Y' -d "$START_DATE"` +month=`date +'%-m' -d "$START_DATE"` +day=`date +'%-d' -d "$START_DATE"` +END_DATE=`date +'%Y-%m-%d' -d "$START_DATE + 1 day"` + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +OUTPUT_FOLDER=$BASE_PATH/year=$year/month=$month/day=$day +hdfs dfs -rm -r -f $OUTPUT_FOLDER +echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#continue + +#change to @lcgr-dg-s once it's working +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query \ +"select job_file."\"JobId"\", job_file."\"FileName"\", job_file."\"IsParent"\" as "\"IsParentFile"\", job_file."\"ProtocolUsed"\", job_file."\"SuccessFlag"\", job_file."\"FileType"\", job_file."\"LumiRanges"\", job_file."\"StrippedFiles"\", job_block."\"BlockId"\", job_block."\"StrippedBlocks"\", data_block."\"BlockName"\", input_collection."\"InputCollection"\", application."\"Application"\",application."\"ApplicationVersion"\", task_type."\"Type"\", task_type."\"GenericType"\", task_type."\"NewGenericType"\", task_type."\"NewType"\", task_type."\"ValidityFlag"\", submission_tool."\"SubmissionTool"\", job."\"InputSE"\", job."\"TargetCE"\", site."\"VOName"\" as "\"SiteName"\", scheduler."\"SchedulerName"\", job."\"JobMonitorId"\", job."\"TaskJobId"\", job."\"SchedulerJobId"\" as "\"SchedulerJobIdV2"\", task."\"TaskId"\", task."\"TaskMonitorId"\", task."\"NEventsPerJob"\", task."\"NTaskSteps"\", job."\"JobExecExitCode"\", job."\"JobExecExitTimeStamp"\", job."\"StartedRunningTimeStamp"\", job."\"FinishedTimeStamp"\", job."\"WrapWC"\", job."\"WrapCPU"\", job."\"ExeCPU"\", job."\"NCores"\", job."\"NEvProc"\", job."\"NEvReq"\", job."\"WNHostName"\",job."\"JobType"\", users."\"UserId"\", users."\"GridName"\" from CMS_DASHBOARD.job, CMS_DASHBOARD.job_file, CMS_DASHBOARD.job_block, CMS_DASHBOARD.data_block, CMS_DASHBOARD.input_collection, CMS_DASHBOARD.application, CMS_DASHBOARD.task_type, CMS_DASHBOARD.submission_tool, CMS_DASHBOARD.task, CMS_DASHBOARD.site, CMS_DASHBOARD.scheduler, CMS_DASHBOARD.users where job."\"TaskId"\" = task."\"TaskId"\" and task."\"TaskTypeId"\" = task_type."\"TaskTypeId"\" and TASK."\"InputCollectionId"\" = input_collection."\"InputCollectionId"\" and job."\"SiteId"\" = site."\"SiteId"\" and job."\"SchedulerId"\" = scheduler."\"SchedulerId"\" and task."\"UserId"\" = users."\"UserId"\" and 
task."\"SubmissionToolId"\" = submission_tool."\"SubmissionToolId"\" and task."\"ApplicationId"\" = application."\"ApplicationId"\" and job."\"JobId"\" = job_block."\"JobId"\" and job_block."\"BlockId"\" = data_block."\"BlockId"\" and job."\"JobId"\" = job_file."\"JobId"\" and "\"FinishedTimeStamp"\" >= to_date('$START_DATE','YYYY-MM-DD') and "\"FinishedTimeStamp"\" < to_date('$END_DATE','YYYY-MM-DD') and \$CONDITIONS order by job_block."\"JobId"\"" \ +--as-avrodatafile --compression-codec snappy \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr +#--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ + +OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"` +TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"` + +if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]] +then + echo "Error occured, check $LOG_FILE" + sendMail $LOG_FILE.stdout cms-popularity $START_DATE + sendMail $LOG_FILE.stderr cms-popularity $START_DATE +fi + +#hdfs dfs -put /tmp/$me.stdout $OUTPUT_FOLDER/sqoop.stdout && rm /tmp/$me.stdout +#hdfs dfs -put /tmp/$me.stderr $OUTPUT_FOLDER/sqoop.stderr && rm /tmp/$me.stderr +#rm /tmp/$$.stdout /tmp/$$.stderr diff --git a/sqoop/scripts/phedex-blk-replicas-snapshot.sh b/sqoop/scripts/phedex-blk-replicas-snapshot.sh new file mode 100755 index 00000000..c1e328ba --- /dev/null +++ b/sqoop/scripts/phedex-blk-replicas-snapshot.sh @@ -0,0 +1,39 @@ +#!/bin/bash +. $(dirname $0)/sqoop_utils.sh +setJava + +BASE_PATH="/project/awg/cms/phedex/block-replicas-snapshots/csv" +#BASE_PATH="transfermgmt" +JDBC_URL=$(sed '1q;d' cmsr_cstring) +USERNAME=$(sed '2q;d' cmsr_cstring) +PASSWORD=$(sed '3q;d' cmsr_cstring) + +me=`basename $0`_$$ + +LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0` + +year=`date +'%Y'` +month=`date +'%-m'` +day=`date +'%-d'` +TIME=`date +'%Hh%mm%Ss'` + +OUTPUT_FOLDER=$BASE_PATH/time=`date +'%F'`_${TIME} +echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron +echo "quering..." >> $LOG_FILE.cron +#exit; + +sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 \ +--query "select cms_transfermgmt.now, ds.name as dataset_name, ds.id as dataset_id, ds.is_open as dataset_is_open, ds.time_create as dataset_time_create, ds.time_update as dataset_time_update, bk.name as block_name, bk.id as block_id, bk.files as block_files, bk.bytes as block_bytes, bk.is_open as block_is_open, bk.time_create as block_time_create, bk.time_update as block_time_update, n.name as node_name, n.id as node_id, br.is_active, br.src_files, br.src_bytes, br.dest_files, br.dest_bytes, br.node_files, br.node_bytes, br.xfer_files, br.xfer_bytes, br.is_custodial, br.user_group, br.time_create as replica_time_create, br.time_update as replica_time_update from cms_transfermgmt.t_dps_dataset ds join cms_transfermgmt.t_dps_block bk on bk.dataset=ds.id join cms_transfermgmt.t_dps_block_replica br on br.block=bk.id join cms_transfermgmt.t_adm_node n on n.id=br.node and \$CONDITIONS" \ +--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \ +1>$LOG_FILE.stdout 2>$LOG_FILE.stderr + +if ! 
grep 'INFO mapreduce.ImportJobBase: Transferred' $LOG_FILE.stderr 1>/dev/null
+then
+    echo "Error occurred, check $LOG_FILE"
+    sendMail $LOG_FILE.stdout cms-transfermgmt-snapshot $START_DATE
+    sendMail $LOG_FILE.stderr cms-transfermgmt-snapshot $START_DATE
+fi
+
+#hdfs dfs -put /tmp/$me.stdout $OUTPUT_FOLDER/sqoop.stdout && rm /tmp/$me.stdout
+#hdfs dfs -put /tmp/$me.stderr $OUTPUT_FOLDER/sqoop.stderr && rm /tmp/$me.stderr
+#rm /tmp/$$.stdout /tmp/$$.stderr
diff --git a/sqoop/scripts/phedex-file-catalog.sh b/sqoop/scripts/phedex-file-catalog.sh
new file mode 100755
index 00000000..e4d83e38
--- /dev/null
+++ b/sqoop/scripts/phedex-file-catalog.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+. $(dirname $0)/sqoop_utils.sh
+setJava
+BASE_PATH="/project/awg/cms/phedex/catalog/csv"
+#BASE_PATH="cms-catalog"
+JDBC_URL=$(sed '1q;d' cmsr_cstring)
+USERNAME=$(sed '2q;d' cmsr_cstring)
+PASSWORD=$(sed '3q;d' cmsr_cstring)
+
+me=`basename $0`_$$
+
+if [ -n "$1" ]
+then
+    START_DATE=$1
+else
+    START_DATE=`date +'%F' -d "1 day ago"`
+fi
+
+year=`date +'%Y' -d "$START_DATE"`
+month=`date +'%-m' -d "$START_DATE"`
+day=`date +'%-d' -d "$START_DATE"`
+END_DATE=`date +'%F' -d "$START_DATE + 1 day"`
+
+START_DATE_S=`date +'%s' -d "$START_DATE"`
+END_DATE_S=`date +'%s' -d "$END_DATE"`
+
+LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0`
+
+OUTPUT_FOLDER=$BASE_PATH/diff/date=$START_DATE
+MERGED_FOLDER=$BASE_PATH/merged
+echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron
+echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron
+echo "querying..." >> $LOG_FILE.cron
+#exit;
+
+sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 \
+--query "select ds.name as dataset_name, ds.id as dataset_id, ds.is_open as dataset_is_open, ds.time_create as dataset_time_create, bk.name as block_name, bk.id as block_id, bk.time_create as block_time_create, bk.is_open as block_is_open, f.logical_name as file_lfn, f.id as file_id, f.filesize, f.checksum, f.time_create as file_time_create from cms_transfermgmt.t_dps_dataset ds join cms_transfermgmt.t_dps_block bk on bk.dataset=ds.id join cms_transfermgmt.t_dps_file f on f.inblock=bk.id where f.time_create >= ${START_DATE_S} and f.time_create < ${END_DATE_S} and \$CONDITIONS" \
+--fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \
+1>$LOG_FILE.stdout 2>$LOG_FILE.stderr
+
+OUTPUT_ERROR=`cat $LOG_FILE.stderr | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"`
+TRANSF_INFO=`cat $LOG_FILE.stderr | egrep "INFO mapreduce.ImportJobBase: Transferred"`
+
+if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]]
+then
+    echo "Error occurred, check $LOG_FILE"
+    sendMail $LOG_FILE.stdout cms-transfermgmt-catalog $START_DATE
+    sendMail $LOG_FILE.stderr cms-transfermgmt-catalog $START_DATE
+else
+    hdfs dfs -cat $OUTPUT_FOLDER/part-m-00000 | hdfs dfs -appendToFile - $MERGED_FOLDER/part-m-00000
+fi
+
+#hdfs dfs -put /tmp/$me.stdout $OUTPUT_FOLDER/sqoop.stdout && rm /tmp/$me.stdout
+#hdfs dfs -put /tmp/$me.stderr $OUTPUT_FOLDER/sqoop.stderr && rm /tmp/$me.stderr
+#rm /tmp/$$.stdout /tmp/$$.stderr
diff --git a/sqoop/scripts/rucio_contents.sh b/sqoop/scripts/rucio_contents.sh
new file mode 100755
index 00000000..c88c33a2
--- /dev/null
+++ b/sqoop/scripts/rucio_contents.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+set -e
+
+# Imports CMS_RUCIO_PROD.CONTENTS table for access time of datasets
+
+# Hdfs output path
+BASE_PATH=/project/awg/cms/rucio_contents/
+TARGET_DIR=$BASE_PATH"$(date +%Y-%m-%d)"
+
+# Rucio table name
+TABLE=CMS_RUCIO_PROD.CONTENTS
+# Oracle jdbc conn
+JDBC_URL=jdbc:oracle:thin:@cms-nrac-scan.cern.ch:10121/CMSR_CMS_NRAC.cern.ch
+
+# Log file
+LOG_FILE=log/$(date +'%F_%H%m%S')_$(basename "$0")
+# Timezone
+TZ=UTC
+
+####
+trap 'onFailExit' ERR
+onFailExit() {
+    echo "Finished with error!" >>"$LOG_FILE".stdout
+    echo "Log files: ${LOG_FILE}" >>"$LOG_FILE".stdout
+    echo FAILED
+    exit 1
+}
+
+####
+if [ -f /etc/secrets/rucio ]; then
+    USERNAME=$(grep username </etc/secrets/rucio | awk '{print $2}')
+    PASSWORD=$(grep password </etc/secrets/rucio | awk '{print $2}')
+else
+    echo "Unable to read Rucio credentials from /etc/secrets/rucio! Exiting..." >>"$LOG_FILE".stdout
+    exit 1
+fi
+
+# Check sqoop and hadoop executables exist
+if ! [ -x "$(command -v hadoop)" ]; then
+    echo "It seems 'hadoop' does not exist in PATH! Exiting..." >>"$LOG_FILE".stdout
+    exit 1
+fi
+
+echo "[INFO] Sqoop job for Rucio CONTENTS table is starting..." >>"$LOG_FILE".stdout
+echo "[INFO] Rucio table will be imported: ${TABLE}" >>"$LOG_FILE".stdout
+# Start sqoop import
+/usr/hdp/sqoop/bin/sqoop import \
+    -Dmapreduce.job.user.classpath.first=true \
+    -Doraoop.timestamp.string=false \
+    -Dmapred.child.java.opts="-Djava.security.egd=file:/dev/../dev/urandom" \
+    -Ddfs.client.socket-timeout=120000 \
+    --username "$USERNAME" --password "$PASSWORD" \
+    -z \
+    --direct \
+    --throw-on-error \
+    --connect $JDBC_URL \
+    --num-mappers 100 \
+    --fetch-size 10000 \
+    --as-avrodatafile \
+    --target-dir "$TARGET_DIR" \
+    --table "$TABLE" 1>"$LOG_FILE".stdout 2>"$LOG_FILE".stderr
+
+# change permission of HDFS area
+hadoop fs -chmod -R o+rx "$TARGET_DIR"
+
+{
+    echo "[INFO] Sqoop job for Rucio CONTENTS table is finished."
+    echo "[INFO] Output hdfs path : ${TARGET_DIR}"
+    echo "SUCCESS"
+} >>"$LOG_FILE".stdout
diff --git a/sqoop/scripts/rucio_dids.sh b/sqoop/scripts/rucio_dids.sh
new file mode 100755
index 00000000..285ee910
--- /dev/null
+++ b/sqoop/scripts/rucio_dids.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+set -e
+
+# Imports CMS_RUCIO_PROD.DIDS table for access time of datasets
+
+# Hdfs output path
+BASE_PATH=/project/awg/cms/rucio_dids/
+TARGET_DIR=$BASE_PATH"$(date +%Y-%m-%d)"
+
+# Oracle jdbc conn
+JDBC_URL=jdbc:oracle:thin:@cms-nrac-scan.cern.ch:10121/CMSR_CMS_NRAC.cern.ch
+# Rucio table
+TABLE=CMS_RUCIO_PROD.DIDS
+
+# Local log file for both sqoop job stdout and stderr
+LOG_FILE=log/$(date +'%F_%H%m%S')_$(basename "$0")
+# Timezone
+TZ=UTC
+
+####
+trap 'onFailExit' ERR
+onFailExit() {
+    echo "Finished with error!" >>"$LOG_FILE".stdout
+    echo "FAILED" >>"$LOG_FILE".stdout
+    exit 1
+}
+
+####
+if [ -f /etc/secrets/rucio ]; then
+    USERNAME=$(grep username </etc/secrets/rucio | awk '{print $2}')
+    PASSWORD=$(grep password </etc/secrets/rucio | awk '{print $2}')
+else
+    echo "Unable to read Rucio credentials from /etc/secrets/rucio! Exiting..." >>"$LOG_FILE".stdout
+    exit 1
+fi
+
+# Check sqoop and hadoop executables exist
+if ! [ -x "$(command -v hadoop)" ]; then
+    echo "[ERROR] It seems 'hadoop' does not exist in PATH! Exiting..." >>"$LOG_FILE".stdout
+    exit 1
+fi
+
+{
+    echo "[INFO] Sqoop job for Rucio DIDS table is starting..."
+    echo "[INFO] Rucio table will be imported: ${TABLE}"
+} >>"$LOG_FILE".stdout
+
+# Start sqoop
+/usr/hdp/sqoop/bin/sqoop import \
+    -Dmapreduce.job.user.classpath.first=true \
+    -Doraoop.timestamp.string=false \
+    -Dmapred.child.java.opts="-Djava.security.egd=file:/dev/../dev/urandom" \
+    -Ddfs.client.socket-timeout=120000 \
+    --username "$USERNAME" --password "$PASSWORD" \
+    -z \
+    --direct \
+    --throw-on-error \
+    --connect $JDBC_URL \
+    --num-mappers 100 \
+    --fetch-size 10000 \
+    --as-avrodatafile \
+    --target-dir "$TARGET_DIR" \
+    --table "$TABLE" 1>"$LOG_FILE".stdout 2>"$LOG_FILE".stderr
+
+# change permission of HDFS area
+hadoop fs -chmod -R o+rx "$TARGET_DIR"
+
+{
+    echo "[INFO] Sqoop job for Rucio DIDS table is finished."
+    echo "[INFO] Output hdfs path : ${TARGET_DIR}"
+    echo "SUCCESS"
+} >>"$LOG_FILE".stdout
diff --git a/sqoop/scripts/rucio_replicas.sh b/sqoop/scripts/rucio_replicas.sh
new file mode 100755
index 00000000..8c9fa416
--- /dev/null
+++ b/sqoop/scripts/rucio_replicas.sh
@@ -0,0 +1,24 @@
+# set up variables
+BASE_PATH=/project/awg/cms/rucio/
+JDBC_URL=jdbc:oracle:thin:@cms-nrac-scan.cern.ch:10121/CMSR_CMS_NRAC.cern.ch
+if [ -f /etc/secrets/rucio ]; then
+    USERNAME=`cat /etc/secrets/rucio | grep username | awk '{print $2}'`
+    PASSWORD=`cat /etc/secrets/rucio | grep password | awk '{print $2}'`
+else
+    echo "Unable to read Rucio credentials"
+    exit 1
+fi
+LOG_FILE=log/`date +'%F_%H%m%S'`_`basename $0`
+TABLE=cms_rucio_prod.replicas
+TZ=UTC
+
+/usr/hdp/sqoop/bin/sqoop import -Dmapreduce.job.user.classpath.first=true \
+    -Doraoop.timestamp.string=false \
+    -Dmapred.child.java.opts="-Djava.security.egd=file:/dev/../dev/urandom" \
+    --connect $JDBC_URL --username $USERNAME --password $PASSWORD \
+    --num-mappers 100 --fetch-size 10000 --table $TABLE --as-avrodatafile \
+    -z --direct --target-dir $BASE_PATH`date +%Y-%m-%d`/replicas \
+1>$LOG_FILE.stdout 2>$LOG_FILE.stderr
+
+# change permission of HDFS area
+hadoop fs -chmod -R o+rx $BASE_PATH`date +%Y-%m-%d`
diff --git a/sqoop/scripts/sqoop_utils.sh b/sqoop/scripts/sqoop_utils.sh
new file mode 100644
index 00000000..db46f8d9
--- /dev/null
+++ b/sqoop/scripts/sqoop_utils.sh
@@ -0,0 +1,169 @@
+function setJava()
+{
+    export PATH="$PATH:/usr/hdp/sqoop/bin/"
+    DEFAULT_JAVA_HOME='/usr/lib/jvm/java-1.8.0'
+    if [ -z $1 ]
+    then
+        JAVA_HOME=$DEFAULT_JAVA_HOME
+    else
+        JAVA_HOME=$1
+    fi
+    export JAVA_HOME
+}
+
+function sendMail()
+{
+
+    OUTPUT_ERROR=`cat $1 | egrep -i error`
+
+    SUBJECT="Error in $2 loading [$3]"
+    #MAIL=`cat ~/.forward`
+    if [[ $OUTPUT_ERROR == *"ERROR"* ]]; then
+        (echo "Check file [$1] for more info." && echo "===========" && echo "${OUTPUT_ERROR}") #| mail -s "$SUBJECT" $MAIL
+    fi
+}
+
+function exit_on_failure()
+{
+    OUTPUT_ERROR=`cat $TMP_OUT | egrep "ERROR tool.ImportTool: Error during import: Import job failed!"`
+    TRANSF_INFO=`cat $TMP_ERR | egrep "mapreduce.ImportJobBase: Transferred"`
+    ROWS_TRANSFERED=`grep 'Map output records=0' $TMP_ERR |wc -l `
+
+    if [[ $ROWS_TRANSFERED == "1" ]]
+    then
+        sendMail $LOG_FILE.stderr $SCHEMA $START_DATE
+    fi
+
+    if [[ $OUTPUT_ERROR == *"ERROR"* || ! $TRANSF_INFO == *"INFO"* ]]
+    then
+        echo "Error occurred, check $LOG_FILE"
+        sendMail $LOG_FILE.stdout $SCHEMA $START_DATE
+        sendMail $LOG_FILE.stderr $SCHEMA $START_DATE
+
+        if hdfs dfs -test -e "$BASE_PATH/new"
+        then
+            hdfs dfs -rm -r -skipTrash $BASE_PATH/new >/dev/null 2>&1
+        fi
+        exit 1
+    fi
+}
+
+function clean()
+{
+    kinit -R
+    if hdfs dfs -test -e "$BASE_PATH/new"
+    then
+        hdfs dfs -rm -r -skipTrash $BASE_PATH/new >> $LOG_FILE.cron
+        echo "Removing old $BASE_PATH/new" >> $LOG_FILE.cron
+    fi
+}
+function deploy()
+{
+    kinit -R
+    error=0
+    if ! hdfs dfs -test -e "$BASE_PATH/new"
+    then
+        echo "$BASE_PATH/new DOES NOT EXIST! Nothing to deploy! " >> $LOG_FILE.cron
+        echo "$BASE_PATH/new DOES NOT EXIST! Nothing to deploy! " >> $LOG_FILE.stderr
+        sendMail $LOG_FILE.stderr $SCHEMA $START_DATE
+        return
+    fi
+
+    if hdfs dfs -test -e "$BASE_PATH/old"
+    then
+        echo "Removing old $BASE_PATH/old" >> $LOG_FILE.cron
+        hdfs dfs -rm -r -skipTrash $BASE_PATH/old >>$LOG_FILE.stderr 2>>$LOG_FILE.stderr
+        error=$(($error+$?))
+    fi
+    if hdfs dfs -test -e "$BASE_PATH/current"
+    then
+        echo "Moving $BASE_PATH/current to $BASE_PATH/old" >> $LOG_FILE.cron
+        hdfs dfs -mv $BASE_PATH/current $BASE_PATH/old 2>>$LOG_FILE.stderr
+        error=$(($error+$?))
+    fi
+    echo "Deploying $BASE_PATH/new to $BASE_PATH/current" >> $LOG_FILE.cron
+    hdfs dfs -mv $BASE_PATH/new $BASE_PATH/current 2>>$LOG_FILE.stderr
+    error=$(($error+$?))
+
+    if [ $error -ne 0 ]
+    then
+        echo "ERROR Deployment failed!!!">>$LOG_FILE.stderr
+        sendMail $LOG_FILE.stderr $SCHEMA $START_DATE
+    fi
+}
+
+function import_table()
+{
+    kinit -R
+    TABLE=$1
+    TMP_OUT=log/$TABLE.stdout
+    TMP_ERR=log/$TABLE.stderr
+
+    OUTPUT_FOLDER=$BASE_PATH/new/$TABLE
+    Q="SELECT * FROM ${SCHEMA}.$TABLE F where \$CONDITIONS"
+    echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron
+    echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron
+    echo "querying...$Q" >> $LOG_FILE.cron
+
+    echo "sqoop import..."
+
+    sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query "$Q" \
+    --fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \
+    1>$TMP_OUT 2>$TMP_ERR
+
+
+
+    cat $TMP_OUT >>$LOG_FILE.stdout
+    cat $TMP_ERR >>$LOG_FILE.stderr
+    EXIT_STATUS=$?
+    exit_on_failure
+
+}
+
+
+
+function generateCountQuery
+{
+    TABS=$1
+    i=0
+    QUERY=""
+    for T in $TABS
+    do
+
+        if [ $i -ne 0 ]
+        then
+            QUERY="$QUERY union all "
+        fi
+        i=$((i+1))
+        QUERY="$QUERY select '$T', count (*) from $SCHEMA.$T"
+
+    done
+    QUERY="$QUERY where \$CONDITIONS"
+}
+
+function import_tables()
+{
+    for TABLE_NAME in $1
+    do
+        import_table $TABLE_NAME
+    done
+}
+
+function import_counts()
+{
+    kinit -R
+    TABS=$1
+    generateCountQuery "$TABS"
+
+    OUTPUT_FOLDER=$BASE_PATH/new/ROW_COUNT
+    echo "Timerange: $START_DATE to $END_DATE" >> $LOG_FILE.cron
+    echo "Folder: $OUTPUT_FOLDER" >> $LOG_FILE.cron
+    echo "querying...$QUERY" >> $LOG_FILE.cron
+
+    sqoop import -Dmapreduce.job.user.classpath.first=true -Ddfs.client.socket-timeout=120000 --direct --connect $JDBC_URL --fetch-size 10000 --username $USERNAME --password $PASSWORD --target-dir $OUTPUT_FOLDER -m 1 --query "$QUERY" \
+    --fields-terminated-by , --escaped-by \\ --optionally-enclosed-by '\"' \
+    1>>$LOG_FILE.stdout 2>>$LOG_FILE.stderr
+
+    exit_on_failure
+
+}
diff --git a/sqoop/tests/test_rucio_dids.sh b/sqoop/tests/test_rucio_dids.sh
new file mode 100644
index 00000000..c8b17b75
--- /dev/null
+++ b/sqoop/tests/test_rucio_dids.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+# Create ./log directory if it does not exist
+[ -d ./log ] || mkdir -p ./log
+
+BASE_PATH=/tmp/cmssqoop/rucio_dids/
+
+sed -e "s,BASE_PATH=.*,BASE_PATH=${BASE_PATH},g" \
+    -e "s,WHERE scope,WHERE ROWNUM <= 10 AND scope,g" \
+    /data/sqoop/rucio_dids.sh >.test_rucio_dids.tmp
+#bash .test_rucio_dids.tmp
+#rm .test_rucio_dids.tmp