From 7c13d73601cd1a98b7815c325e62017db27cf79e Mon Sep 17 00:00:00 2001 From: bert <1131482051@qq.com> Date: Tue, 3 Dec 2019 09:07:38 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E5=9B=BE=E7=89=87=E6=95=B0=E6=8D=AE?= =?UTF-8?q?=E9=9B=86=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 删除不必要信息,整合保留图片地址、图片标签和图片内的详细信息 --- datasetHandleCode/datasetHandle.ipynb | 580 ++++++++++++++++++++++++++ 1 file changed, 580 insertions(+) create mode 100644 datasetHandleCode/datasetHandle.ipynb diff --git a/datasetHandleCode/datasetHandle.ipynb b/datasetHandleCode/datasetHandle.ipynb new file mode 100644 index 0000000..d4cede5 --- /dev/null +++ b/datasetHandleCode/datasetHandle.ipynb @@ -0,0 +1,580 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = pd.read_csv('./train-annotations-bbox.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImageIDSourceLabelNameConfidenceXMinXMaxYMinYMaxIsOccludedIsTruncatedIsGroupOfIsDepictionIsInside
0000002b66c9c498exclick/m/01g31710.0125000.1953120.1484380.58750001000
1000002b66c9c498exclick/m/01g31710.0250000.2765630.7140630.94843801000
2000002b66c9c498exclick/m/01g31710.1515620.3109370.1984370.59062510000
3000002b66c9c498exclick/m/01g31710.2562500.4296880.6515630.92500010000
4000002b66c9c498exclick/m/01g31710.2578120.3468750.2359380.38593810000
\n", + "
" + ], + "text/plain": [ + " ImageID Source LabelName Confidence XMin XMax \\\n", + "0 000002b66c9c498e xclick /m/01g317 1 0.012500 0.195312 \n", + "1 000002b66c9c498e xclick /m/01g317 1 0.025000 0.276563 \n", + "2 000002b66c9c498e xclick /m/01g317 1 0.151562 0.310937 \n", + "3 000002b66c9c498e xclick /m/01g317 1 0.256250 0.429688 \n", + "4 000002b66c9c498e xclick /m/01g317 1 0.257812 0.346875 \n", + "\n", + " YMin YMax IsOccluded IsTruncated IsGroupOf IsDepiction \\\n", + "0 0.148438 0.587500 0 1 0 0 \n", + "1 0.714063 0.948438 0 1 0 0 \n", + "2 0.198437 0.590625 1 0 0 0 \n", + "3 0.651563 0.925000 1 0 0 0 \n", + "4 0.235938 0.385938 1 0 0 0 \n", + "\n", + " IsInside \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dataset的数据为每张图像中每个物体的具体位置\n", + "dataset.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1743042" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dataset['ImageID'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dataset1 = pd.read_csv('./train-images-boxable-with-rotation.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1743042" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dataset1['ImageID'].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImageIDSubsetOriginalURLOriginalLandingURLLicenseAuthorProfileURLAuthorTitleOriginalSizeOriginalMD5Thumbnail300KURLRotation
04fa8054781a4c382trainhttps://farm3.staticflickr.com/5310/5898076654...https://www.flickr.com/photos/michael-beat/589...https://creativecommons.org/licenses/by/2.0/https://www.flickr.com/people/michael-beat/Michael Beat...die FNF-Kerze4405052KFukvivpCM5QXl5SqKe41g==https://c1.staticflickr.com/6/5310/5898076654_...0.0
1b37f763ae67d0888trainhttps://c1.staticflickr.com/1/67/197493648_628...https://www.flickr.com/photos/drstarbuck/19749...https://creativecommons.org/licenses/by/2.0/https://www.flickr.com/people/drstarbuck/KarenThree boys on a hill4945559IzEn38GRNsVpATuv7gzEA==https://c3.staticflickr.com/1/67/197493648_628...0.0
27e8584b0f487cb9etrainhttps://c7.staticflickr.com/8/7056/7143870979_...https://www.flickr.com/photos/circasassy/71438...https://creativecommons.org/licenses/by/2.0/https://www.flickr.com/people/circasassy/CircaSassyA Christmas carol and The cricket on the heart...23715843hQwu0iSzY1VIoXiwp0/Mg==https://c7.staticflickr.com/8/7056/7143870979_...0.0
386638230febe21c4trainhttps://farm5.staticflickr.com/5128/5301868579...https://www.flickr.com/photos/ajcreencia/53018...https://creativecommons.org/licenses/by/2.0/https://www.flickr.com/people/ajcreencia/AlexAbbey and Kenny949267onB+rCZnGQg5PRX7xOs18Q==https://c4.staticflickr.com/6/5128/5301868579_...NaN
4249086e72671397dtrainhttps://c6.staticflickr.com/4/3930/15342460029...https://www.flickr.com/photos/codnewsroom/1534...https://creativecommons.org/licenses/by/2.0/https://www.flickr.com/people/codnewsroom/COD NewsroomSuburban Law Enforcement Academy 20th Annivers...6541758MjpaAVbMAWbCusSaxI1D7w==https://c1.staticflickr.com/4/3930/15342460029...0.0
\n", + "
" + ], + "text/plain": [ + " ImageID Subset OriginalURL \\\n", + "0 4fa8054781a4c382 train https://farm3.staticflickr.com/5310/5898076654... \n", + "1 b37f763ae67d0888 train https://c1.staticflickr.com/1/67/197493648_628... \n", + "2 7e8584b0f487cb9e train https://c7.staticflickr.com/8/7056/7143870979_... \n", + "3 86638230febe21c4 train https://farm5.staticflickr.com/5128/5301868579... \n", + "4 249086e72671397d train https://c6.staticflickr.com/4/3930/15342460029... \n", + "\n", + " OriginalLandingURL \\\n", + "0 https://www.flickr.com/photos/michael-beat/589... \n", + "1 https://www.flickr.com/photos/drstarbuck/19749... \n", + "2 https://www.flickr.com/photos/circasassy/71438... \n", + "3 https://www.flickr.com/photos/ajcreencia/53018... \n", + "4 https://www.flickr.com/photos/codnewsroom/1534... \n", + "\n", + " License \\\n", + "0 https://creativecommons.org/licenses/by/2.0/ \n", + "1 https://creativecommons.org/licenses/by/2.0/ \n", + "2 https://creativecommons.org/licenses/by/2.0/ \n", + "3 https://creativecommons.org/licenses/by/2.0/ \n", + "4 https://creativecommons.org/licenses/by/2.0/ \n", + "\n", + " AuthorProfileURL Author \\\n", + "0 https://www.flickr.com/people/michael-beat/ Michael Beat \n", + "1 https://www.flickr.com/people/drstarbuck/ Karen \n", + "2 https://www.flickr.com/people/circasassy/ CircaSassy \n", + "3 https://www.flickr.com/people/ajcreencia/ Alex \n", + "4 https://www.flickr.com/people/codnewsroom/ COD Newsroom \n", + "\n", + " Title OriginalSize \\\n", + "0 ...die FNF-Kerze 4405052 \n", + "1 Three boys on a hill 494555 \n", + "2 A Christmas carol and The cricket on the heart... 2371584 \n", + "3 Abbey and Kenny 949267 \n", + "4 Suburban Law Enforcement Academy 20th Annivers... 6541758 \n", + "\n", + " OriginalMD5 \\\n", + "0 KFukvivpCM5QXl5SqKe41g== \n", + "1 9IzEn38GRNsVpATuv7gzEA== \n", + "2 3hQwu0iSzY1VIoXiwp0/Mg== \n", + "3 onB+rCZnGQg5PRX7xOs18Q== \n", + "4 MjpaAVbMAWbCusSaxI1D7w== \n", + "\n", + " Thumbnail300KURL Rotation \n", + "0 https://c1.staticflickr.com/6/5310/5898076654_... 0.0 \n", + "1 https://c3.staticflickr.com/1/67/197493648_628... 0.0 \n", + "2 https://c7.staticflickr.com/8/7056/7143870979_... 0.0 \n", + "3 https://c4.staticflickr.com/6/5128/5301868579_... NaN \n", + "4 https://c1.staticflickr.com/4/3930/15342460029... 0.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dataset1的数据集为图片集的来源\n", + "dataset1.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# 提取dataset1中的图片ID和url\n", + "dataset2 = dataset1.drop(['Subset','OriginalURL','License','AuthorProfileURL','Author','Title','OriginalSize','OriginalMD5','Thumbnail300KURL','Rotation'],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ImageIDOriginalLandingURL
04fa8054781a4c382https://www.flickr.com/photos/michael-beat/589...
1b37f763ae67d0888https://www.flickr.com/photos/drstarbuck/19749...
\n", + "
" + ], + "text/plain": [ + " ImageID OriginalLandingURL\n", + "0 4fa8054781a4c382 https://www.flickr.com/photos/michael-beat/589...\n", + "1 b37f763ae67d0888 https://www.flickr.com/photos/drstarbuck/19749..." + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "dataset2.to_csv('./urllink.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ImageID', 'OriginalLandingURL'], dtype='object')" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "dataset3 = dataset.drop(['Source','Confidence'],axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['ImageID', 'LabelName', 'XMin', 'XMax', 'YMin', 'YMax', 'IsOccluded',\n", + " 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'],\n", + " dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# save\n", + "dataset3.to_csv('./detail.csv',header=False) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file From 15e65698a0fd8614f5adc135f081470cb0ebadae Mon Sep 17 00:00:00 2001 From: bert <1131482051@qq.com> Date: Wed, 4 Dec 2019 14:00:06 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E9=83=A8=E5=88=86=E7=9A=84=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ImagePoolProject/DatabaseQuery.go | 18 +++++ imagePoolCode/ImagePoolProject/Downloader.go | 68 +++++++++++++++++++ .../ImagePoolProject/conf/config.json | 3 + imagePoolCode/ImagePoolProject/filecontorl.go | 42 ++++++++++++ imagePoolCode/ImagePoolProject/go.mod | 9 +++ imagePoolCode/ImagePoolProject/go.sum | 39 +++++++++++ imagePoolCode/ImagePoolProject/main.go | 24 +++++++ 7 files changed, 203 insertions(+) create mode 100644 imagePoolCode/ImagePoolProject/DatabaseQuery.go create mode 100644 imagePoolCode/ImagePoolProject/Downloader.go create mode 100644 imagePoolCode/ImagePoolProject/conf/config.json create mode 100644 imagePoolCode/ImagePoolProject/filecontorl.go create mode 100644 imagePoolCode/ImagePoolProject/go.mod create mode 100644 imagePoolCode/ImagePoolProject/go.sum create mode 100644 imagePoolCode/ImagePoolProject/main.go diff --git a/imagePoolCode/ImagePoolProject/DatabaseQuery.go b/imagePoolCode/ImagePoolProject/DatabaseQuery.go new file mode 100644 index 0000000..dbfcfb4 --- /dev/null +++ b/imagePoolCode/ImagePoolProject/DatabaseQuery.go @@ -0,0 +1,18 @@ +package main +import ( + "fmt" + "github.com/astaxie/beego/orm" +) + +func init() { +} + +func GroupUrlGet(Group string){ + o:=orm.NewOrm() + var obj []interface{} + _,err:=o.Raw("select imageID from imageLabelTable where labelName = ? and confidence=1 limit 10;",Group).QueryRows(&obj) + if err != nil { + fmt.Println(err) + } + fmt.Printf("%#v",obj) +} \ No newline at end of file diff --git a/imagePoolCode/ImagePoolProject/Downloader.go b/imagePoolCode/ImagePoolProject/Downloader.go new file mode 100644 index 0000000..42fa542 --- /dev/null +++ b/imagePoolCode/ImagePoolProject/Downloader.go @@ -0,0 +1,68 @@ +package main + +import ( + "errors" + "fmt" + "io" + "net/http" + "os" + "github.com/astaxie/beego/httplib" + "strconv" + "strings" + "sync" +) +//图片下载函数,保存图片到TempImage文件夹下 +func ImageDownload(imageID string ,url string )(status bool,err error){ + resp,err := http.Get(url) + if err != nil { + fmt.Println(err) + return false,errors.New("can't connected") + } + if resp.StatusCode==200{ + img,err:=os.OpenFile("./TempImage/"+imageID+".jpg",os.O_WRONLY|os.O_CREATE|os.O_EXCL,0666) + defer img.Close() + if err != nil { + println(err) + } + image:=make([]byte,10240) + for { + n,err := resp.Body.Read(image) + if err == io.EOF { + break + } + n,err = img.Write(image[:n]) + if err != nil { + fmt.Println("img write:",err) + } + } + return true ,nil + } else{ + err:=errors.New("StatusCode Error") + return false,err + } +} +func urlDownload(url string)[]string{ + + resp:=httplib.Get(url) + text,err:=resp.String() + if err != nil { + fmt.Println(err) + } + urls:=strings.Split(text,"\n") + return urls +} +func ImageNetDownload(urls []string){ + println(len(urls)) + w:=sync.WaitGroup{} + + for i:=0;i Date: Sat, 7 Dec 2019 20:17:15 +0800 Subject: [PATCH 3/3] 12.7 update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 更新下载器部分代码,解决图片下载损坏问题和次序问题 --- imagePoolCode/ImagePoolProject/Downloader.go | 68 +++++++++++++++++--- imagePoolCode/ImagePoolProject/main.go | 64 +++++++++++++++--- 2 files changed, 114 insertions(+), 18 deletions(-) diff --git a/imagePoolCode/ImagePoolProject/Downloader.go b/imagePoolCode/ImagePoolProject/Downloader.go index 42fa542..0054444 100644 --- a/imagePoolCode/ImagePoolProject/Downloader.go +++ b/imagePoolCode/ImagePoolProject/Downloader.go @@ -11,15 +11,20 @@ import ( "strings" "sync" ) + +const ( + ChanSize int = 5 +) + //图片下载函数,保存图片到TempImage文件夹下 -func ImageDownload(imageID string ,url string )(status bool,err error){ +func ImageDownload(imageID string ,url string,label string)(status bool,err error){ resp,err := http.Get(url) if err != nil { fmt.Println(err) return false,errors.New("can't connected") } if resp.StatusCode==200{ - img,err:=os.OpenFile("./TempImage/"+imageID+".jpg",os.O_WRONLY|os.O_CREATE|os.O_EXCL,0666) + img,err:=os.OpenFile("./TempImage/"+label+"/"+imageID+".jpg",os.O_WRONLY|os.O_CREATE,0666) defer img.Close() if err != nil { println(err) @@ -30,7 +35,38 @@ func ImageDownload(imageID string ,url string )(status bool,err error){ if err == io.EOF { break } - n,err = img.Write(image[:n]) + _,err = img.Write(image[:n]) + if err != nil { + fmt.Println("img write:",err) + } + } + return true ,nil + } else{ + err:=errors.New("StatusCode Error") + return false,err + } +} +func ImageDownload2(count chan int,url string,label string)(status bool,err error){ + resp,err := http.Get(url) + if err != nil { + fmt.Println(err) + return false,errors.New("can't connected") + } + if resp.StatusCode==200{ + a:=<-count + println(a,url) + img,err:=os.OpenFile("./TempImage/"+label+"/"+strconv.Itoa(a)+".jpg",os.O_WRONLY|os.O_CREATE,0777) + defer img.Close() + if err != nil { + println(err) + } + image:=make([]byte,256) + for { + n,err := resp.Body.Read(image) + if err == io.EOF { + break + } + _,err = img.Write(image[:n]) if err != nil { fmt.Println("img write:",err) } @@ -51,18 +87,32 @@ func urlDownload(url string)[]string{ urls:=strings.Split(text,"\n") return urls } -func ImageNetDownload(urls []string){ +func ImageNetDownload(urls []string,label string)(int){ println(len(urls)) w:=sync.WaitGroup{} - + lock:=sync.Mutex{} + sum :=9 + count:=make(chan int,ChanSize) + for i:=0;i