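// Listing metadata from the file viewer: 394 lines, 12 KiB, JavaScript.
// Drouot scraper (class DrouotData).
// NOTE(review): this header previously read "interencheresData.js", which does not
// match the class defined below — confirm the intended file name.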
|
|
const {ScraperTools} = require('../Scraper');
|
|
const urlModule = require('url');
|
|
const moment = require('moment-timezone');
|
|
const { Console } = require('console');
|
|
const { title } = require('process');
|
|
|
|
/** Origin used to resolve relative links scraped from drouot.com pages. */
const DROUOT_ORIGIN = 'https://drouot.com';

/**
 * Normalises the value returned by a ScraperTools text/attribute helper.
 * NOTE(review): the helpers live in ../Scraper and are not visible here; they
 * presumably return '' when nothing matches, but anything that is not a string
 * is mapped to '' so that callers can safely use string methods.
 *
 * @param {*} value - Raw helper result.
 * @returns {string} The value itself when it is a string, '' otherwise.
 */
function asText(value) {
  return typeof value === 'string' ? value : '';
}

/**
 * Extracts the platform lot identifier from a lot link.
 * Tries the Drouot URL shape first (/fr/l/{LotID}-slug) and falls back to the
 * historical "lot-{id}.html" pattern this file used before.
 *
 * @param {*} href - Link target scraped from a lot card.
 * @returns {string} The identifier, or '' when the link matches neither shape.
 */
function extractLotId(href) {
  const link = asText(href);
  const match = link.match(/\/l\/(\d+)/) ?? link.match(/lot-(.*)\.html/);
  return match ? match[1] : '';
}

/**
 * Scraper for drouot.com lot and sale pages.
 *
 * Every public member is an arrow-function class field, so it stays bound to
 * the instance even when passed around as a callback. DOM access goes through
 * helpers inherited from ScraperTools (getTextContent, getAttribute,
 * getTextContentElement, getAttributeElement); NOTE(review): their exact
 * contract is defined in ../Scraper and should be confirmed there.
 */
class DrouotData extends ScraperTools {
  /** Platform identifier written on every scraped lot and used in log lines. */
  _Name = 'drouot';

  /** Live-sale payload injected through setLiveData(); null until then. */
  _LiveData = null;

  /**
   * Splits a Drouot URL into its type and identifier.
   *
   * URL Lot  : https://drouot.com/fr/l/25184163-john-conde-17651794-britanniqu
   *            https://drouot.com/fr/{{v: Sale / l: Lot}}/{{LotID}}-slug
   * URL Sale : https://drouot.com/fr/v/152658-fine-paintings-and-frames
   *            https://drouot.com/fr/{{v: Sale / l: Lot}}/{{SaleID}}-slug
   *
   * @param {string} url - Absolute URL of a lot or a sale.
   * @returns {Promise<{TypeUrl: string, saleID: (string|number), lotID: (string|number), urlSale: string, urlLot: string}>}
   *   IDs are strings taken from the URL; fields that do not apply keep their
   *   defaults (0 for IDs, '' for strings).
   * @throws {TypeError} When `url` is not a valid absolute URL.
   */
  getUrlInfo = async (url) => {
    const info = {
      TypeUrl: '',
      saleID: 0,
      lotID: 0,
      urlSale: '',
      urlLot: '',
    };

    const parsedUrl = new urlModule.URL(url);
    // pathname looks like /{lang}/{v|l}/{id}-{slug}
    const [, kind, slug] = parsedUrl.pathname.split('/').filter(Boolean);
    if (!slug) {
      // No "{id}-slug" segment: nothing to extract, return the defaults
      // instead of crashing on an undefined path part.
      return info;
    }

    const id = slug.split('-')[0];
    // Query string and fragment are dropped on purpose.
    const cleanUrl = parsedUrl.origin + parsedUrl.pathname;

    if (kind === 'v') {
      info.TypeUrl = 'Sale';
      info.saleID = id;
      info.urlSale = cleanUrl;
    } else if (kind === 'l') {
      info.TypeUrl = 'Lot';
      info.lotID = id;
      info.urlLot = cleanUrl;
    }

    return info;
  };

  // ## Lot

  /**
   * Opens a lot page and collects the URLs of its full-HD pictures by clicking
   * through the carousel ("next" button) until an already-seen picture is
   * requested again.
   *
   * @param {object} page - Browser page exposing on/off, goto, waitForTimeout and $x.
   * @param {string} Url - Lot page URL.
   * @param {number} [maxClicks=100] - Safety cap on carousel clicks, so the loop
   *   ends even when no duplicate picture ever shows up.
   * @returns {Promise<string[]>} Unique picture URLs in the order they were loaded.
   */
  getPictures = async (page, Url, maxClicks = 100) => {
    console.log("getPictures "+this._Name+": "+Url);

    const pictures = [];

    // Record every full-HD image response once its body has been received.
    const onResponse = async (response) => {
      const url = response.url();
      if (response.request().resourceType() !== 'image' || !url.includes('size=fullHD')) {
        return;
      }
      try {
        await response.buffer();
      } catch (error) {
        // A response without a readable body is skipped instead of producing
        // an unhandled promise rejection.
        console.error('getPictures '+this._Name+': unreadable image '+url, error);
        return;
      }
      console.log("push "+url);
      pictures.push(url);
    };

    page.on('response', onResponse);
    try {
      // Navigate the page to a URL
      await page.goto(Url);
      await page.waitForTimeout(500);

      // Get the "next" control of the carousel
      const [nextButton] = await page.$x("//*[@id='next']");

      if (nextButton) {
        for (let click = 0; click < maxClicks; click += 1) {
          console.log("click");
          await nextButton.evaluate((b) => b.click());
          await page.waitForTimeout(500);

          // A duplicate means the carousel wrapped around: everything was seen.
          if (new Set(pictures).size !== pictures.length) {
            break;
          }
        }
      }
    } finally {
      // Always detach the listener, otherwise every call leaks one handler
      // that keeps pushing into a stale list.
      const detach = page.off ?? page.removeListener;
      detach?.call(page, 'response', onResponse);
    }

    return [...new Set(pictures)];
  };

  /**
   * Reads the lot number shown on a lot page.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<string>} The label with its first "Lot " removed, or ''.
   */
  getLotNumber = async (page) => {
    const lotNumberXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[1]/span/span',
    ];
    const label = asText(await this.getTextContent(lotNumberXPath, page, 'lotNumberXPath'));

    return label.replace('Lot ', '');
  };

  /**
   * Reads the lot title, shortened to 90 characters plus '...' when longer.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<string>} The (possibly truncated) title, or ''.
   */
  getLotTitle = async (page) => {
    const lotTitleXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[5]/h1',
    ];
    const title = asText(await this.getTextContent(lotTitleXPath, page, 'lotTitleXPath'));

    return title.length > 90 ? title.substring(0, 90) + '...' : title;
  };

  /**
   * Reads the low/high estimate of a lot.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<{EstimateLow: number, EstimateHigh: number}>} First and
   *   second number found in the estimate text; 0 for any that is missing.
   */
  getEstimate = async (page) => {
    const EstimateXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[1]/span/span',
    ];
    const estimateText = asText(await this.getTextContent(EstimateXPath, page, 'EstimateXPath'));

    let EstimateLow = 0;
    let EstimateHigh = 0;

    if (estimateText !== '') {
      // Either a number grouped by whitespace ("1 500") or a plain digit run
      // ("1500"). The plain alternative keeps ungrouped amounts in one piece.
      const amounts = estimateText.match(/\d{1,3}(?:\s\d{3})+|\d+/g);
      const toInt = (raw) => Number.parseInt(raw.replace(/\s/g, ''), 10);

      if (!amounts) {
        console.error('Could not extract numbers.');
      } else if (amounts.length >= 2) {
        EstimateLow = toInt(amounts[0]);
        EstimateHigh = toInt(amounts[1]);

        console.log('Low:', EstimateLow);
        console.log('High:', EstimateHigh);
      } else {
        // A single amount is reported as the low estimate only.
        EstimateLow = toInt(amounts[0]);
      }
    }

    return {EstimateLow, EstimateHigh};
  };

  /**
   * Reads the lot description.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<*>} Whatever getTextContent returns for the description node.
   */
  getDescription = async (page) => {
    const DescriptionXPath = [
      '//h3[contains(@class, "descriptionLineWrap")]',
    ];

    return this.getTextContent(DescriptionXPath, page, 'DescriptionXPath');
  };

  /**
   * Reads the buyer fees of a lot.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<{feesText: string, fees: number}>} The whitespace-normalised
   *   label and the first number it contains (decimal comma or point), 0 if none.
   */
  getFees = async (page) => {
    const primaryXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[3]/a/span',
    ];
    const fallbackXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[7]/div/div[2]/a/span',
    ];

    let feesText = asText(await this.getTextContent(primaryXPath, page, 'FeesXPath'));

    // No digit in the first candidate: the fees sit in the neighbouring cell.
    if (!/\d/.test(feesText)) {
      feesText = asText(await this.getTextContent(fallbackXPath, page, 'FeesXPath'));
    }

    feesText = feesText.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();

    // Accept both "25.5" and the French "25,5"; always hand back a number so
    // the type matches the 0 default.
    const match = feesText.match(/\d+(?:[.,]\d+)?/);
    const fees = match ? Number.parseFloat(match[0].replace(',', '.')) : 0;

    return {feesText, fees};
  };

  /**
   * Extracts the lot identifier from a lot URL.
   *
   * @param {string} url - Absolute lot URL.
   * @returns {Promise<string|number>} The lot ID, or 0 when the URL is not a lot URL.
   */
  getLotID = async (url) => {
    const { lotID } = await this.getUrlInfo(url);
    return lotID;
  };

  /**
   * Finds the sale a lot belongs to by following the catalogue link of the lot page.
   *
   * @param {object} page - Browser page showing a lot.
   * @returns {Promise<{id_sale: (string|number), urlSale: string}>} Sale ID and
   *   canonical sale URL; 0 and '' when the link is missing or is not a sale URL.
   */
  getSaleID = async (page) => {
    const UrlCatalogueXPath = [
      '/html/body/div[1]/div[4]/div[2]/div/div/div/div[3]/div/div[1]/div[2]/div[10]/div/div[2]/div[3]/a[1]',
    ];
    const href = asText(await this.getAttribute(UrlCatalogueXPath, page, "href", "UrlCatalogueXPath"));
    console.log('UrlCatalogue : '+href);

    if (href === '') {
      // Without a catalogue link there is nothing to parse.
      return {id_sale: 0, urlSale: ''};
    }

    // Resolve site-relative links ("/fr/v/...") against the Drouot origin.
    const catalogueUrl = new urlModule.URL(href, DROUOT_ORIGIN).href;
    const { saleID, urlSale } = await this.getUrlInfo(catalogueUrl);

    return {id_sale: saleID, urlSale};
  };

  // ## Lot List

  /**
   * Builds the lot summaries for the lot cards of one result page.
   * Cards whose data cannot be read or whose link holds no identifier are
   * logged and skipped.
   *
   * @param {object} page - Browser page showing the sale.
   * @param {Iterable<object>} Elements - Element handles of the lot cards.
   * @returns {Promise<Array<{title: string, idPlatform: string, platform: string, lotNumber: (string|undefined)}>>}
   */
  _getLotInfoList = async (page, Elements) => {
    const LotnameXPath = [
      './/a/div/div/div[5]/div/div[1]',
      './/a/div/div/div[4]/div/div[1]',
    ];
    const LotUrlXPath = [
      './/a',
    ];

    const lots = [];
    for (const element of Elements) {
      try {
        const Lotname = await this.getTextContentElement(LotnameXPath, page, element, 'LotnameXPath');
        const urlLot = await this.getAttributeElement(LotUrlXPath, page, element, 'href', 'UrlListLot');

        // idPlatform comes from the lot URL
        const idPlatform = extractLotId(urlLot);
        if (idPlatform === '') {
          console.error('No lot identifier found in URL: '+urlLot);
          continue;
        }

        lots.push({
          title: Lotname,
          idPlatform,
          platform: this._Name,
          lotNumber: Lotname.split('Lot ')[1],
        });
      } catch (e) {
        console.error(e);
      }
    }

    return lots;
  };

  /**
   * Walks through the paginated lot list of a sale and gathers every lot.
   *
   * @param {object} page - Browser page already showing the sale.
   * @param {number} [maxPages=500] - Safety cap on visited pages, in case the
   *   "next page" button never becomes disabled.
   * @returns {Promise<object[]>} Lot summaries as produced by _getLotInfoList.
   */
  getLotList = async (page, maxPages = 500) => {
    const lotCardXPath = '//div[contains(@class, "sale-item-wrapper")]';
    // Only matches the button while it is enabled
    const nextPageXPath = "//button[contains(@aria-label, 'Page suivante') and not(contains(@class, 'v-pagination__navigation--disabled'))]";

    const lots = [];
    for (let visited = 0; visited < maxPages; visited += 1) {
      const cards = await page.$x(lotCardXPath);
      if (cards.length > 0) {
        lots.push(...await this._getLotInfoList(page, cards));
      }

      const [nextPageButton] = await page.$x(nextPageXPath);
      if (!nextPageButton) {
        break;
      }

      await nextPageButton.evaluate((b) => b.click());
      await page.waitForTimeout(1000);
      console.log('Next Page');
    }

    return lots;
  };

  // ## Sale

  /**
   * Reads the sale title.
   *
   * @param {object} page - Browser page showing a sale.
   * @returns {Promise<*>} Whatever getTextContent returns for the title node.
   */
  getSaleTitle = async (page) => {
    const SaleTitleXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/h1',
    ];

    return this.getTextContent(SaleTitleXPath, page, 'SaleTitleXPath');
  };

  /**
   * Reads the sale date and returns it formatted by moment (Europe/Paris).
   *
   * NOTE(review): live-sale detection (checking for the
   * '//*[@id="streaming-subscriber"]' player and answering with
   * moment.tz('Europe/Paris').format()) was commented out in the original, so
   * every sale is handled as an upcoming one.
   *
   * @param {object} page - Browser page showing a sale.
   * @returns {Promise<string>} moment's format() output; the text "Invalid date"
   *   when the page text cannot be parsed.
   */
  getSaleDate = async (page) => {
    await page.waitForTimeout(400);

    const SaleDateXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/div[1]/div',
    ];
    const rawDate = asText(await this.getTextContent(SaleDateXPath, page, 'SaleDateXPath'));

    // Collapse every run of whitespace (including line breaks) to one space.
    const cleanStr = rawDate.trim().replace(/\s+/g, ' ');

    const SaleDate = moment.tz(cleanStr, 'dddd D MMMM à HH:mm (z)', 'fr', 'Europe/Paris').format();

    console.log('SaleDate : '+SaleDate);
    return SaleDate;
  };

  /**
   * Reads the sale location.
   *
   * @param {object} page - Browser page showing a sale.
   * @returns {Promise<string>} Trimmed location text, or ''.
   */
  getSaleLocation = async (page) => {
    const SaleLocationXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/div[4]',
    ];
    const location = asText(await this.getTextContent(SaleLocationXPath, page, 'SaleLocationXPath'));

    return location.trim();
  };

  /**
   * Reads the name of the auction house running the sale.
   *
   * @param {object} page - Browser page showing a sale.
   * @returns {Promise<string>} Name with line breaks removed and whitespace collapsed, or ''.
   */
  getSaleHouseName = async (page) => {
    const SaleHouseNameXPath = [
      '/html/body/div/div[4]/div[2]/div/div/div/div/div[3]/div/div[2]/div[1]/div/h4/a[1]/span',
    ];
    const houseName = asText(await this.getTextContent(SaleHouseNameXPath, page, 'SaleHouseNameXPath'));

    return houseName.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();
  };

  // ## Live Data

  /**
   * Stores the live-sale payload used by getLiveDataLot.
   *
   * @param {*} Data - Payload; getLiveDataLot iterates over its `lots` property.
   */
  setLiveData = (Data) => {
    this._LiveData = Data;
  };

  /**
   * Looks a lot up in the stored live data.
   *
   * @param {*} lotId - Identifier compared with `===` against each lot's `id`.
   * @returns {Promise<object|undefined>} The matching lot; undefined when there
   *   is no match or no live data has been stored yet.
   */
  getLiveDataLot = async (lotId) => {
    const lots = this._LiveData?.lots;
    if (lots == null) {
      return undefined;
    }

    for (const lot of lots) {
      if (lot.id === lotId) {
        return lot;
      }
    }
    return undefined;
  };
}
|
|
|
|
module.exports = DrouotData |