Clickhouse高可用集群搭建

April 11, 2020Comments

Clickhouse高可用集群搭建

一、环境信息

$ uname -a
Linux clicehouse-01.t4x.org 3.10.0-957.el7.x86_64 #1 SMP Thu Nov 8 23:39:32 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
$ cat /etc/redhat-release 
CentOS Linux release 7.6.1810 (Core)

$ uname -a

Linux clicehouse-01.t4x.org 3.10.0-957.el7.x86_64 #1 SMP Thu Nov 8 23:39:32 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

$ cat /etc/redhat-release

CentOS Linux release 7.6.1810 (Core)

文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/

二、clickhouse安装

$ sudo yum install yum-utils
$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64
$ sudo yum install clickhouse-server clickhouse-client

$ sudo yum install yum-utils

$ sudo yum-config-manager --add-repo https://repo.yandex.ru/clickhouse/rpm/stable/x86_64

$ sudo yum install clickhouse-server clickhouse-client

文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/

三、clickhouse配置

$ cat /etc/clickhouse-server/config.xml
<?xml version="1.0"?>
<!--
  NOTE: User and query level settings are set up in "users.xml" file.
-->
<yandex>
    <logger>
        <!-- Possible levels: https://github.com/pocoproject/poco/blob/develop/Foundation/include/Poco/Logger.h#L105 -->
        <level>trace</level>
        <log>/var/log/clickhouse-server/clickhouse-server.log</log>
        <errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>
        <size>1000M</size>
        <count>10</count>
        <!-- <console>1</console> --> <!-- Default behavior is autodetection (log to console if not daemon mode and is tty) -->
    </logger>
    <!--display_name>production</display_name--> <!-- It is the name that will be shown in the client -->
    <http_port>8123</http_port>
    <tcp_port>9000</tcp_port>

    <!-- For HTTPS and SSL over native protocol. -->
    <!--
    <https_port>8443</https_port>
    <tcp_port_secure>9440</tcp_port_secure>
    -->

    <!-- Used with https_port and tcp_port_secure. Full ssl options list: https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h#L71 -->
    <openSSL>
        <server> <!-- Used for https server AND secure tcp port -->
            <!-- openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout /etc/clickhouse-server/server.key -out /etc/clickhouse-server/server.crt -->
            <certificateFile>/etc/clickhouse-server/server.crt</certificateFile>
            <privateKeyFile>/etc/clickhouse-server/server.key</privateKeyFile>
            <!-- openssl dhparam -out /etc/clickhouse-server/dhparam.pem 4096 -->
            <dhParamsFile>/etc/clickhouse-server/dhparam.pem</dhParamsFile>
            <verificationMode>none</verificationMode>
            <loadDefaultCAFile>true</loadDefaultCAFile>
            <cacheSessions>true</cacheSessions>
            <disableProtocols>sslv2,sslv3</disableProtocols>
            <preferServerCiphers>true</preferServerCiphers>
        </server>

        <client> <!-- Used for connecting to https dictionary source -->
            <loadDefaultCAFile>true</loadDefaultCAFile>
            <cacheSessions>true</cacheSessions>
            <disableProtocols>sslv2,sslv3</disableProtocols>
            <preferServerCiphers>true</preferServerCiphers>
            <!-- Use for self-signed: <verificationMode>none</verificationMode> -->
            <invalidCertificateHandler>
                <!-- Use for self-signed: <name>AcceptCertificateHandler</name> -->
                <name>RejectCertificateHandler</name>
            </invalidCertificateHandler>
        </client>
    </openSSL>

    <!-- Default root page on http[s] server. For example load UI from https://tabix.io/ when opening http://localhost:8123 -->
    <!--
    <http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
    -->

    <!-- Port for communication between replicas. Used for data exchange. -->
    <interserver_http_port>9009</interserver_http_port>

    <!-- Hostname that is used by other replicas to request this server.
         If not specified, then it is determined analogously to the 'hostname -f' command.
         This setting could be used to switch replication to another network interface.
      -->
    <interserver_http_host>clicehouse-01.t4x.org</interserver_http_host>

    <!-- Listen specified host. use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere. -->
    <!-- <listen_host>::</listen_host> -->
    <!-- Same for hosts with disabled ipv6: -->
    <listen_host>0.0.0.0</listen_host> 

    <!-- Default values - try listen localhost on ipv4 and ipv6: -->
    <!--
    <listen_host>::1</listen_host>
    <listen_host>127.0.0.1</listen_host>
    -->
    <!-- Don't exit if ipv6 or ipv4 unavailable, but listen_host with this protocol specified -->
    <!-- <listen_try>0</listen_try> -->

    <!-- Allow listen on same address:port -->
    <!-- <listen_reuse_port>0</listen_reuse_port> -->

    <!-- <listen_backlog>64</listen_backlog> -->

    <max_connections>4096</max_connections>
    <keep_alive_timeout>3</keep_alive_timeout>

    <!-- Maximum number of concurrent queries. -->
    <max_concurrent_queries>100</max_concurrent_queries>

    <!-- Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve
         correct maximum value. -->
    <!-- <max_open_files>262144</max_open_files> -->

    <!-- Size of cache of uncompressed blocks of data, used in tables of MergeTree family.
         In bytes. Cache is single for server. Memory is allocated only on demand.
         Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
         Uncompressed cache is advantageous only for very short queries and in rare cases.
      -->
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>

    <!-- Approximate size of mark cache, used in tables of MergeTree family.
         In bytes. Cache is single for server. Memory is allocated only on demand.
         You should not lower this value.
      -->
    <mark_cache_size>5368709120</mark_cache_size>


    <!-- Path to data directory, with trailing slash. -->
    <path>/data/clickhouse/data/</path>

    <!-- Path to temporary data for processing hard queries. -->
    <tmp_path>/data/clickhouse/tmp/</tmp_path>

    <!-- Directory with user provided files that are accessible by 'file' table function. -->
    <user_files_path>/data/clickhouse/user_files/</user_files_path>

    <!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
    <users_config>users.xml</users_config>

    <!-- Default profile of settings. -->
    <default_profile>default</default_profile>

    <!-- System profile of settings. These settings are used by internal processes (Buffer storage, Distributed DDL worker and so on). -->
    <!-- <system_profile>default</system_profile> -->

    <!-- Default database. -->
    <default_database>default</default_database>
    <timezone>Asia/Shanghai</timezone>
    <!-- Server time zone could be set here.

         Time zone is used when converting between String and DateTime types,
          when printing DateTime in text formats and parsing DateTime from text,
          it is used in date and time related functions, if specific time zone was not passed as an argument.

         Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.
         If not specified, system time zone at server startup is used.

         Please note, that server could display time zone alias instead of specified name.
         Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.
    -->
    <!-- <timezone>Europe/Moscow</timezone> -->

    <!-- You can specify umask here (see "man umask"). Server will apply it on startup.
         Number is always parsed as octal. Default umask is 027 (other users cannot read logs, data files, etc; group can only read).
    -->
    <!-- <umask>022</umask> -->

    <!-- Perform mlockall after startup to lower first queries latency
          and to prevent clickhouse executable from being paged out under high IO load.
         Enabling this option is recommended but will lead to increased startup time for up to a few seconds.
    -->
    <mlock_executable>false</mlock_executable>
    <remote_servers incl="clickhouse_remote_servers" optional="true" />
    <!-- Configuration of clusters that could be used in Distributed tables.
         https://clickhouse.yandex/docs/en/table_engines/distributed/
    <remote_servers incl="clickhouse_remote_servers" >
        <test_shard_localhost>
            <shard>
                <replica>
                    <host>localhost</host>
                    <port>9000</port>
                </replica>
            </shard>
        </test_shard_localhost>
    </remote_servers>

      -->
    <!-- If an element has an 'incl' attribute, then the corresponding substitution from another file will be used as its value.
         By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.
         Values for substitutions are specified in /yandex/name_of_substitution elements in that file.
      -->

    <!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.
         Optional. If you don't use replicated tables, you could omit that.

         See https://clickhouse.yandex/docs/en/table_engines/replication/
      -->

    <zookeeper incl="zookeeper-servers" optional="true" />

    <!-- Substitutions for parameters of replicated tables.
          Optional. If you don't use replicated tables, you could omit that.

         See https://clickhouse.yandex/docs/en/table_engines/replication/#creating-replicated-tables
      -->
    <macros incl="macros" optional="true" />


    <!-- Reloading interval for embedded dictionaries, in seconds. Default: 3600. -->
    <builtin_dictionaries_reload_interval>3600</builtin_dictionaries_reload_interval>


    <!-- Maximum session timeout, in seconds. Default: 3600. -->
    <max_session_timeout>3600</max_session_timeout>

    <!-- Default session timeout, in seconds. Default: 60. -->
    <default_session_timeout>60</default_session_timeout>

    <!-- Sending data to Graphite for monitoring. Several sections can be defined. -->
    <!--
        interval - send every X second
        root_path - prefix for keys
        hostname_in_path - append hostname to root_path (default = true)
        metrics - send data from table system.metrics
        events - send data from table system.events
        asynchronous_metrics - send data from table system.asynchronous_metrics
    -->
    <!--
    <graphite>
        <host>localhost</host>
        <port>42000</port>
        <timeout>0.1</timeout>
        <interval>60</interval>
        <root_path>one_min</root_path>
        <hostname_in_path>true</hostname_in_path>

        <metrics>true</metrics>
        <events>true</events>
        <events_cumulative>false</events_cumulative>
        <asynchronous_metrics>true</asynchronous_metrics>
    </graphite>
    <graphite>
        <host>localhost</host>
        <port>42000</port>
        <timeout>0.1</timeout>
        <interval>1</interval>
        <root_path>one_sec</root_path>

        <metrics>true</metrics>
        <events>true</events>
        <events_cumulative>false</events_cumulative>
        <asynchronous_metrics>false</asynchronous_metrics>
    </graphite>
    -->


    <!-- Query log. Used only for queries with setting log_queries = 1. -->
    <query_log>
        <!-- Table to insert data into. If the table does not exist, it will be created.
             When query log structure is changed after system update,
              then old table will be renamed and new table will be created automatically.
        -->
        <database>system</database>
        <table>query_log</table>
        <!--
            PARTITION BY expr https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/
            Example:
                event_date
                toMonday(event_date)
                toYYYYMM(event_date)
                toStartOfHour(event_time)
        -->
        <partition_by>toYYYYMM(event_date)</partition_by>
        <!-- Interval of flushing data. -->
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>

    <!-- Trace log. Stores stack traces collected by query profilers.
         See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->
    <trace_log>
        <database>system</database>
        <table>trace_log</table>

        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>

    <!-- Query thread log. Has information about all threads participated in query execution.
         Used only for queries with setting log_query_threads = 1. -->
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>

    <!-- Uncomment if use part log.
         Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).
    <part_log>
        <database>system</database>
        <table>part_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </part_log>
    -->

    <!-- Uncomment to write text log into table.
         Text log contains all information from usual server log but stores it in structured and efficient way.
    <text_log>
        <database>system</database>
        <table>text_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </text_log>
    -->

    <!-- Uncomment to write metric log into table.
         Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected with "collect_interval_milliseconds" interval.
    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
    </metric_log>
    -->

    <!-- Parameters for embedded dictionaries, used in Yandex.Metrica.
         See https://clickhouse.yandex/docs/en/dicts/internal_dicts/
    -->

    <!-- Path to file with region hierarchy. -->
    <!-- <path_to_regions_hierarchy_file>/opt/geo/regions_hierarchy.txt</path_to_regions_hierarchy_file> -->

    <!-- Path to directory with files containing names of regions -->
    <!-- <path_to_regions_names_files>/opt/geo/</path_to_regions_names_files> -->


    <!-- Configuration of external dictionaries. See:
         https://clickhouse.yandex/docs/en/dicts/external_dicts/
    -->
    <dictionaries_config>*_dictionary.xml</dictionaries_config>

    <!-- Uncomment if you want data to be compressed 30-100% better.
         Don't do that if you just started using ClickHouse.
      -->
    <compression incl="clickhouse_compression">
    <!--
        <!- - Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used. - ->
        <case>

            <!- - Conditions. All must be satisfied. Some conditions may be omitted. - ->
            <min_part_size>10000000000</min_part_size>        <!- - Min part size in bytes. - ->
            <min_part_size_ratio>0.01</min_part_size_ratio>   <!- - Min size of part relative to whole table size. - ->

            <!- - What compression method to use. - ->
            <method>zstd</method>
        </case>
    -->
    </compression>

    <!-- Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.
         Works only if ZooKeeper is enabled. Comment it if such functionality isn't required. -->
    <distributed_ddl>
        <!-- Path in ZooKeeper to queue with DDL queries -->
        <path>/clickhouse/task_queue/ddl</path>

        <!-- Settings from this profile will be used to execute DDL queries -->
        <!-- <profile>default</profile> -->
    </distributed_ddl>

    <!-- Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h -->
    <!--
    <merge_tree>
        <max_suspicious_broken_parts>5</max_suspicious_broken_parts>
    </merge_tree>
    -->

    <!-- Protection from accidental DROP.
         If the size of a MergeTree table is greater than max_table_size_to_drop (in bytes), then the table cannot be dropped with any DROP query.
         If you want to delete one table and don't want to restart clickhouse-server, you can create the special file <clickhouse-path>/flags/force_drop_table and run DROP once.
         By default max_table_size_to_drop is 50GB; max_table_size_to_drop=0 allows to DROP any tables.
         The same for max_partition_size_to_drop.
         Uncomment to disable protection.
    -->
    <!-- <max_table_size_to_drop>0</max_table_size_to_drop> -->
    <!-- <max_partition_size_to_drop>0</max_partition_size_to_drop> -->

    <!-- Example of parameters for GraphiteMergeTree table engine -->
    <graphite_rollup_example>
        <pattern>
            <regexp>click_cost</regexp>
            <function>any</function>
            <retention>
                <age>0</age>
                <precision>3600</precision>
            </retention>
            <retention>
                <age>86400</age>
                <precision>60</precision>
            </retention>
        </pattern>
        <default>
            <function>max</function>
            <retention>
                <age>0</age>
                <precision>60</precision>
            </retention>
            <retention>
                <age>3600</age>
                <precision>300</precision>
            </retention>
            <retention>
                <age>86400</age>
                <precision>3600</precision>
            </retention>
        </default>
    </graphite_rollup_example>

    <!-- Directory in <clickhouse-path> containing schema files for various input formats.
         The directory will be created if it doesn't exist.
      -->
    <format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>


    <!-- Uncomment to use query masking rules.
        name - name for the rule (optional)
        regexp - RE2 compatible regular expression (mandatory)
        replace - substitution string for sensitive data (optional, by default - six asterisks)
    <query_masking_rules>
        <rule>
            <name>hide SSN</name>
            <regexp>\b\d{3}-\d{2}-\d{4}\b</regexp>
            <replace>000-00-0000</replace>
        </rule>
    </query_masking_rules>
    -->

    <!-- Uncomment to disable ClickHouse internal DNS caching. -->
    <!-- <disable_internal_dns_cache>1</disable_internal_dns_cache> -->
</yandex>

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

$ cat /etc/clickhouse-server/config.xml

<?xml version="1.0"?>

<!--

NOTE: User and query level settings are set up in "users.xml" file.

-->

<level>trace</level>

<log>/var/log/clickhouse-server/clickhouse-server.log</log>

<errorlog>/var/log/clickhouse-server/clickhouse-server.err.log</errorlog>

</logger>

<http_port>8123</http_port>

<tcp_port>9000</tcp_port>

<!--

<https_port>8443</https_port>

<tcp_port_secure>9440</tcp_port_secure>

-->

<certificateFile>/etc/clickhouse-server/server.crt</certificateFile>

<privateKeyFile>/etc/clickhouse-server/server.key</privateKeyFile>

<dhParamsFile>/etc/clickhouse-server/dhparam.pem</dhParamsFile>

</server>

<name>RejectCertificateHandler</name>

</invalidCertificateHandler>

</client>

</openSSL>

<!--

<http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>

-->

<interserver_http_port>9009</interserver_http_port>

<!-- Hostname that is used by other replicas to request this server.

If not specified, than it is determined analoguous to 'hostname -f' command.

This setting could be used to switch replication to another network interface.

-->

<interserver_http_host>clicehouse-01.t4x.org</interserver_http_host>

<listen_host>0.0.0.0</listen_host>

<!--

<listen_host>::1</listen_host>

<listen_host>127.0.0.1</listen_host>

-->

<max_connections>4096</max_connections>

<keep_alive_timeout>3</keep_alive_timeout>

<max_concurrent_queries>100</max_concurrent_queries>

<!-- Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve

correct maximum value. -->

<!-- Size of cache of uncompressed blocks of data, used in tables of MergeTree family.

In bytes. Cache is single for server. Memory is allocated only on demand.

Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).

Uncompressed cache is advantageous only for very short queries and in rare cases.

-->

<uncompressed_cache_size>8589934592</uncompressed_cache_size>

<!-- Approximate size of mark cache, used in tables of MergeTree family.

In bytes. Cache is single for server. Memory is allocated only on demand.

You should not lower this value.

-->

<mark_cache_size>5368709120</mark_cache_size>

<path>/data/clickhouse/data/</path>

<tmp_path>/data/clickhouse/tmp/</tmp_path>

<user_files_path>/data/clickhouse/user_files/</user_files_path>

<users_config>users.xml</users_config>

<default_profile>default</default_profile>

<default_database>default</default_database>

<timezone>Asia/Shanghai</timezone>

<!-- Server time zone could be set here.

Time zone is used when converting between String and DateTime types,

when printing DateTime in text formats and parsing DateTime from text,

it is used in date and time related functions, if specific time zone was not passed as an argument.

Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.

If not specified, system time zone at server startup is used.

Please note, that server could display time zone alias instead of specified name.

Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.

-->

<!-- You can specify umask here (see "man umask"). Server will apply it on startup.

Number is always parsed as octal. Default umask is 027 (other users cannot read logs, data files, etc; group can only read).

-->

<!-- Perform mlockall after startup to lower first queries latency

and to prevent clickhouse executable from being paged out under high IO load.

Enabling this option is recommended but will lead to increased startup time for up to a few seconds.

-->

<mlock_executable>false</mlock_executable>

<remote_servers incl="clickhouse_remote_servers" optional="true" />

<!-- Configuration of clusters that could be used in Distributed tables.

https://clickhouse.yandex/docs/en/table_engines/distributed/

<remote_servers incl="clickhouse_remote_servers" >

<test_shard_localhost>

<shard>

<host>localhost</host>

</replica>

</shard>

</test_shard_localhost>

</remote_servers>

-->

<!-- If element has 'incl' attribute, then for it's value will be used corresponding substitution from another file.

By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.

Values for substitutions are specified in /yandex/name_of_substitution elements in that file.

-->

<!-- ZooKeeper is used to store metadata about replicas, when using Replicated tables.

Optional. If you don't use replicated tables, you could omit that.

See https://clickhouse.yandex/docs/en/table_engines/replication/

-->

<!-- Substitutions for parameters of replicated tables.

Optional. If you don't use replicated tables, you could omit that.

See https://clickhouse.yandex/docs/en/table_engines/replication/#creating-replicated-tables

-->

<builtin_dictionaries_reload_interval>3600</builtin_dictionaries_reload_interval>

<max_session_timeout>3600</max_session_timeout>

<default_session_timeout>60</default_session_timeout>

<!--

interval - send every X second

root_path - prefix for keys

hostname_in_path - append hostname to root_path (default = true)

metrics - send data from table system.metrics

events - send data from table system.events

asynchronous_metrics - send data from table system.asynchronous_metrics

-->

<!--

<host>localhost</host>

<root_path>one_min</root_path>

<hostname_in_path>true</hostname_in_path>

<events_cumulative>false</events_cumulative>

<asynchronous_metrics>true</asynchronous_metrics>

</graphite>

<host>localhost</host>

<root_path>one_sec</root_path>

<events_cumulative>false</events_cumulative>

<asynchronous_metrics>false</asynchronous_metrics>

</graphite>

-->

<query_log>

<!-- What table to insert data. If table is not exist, it will be created.

When query log structure is changed after system update,

then old table will be renamed and new table will be created automatically.

-->

<database>system</database>

<table>query_log</table>

<!--

PARTITION BY expr https://clickhouse.yandex/docs/en/table_engines/custom_partitioning_key/

Example:

event_date

toMonday(event_date)

toYYYYMM(event_date)

toStartOfHour(event_time)

-->

<partition_by>toYYYYMM(event_date)</partition_by>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

</query_log>

<!-- Trace log. Stores stack traces collected by query profilers.

See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings. -->

<trace_log>

<database>system</database>

<table>trace_log</table>

<partition_by>toYYYYMM(event_date)</partition_by>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

</trace_log>

<!-- Query thread log. Has information about all threads participated in query execution.

Used only for queries with setting log_query_threads = 1. -->

<query_thread_log>

<database>system</database>

<table>query_thread_log</table>

<partition_by>toYYYYMM(event_date)</partition_by>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

</query_thread_log>

<!-- Uncomment if use part log.

Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).

<part_log>

<database>system</database>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

</part_log>

-->

<!-- Uncomment to write text log into table.

Text log contains all information from usual server log but stores it in structured and efficient way.

<text_log>

<database>system</database>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

</text_log>

-->

<!-- Uncomment to write metric log into table.

Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected with "collect_interval_milliseconds" interval.

<metric_log>

<database>system</database>

<table>metric_log</table>

<flush_interval_milliseconds>7500</flush_interval_milliseconds>

<collect_interval_milliseconds>1000</collect_interval_milliseconds>

</metric_log>

-->

<!-- Parameters for embedded dictionaries, used in Yandex.Metrica.

See https://clickhouse.yandex/docs/en/dicts/internal_dicts/

-->

<!-- Configuration of external dictionaries. See:

https://clickhouse.yandex/docs/en/dicts/external_dicts/

-->

<dictionaries_config>*_dictionary.xml</dictionaries_config>

<!-- Uncomment if you want data to be compressed 30-100% better.

Don't do that if you just started using ClickHouse.

-->

<!--

<!- - Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used. - ->

<case>

<!- - Conditions. All must be satisfied. Some conditions may be omitted. - ->

<min_part_size>10000000000</min_part_size> <!- - Min part size in bytes. - ->

<min_part_size_ratio>0.01</min_part_size_ratio> <!- - Min size of part relative to whole table size. - ->

<!- - What compression method to use. - ->

</case>

-->

</compression>

<!-- Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.

Works only if ZooKeeper is enabled. Comment it if such functionality isn't required. -->

<distributed_ddl>

<path>/clickhouse/task_queue/ddl</path>

</distributed_ddl>

<!--

<merge_tree>

<max_suspicious_broken_parts>5</max_suspicious_broken_parts>

</merge_tree>

-->

<!-- Protection from accidental DROP.

If size of a MergeTree table is greater than max_table_size_to_drop (in bytes) than table could not be dropped with any DROP query.

If you want do delete one table and don't want to restart clickhouse-server, you could create special file <clickhouse-path>/flags/force_drop_table and make DROP once.

By default max_table_size_to_drop is 50GB; max_table_size_to_drop=0 allows to DROP any tables.

The same for max_partition_size_to_drop.

Uncomment to disable protection.

-->

<graphite_rollup_example>

<regexp>click_cost</regexp>

</retention>

</retention>

</pattern>

</retention>

</retention>

</retention>

</default>

</graphite_rollup_example>

<!-- Directory in <clickhouse-path> containing schema files for various input formats.

The directory will be created if it doesn't exist.

-->

<format_schema_path>/var/lib/clickhouse/format_schemas/</format_schema_path>

<!-- Uncomment to use query masking rules.

name - name for the rule (optional)

regexp - RE2 compatible regular expression (mandatory)

replace - substitution string for sensitive data (optional, by default - six asterisks)

<query_masking_rules>

<rule>

</rule>

</query_masking_rules>

-->

</yandex>

文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/

四、ClickHouse高可用方案1

子表使用MergeTree引擎，Insert写Distributed表文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/
在这种情况下，分布式表会跨服务器分发插入数据。为了写入分布式表，必须要配置分片键（最后一个参数）。当然，如果只有一个分片，则写操作在没有分片键的情况下也能工作，因为这种情况下分片键没有意义，所有数据都将发送到一个分片。文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/
通常将internal_replication参数设置为false，这样写操作会将数据写入所有副本以实现高可用。实质上，这意味着由分布式表本身来复制数据。这种方式不如使用复制表好，因为不会检查副本的一致性，并且随着时间的推移，副本数据可能会出现差异。文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/

$ cat /etc/metrika.xml
<yandex>
    <!--
    Cluster definition: 3 shards, 2 replicas per shard.
    In this variant internal_replication is false, so the Distributed table
    itself writes every insert to all replicas of the target shard.
    -->
    <clickhouse_remote_servers>
        <perftest_3shards_2replicas>
            <!-- shard01 -->
            <shard>
                <!-- Optional. Shard weight when writing data. Default: 1. -->
                <weight>1</weight>
                <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas).
                1. When internal_replication is false (and data is inserted through the Distributed table),
                   the same data is automatically written to every replica table of the shard; no other
                   mechanism is needed, this parameter alone is enough. Replicas can drift apart, however:
                   the data is first split by weight into one batch per shard, and each batch is then written
                   to all replica tables of that shard. A write may succeed on replica A and fail on
                   replica B, and nothing verifies this.
                2. When internal_replication is true, it must be combined with ZooKeeper and
                   ReplicatedMergeTree tables. Without them, according to the author's own tests, queries
                   return seriously wrong results.
                -->
                <internal_replication>false</internal_replication>
                <!-- NOTE(review): the password is stored here in plaintext; keep this file readable only by the clickhouse user. -->
                <replica>
                    <host>10.1.20.202</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
                <replica>
                    <host>10.1.20.216</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
            </shard>

            <!-- shard02 -->
            <shard>
                <weight>1</weight>
                <internal_replication>false</internal_replication>
                <replica>
                    <host>10.1.20.203</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
                <replica>
                    <host>10.1.20.217</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
            </shard>

            <!-- shard03 -->
            <shard>
                <weight>1</weight>
                <internal_replication>false</internal_replication>
                <replica>
                    <host>10.1.20.204</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
                <replica>
                    <host>10.1.20.218</host>
                    <port>9000</port>
                    <user>default</user>
                    <password>V53oq9M2</password>
                </replica>
            </shard>
        </perftest_3shards_2replicas>
    </clickhouse_remote_servers>

    <!-- ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}')
    1. /clickhouse/tables/{layer}-{shard}/hits as a whole is the location and identifier of the table in ZooKeeper.
    2. Tables under the same {layer}-{shard} are replicas of each other and are synchronised automatically.
    3. The author reads layer as a cluster identifier. It may differ from the cluster name used in
       clickhouse_remote_servers above; one config file can define several clusters, hence the notion of a layer.
    4. hits may equal the table name, but it does not change when the table is renamed, because it is
       the table's location and identifier in ZooKeeper.
    5. replica identifies the replica. It only has to be unique and can be set to the node's hostname
       (replica values must differ between nodes).
    6. The values in macros are substituted for the variables (the parts in curly braces) in
       ReplicatedMergeTree, so the exact same SQL statement can create the local table on every node.
    -->
    <macros>
        <layer>perftest_3shards_2replicas</layer>
        <!-- Per the shard configuration above, this example has 3 shards in total (shard01 to shard03). -->
        <shard>shard01</shard>
        <!-- On each node use the local hostname, or any unique numeric id. -->
        <replica>10.1.20.202</replica>
    </macros>

    <!-- Networks (the author notes this appears to duplicate a setting made elsewhere) -->
    <networks>
        <ip>::/0</ip>
    </networks>

    <!-- ZooKeeper ensemble; x.x.x.x are placeholders for the real ZooKeeper hosts -->
    <zookeeper-servers>
        <node index="1">
            <host>x.x.x.x</host>
            <port>2181</port>
        </node>
        <node index="2">
            <host>x.x.x.x</host>
            <port>2181</port>
        </node>
        <node index="3">
            <host>x.x.x.x</host>
            <port>2181</port>
        </node>
    </zookeeper-servers>

    <!-- Data compression algorithm -->
    <clickhouse_compression>
        <case>
            <min_part_size>10000000000</min_part_size>
            <min_part_size_ratio>0.01</min_part_size_ratio>
            <method>lz4</method>
        </case>
    </clickhouse_compression>
</yandex>

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

$ cat /etc/metrika.xml

<clickhouse_remote_servers>

<perftest_3shards_2replicas>

<shard>

<!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas).

1. internal_replication为false时候，（前提是往分布表写入数据）会自动往同一shard下所有备份表写入相同数据，不需要任何其他外力，单独设置这个参数即可；但是会出现各备份之间数据不同步的情况，因为此种情况下往分布式表里面写数据，后台算法会先按照weight将数据分成shard数量堆，然后将对应堆的数据分别写入该shard下面的所有备份表中，有可能存在同样的数据写入A备份成功但是写入B备份失败的情况，这里是没有校验的；

2. internal_replication为true时，一定要配合zookeeper和ReplicatedMergeTree引擎表使用，如果不配合这些，经本人测试查询数据时会出现严重错误，请切记

-->

<internal_replication>false</internal_replication>

<user>default</user>

</replica>

<user>default</user>

</replica>

</shard>

<shard>

<internal_replication>false</internal_replication>

<user>default</user>

</replica>

<user>default</user>

</replica>

</shard>

<shard>

<internal_replication>false</internal_replication>

<user>default</user>

</replica>

<user>default</user>

</replica>

</shard>

</perftest_3shards_2replicas>

</clickhouse_remote_servers>

<!-- ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/hits', '{replica}')

1. /clickhouse/tables/{layer}-{shard}/hits作为整体可以理解为表在zookeeper中的定位和识别符

2. 同一个{layer}-{shard}下面的表互为备份，会自动同步

3. layer我将之理解为集群识别符，虽然其可以与上面clickhouse_remote_servers配置中的cluster名称不一样；同一个配置文件是可以配置多个集群的，因此有layer这一说法

4. hits可以与表名相同，但是表名改变时这里不会变，因为这里的含义是，表在zookeeper中的定位和识别符

5. replica是备份序号的识别符，只要不同即可，可以设置为与该节点hostname相同（注意replica一定要不同）

6. macros中的值设置会对应到ReplicatedMergeTree中的变量（就是大括号的部分），这样在每个节点建立local表时，可以使用完全相同的SQL语句，因为变量由这里的配置控制

-->

<layer>perftest_3shards_2replicas</layer>

<shard>shard01</shard>

</macros>

</networks>

<zookeeper-servers>

</node>

</node>

</node>

</zookeeper-servers>

<clickhouse_compression>

<case>

<min_part_size>10000000000</min_part_size>

<min_part_size_ratio>0.01</min_part_size_ratio>

</case>

</clickhouse_compression>

</yandex>

测试语句:

-- Local table that stores the rows on each node.
-- Written with explicit MergeTree clauses instead of the deprecated positional
-- form MergeTree(FlightDate, (Year, FlightDate), 8192). The meaning is the same:
-- monthly partitions on FlightDate, sorting key (Year, FlightDate),
-- index granularity 8192.
-- NOTE(review): the Distributed table below targets default.ontime_local on the
-- cluster's servers, so this statement presumably has to be run on every node; confirm.
CREATE TABLE ontime_local
(
    FlightDate Date,
    Year UInt16
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(FlightDate)
ORDER BY (Year, FlightDate)
SETTINGS index_granularity = 8192;

-- Distributed table over default.ontime_local on cluster
-- perftest_3shards_2replicas; rand() is the sharding key.
CREATE TABLE ontime_all AS ontime_local
ENGINE = Distributed(perftest_3shards_2replicas, default, ontime_local, rand());

-- Insert through the Distributed table so rows are spread across the shards.
INSERT INTO ontime_all (FlightDate, Year) VALUES ('2001-10-12', 2001);
INSERT INTO ontime_all (FlightDate, Year) VALUES ('2002-10-12', 2002);
INSERT INTO ontime_all (FlightDate, Year) VALUES ('2003-10-12', 2003);

-- Inspect the cluster topology known to this server.
SELECT * FROM system.clusters;

-- Read everything back through the Distributed table in a deterministic order.
SELECT
    FlightDate,
    Year
FROM ontime_all
ORDER BY FlightDate, Year;

-- Local table that stores the rows on each node.
-- Explicit MergeTree clauses replace the deprecated positional form
-- MergeTree(FlightDate, (Year, FlightDate), 8192) with identical meaning:
-- monthly partitions on FlightDate, sorting key (Year, FlightDate),
-- index granularity 8192.
CREATE TABLE ontime_local
(
    FlightDate Date,
    Year UInt16
)
ENGINE = MergeTree()
PARTITION BY toYYYYMM(FlightDate)
ORDER BY (Year, FlightDate)
SETTINGS index_granularity = 8192;

-- Distributed table over default.ontime_local on cluster
-- perftest_3shards_2replicas; rand() is the sharding key.
CREATE TABLE ontime_all AS ontime_local
ENGINE = Distributed(perftest_3shards_2replicas, default, ontime_local, rand());

INSERT INTO ontime_all (FlightDate, Year) VALUES ('2001-10-12', 2001);

INSERT INTO ontime_all (FlightDate, Year) VALUES ('2002-10-12', 2002);

INSERT INTO ontime_all (FlightDate, Year) VALUES ('2003-10-12', 2003);

-- Inspect the cluster topology known to this server.
SELECT * FROM system.clusters;

-- Read everything back through the Distributed table in a deterministic order.
SELECT
    FlightDate,
    Year
FROM ontime_all
ORDER BY FlightDate, Year;

文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/

五、clickhouse高可用方案2

子表使用ReplicatedMergeTree引擎，Insert写子表
可以将internal_replication参数设置为true，因为可以由ClickHouse来保证数据副本的一致性。你可以自已指定要将哪些数据写入哪些服务器，并直接在每个分片上执行写入，并且你可以使用任何分片方案。对于复杂业务特性的需求，这可能是非常重要的。官方推荐这种方案。
使用复制表并不影响效率
SELECT 查询并不需要借助 ZooKeeper ，副本并不影响 SELECT 的性能，查询复制表与非复制表速度是一样的。查询分布式表时，ClickHouse的处理方式可通过设置 max_replica_delay_for_distributed_queries 和 fallback_to_stale_replicas_for_distributed_queries 修改。文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/
对于每个 INSERT 语句，会通过几个事务将十来个记录添加到 ZooKeeper。（确切地说，这是针对每个插入的数据块; 每个 INSERT 语句的每 max_insert_block_size = 1048576 行和最后剩余的都各算作一个块。）相比非复制表，写 zk 会导致 INSERT 的延迟略长一些。但只要你按照建议每秒不超过一个 INSERT 地批量插入数据，不会有任何问题。一个 ZooKeeper 集群能给整个 ClickHouse 集群支撑协调每秒几百个 INSERT。数据插入的吞吐量（每秒的行数）可以跟不用复制的数据一样高。文章源自 note.t4x.orgByrd's Blog-https://note.t4x.org/database/clickhouse-cluster-deployment/